From 2024f91e965f7d7a38364874031c2132dcd11992 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 19 Aug 2020 13:47:15 +0200 Subject: ns: Add a common refcount into ns_common Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespace types. Namespaces may of course have additional unrelated counters and these are not altered. This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since it's common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner [christian.brauner@ubuntu.com: rewrite commit & split into two patches] Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 5fbc4000358f..0f1d024bd958 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -2,12 +2,15 @@ #ifndef _LINUX_NS_COMMON_H #define _LINUX_NS_COMMON_H +#include <linux/refcount.h> + struct proc_ns_operations; struct ns_common { atomic_long_t stashed; const struct proc_ns_operations *ops; unsigned int inum; + refcount_t count; }; #endif -- cgit From 9a56493f6942c0e2df1579986128721da96e00d8 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:16:21 +0300 Subject: uts: Use generic ns_common::count Switch over uts namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespace types. Namespaces may of course have additional unrelated counters and these are not altered. This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since it's common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost.
Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644978167.604812.1773586504374412107.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- include/linux/utsname.h | 9 ++++----- init/version.c | 2 +- kernel/utsname.c | 7 ++----- 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 44429d9142ca..2b1737c9b244 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -4,7 +4,6 @@ #include -#include #include #include #include @@ -22,7 +21,6 @@ struct user_namespace; extern struct user_namespace init_user_ns; struct uts_namespace { - struct kref kref; struct new_utsname name; struct user_namespace *user_ns; struct ucounts *ucounts; @@ -33,16 +31,17 @@ extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS static inline void get_uts_ns(struct uts_namespace *ns) { - kref_get(&ns->kref); + refcount_inc(&ns->ns.count); } extern struct uts_namespace *copy_utsname(unsigned long flags, struct user_namespace *user_ns, struct uts_namespace *old_ns); -extern void free_uts_ns(struct kref *kref); +extern void free_uts_ns(struct uts_namespace *ns); static inline void put_uts_ns(struct uts_namespace *ns) { - kref_put(&ns->kref, free_uts_ns); + if (refcount_dec_and_test(&ns->ns.count)) + free_uts_ns(ns); } void uts_ns_init(void); diff --git a/init/version.c b/init/version.c index cba341161b58..80d2b7566b39 100644 --- a/init/version.c +++ b/init/version.c @@ -25,7 +25,7 @@ int version_string(LINUX_VERSION_CODE); #endif struct uts_namespace init_uts_ns = { - .kref = KREF_INIT(2), + .ns.count = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, diff --git a/kernel/utsname.c b/kernel/utsname.c index e488d0e2ab45..b1ac3ca870f2 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -33,7 +33,7 @@ static struct uts_namespace *create_uts_ns(void) uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); if (uts_ns) - kref_init(&uts_ns->kref); + refcount_set(&uts_ns->ns.count, 1); return uts_ns; } @@ -103,11 +103,8 @@ struct uts_namespace *copy_utsname(unsigned long flags, return new_ns; } -void free_uts_ns(struct kref *kref) +void free_uts_ns(struct uts_namespace *ns) { - struct uts_namespace *ns; - - ns = container_of(kref, struct uts_namespace, kref); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); -- cgit From 137ec390fad41928307216ea9f91acf5cf6f4204 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:16:27 +0300 Subject: ipc: Use generic ns_common::count Switch over ipc namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespaces types. Namespaces may of course have additional unrelated counters and these are not altered. This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since its common for all of them. It also allows us to unify the type of the counters across all namespaces. 
Most of them use refcount_t but one uses atomic_t and at least one uses kref. Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644978697.604812.16592754423881032385.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- include/linux/ipc_namespace.h | 3 +-- ipc/msgutil.c | 2 +- ipc/namespace.c | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index a06a78c67f19..05e22770af51 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -27,7 +27,6 @@ struct ipc_ids { }; struct ipc_namespace { - refcount_t count; struct ipc_ids ids[3]; int sem_ctls[4]; @@ -128,7 +127,7 @@ extern struct ipc_namespace *copy_ipcs(unsigned long flags, static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) - refcount_inc(&ns->count); + refcount_inc(&ns->ns.count); return ns; } diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 3149b4a379de..d0a0e877cadd 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -26,7 +26,7 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. */ struct ipc_namespace init_ipc_ns = { - .count = REFCOUNT_INIT(1), + .ns.count = REFCOUNT_INIT(1), .user_ns = &init_user_ns, .ns.inum = PROC_IPC_INIT_INO, #ifdef CONFIG_IPC_NS diff --git a/ipc/namespace.c b/ipc/namespace.c index 24e7b45320f7..7bd0766ddc3b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -51,7 +51,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail_free; ns->ns.ops = &ipcns_operations; - refcount_set(&ns->count, 1); + refcount_set(&ns->ns.count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; @@ -164,7 +164,7 @@ static DECLARE_WORK(free_ipc_work, free_ipc); */ void put_ipc_ns(struct ipc_namespace *ns) { - if (refcount_dec_and_lock(&ns->count, &mq_lock)) { + if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); -- cgit From 8eb71d95f34a009cc22084e05e78eb9686f7ea28 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:16:32 +0300 Subject: pid: Use generic ns_common::count Switch over pid namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespaces types. Namespaces may of course have additional unrelated counters and these are not altered. This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since its common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. 
Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644979226.604812.7512601754841882036.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 4 +--- kernel/pid.c | 2 +- kernel/pid_namespace.c | 13 +++---------- 3 files changed, 5 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 5a5cb45ac57e..7c7e627503d2 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -18,7 +17,6 @@ struct fs_pin; struct pid_namespace { - struct kref kref; struct idr idr; struct rcu_head rcu; unsigned int pid_allocated; @@ -43,7 +41,7 @@ extern struct pid_namespace init_pid_ns; static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) - kref_get(&ns->kref); + refcount_inc(&ns->ns.count); return ns; } diff --git a/kernel/pid.c b/kernel/pid.c index b2562a7ce525..2b97bedc1d9f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -72,7 +72,7 @@ int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .kref = KREF_INIT(2), + .ns.count = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ac135bd600eb..166a91cdd387 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -102,7 +102,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns goto out_free_idr; ns->ns.ops = &pidns_operations; - kref_init(&ns->kref); + refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); @@ -148,22 +148,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, return create_pid_namespace(user_ns, old_ns); } -static void free_pid_ns(struct kref *kref) -{ - struct pid_namespace *ns; - - ns = container_of(kref, struct pid_namespace, kref); - destroy_pid_namespace(ns); -} - void put_pid_ns(struct pid_namespace *ns) { struct pid_namespace *parent; while (ns != &init_pid_ns) { parent = ns->parent; - if (!kref_put(&ns->kref, free_pid_ns)) + if (!refcount_dec_and_test(&ns->ns.count)) break; + destroy_pid_namespace(ns); ns = parent; } } -- cgit From 265cbd62e034cb09a9da7cbff9072c8082f8df65 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:16:37 +0300 Subject: user: Use generic ns_common::count Switch over user namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespaces types. Namespaces may of course have additional unrelated counters and these are not altered. 
This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since its common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644979754.604812.601625186726406922.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- include/linux/user_namespace.h | 5 ++--- kernel/user.c | 2 +- kernel/user_namespace.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 6ef1c7109fc4..64cf8ebdc4ec 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -57,7 +57,6 @@ struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; - atomic_t count; struct user_namespace *parent; int level; kuid_t owner; @@ -109,7 +108,7 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type); static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) - atomic_inc(&ns->count); + refcount_inc(&ns->ns.count); return ns; } @@ -119,7 +118,7 @@ extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { - if (ns && atomic_dec_and_test(&ns->count)) + if (ns && refcount_dec_and_test(&ns->ns.count)) __put_user_ns(ns); } diff --git a/kernel/user.c b/kernel/user.c index b1635d94a1f2..a2478cddf536 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -55,7 +55,7 @@ struct user_namespace init_user_ns = { }, }, }, - .count = ATOMIC_INIT(3), + .ns.count = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .ns.inum = PROC_USER_INIT_INO, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 87804e0371fe..7c2bbe8f3e45 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -111,7 +111,7 @@ int create_user_ns(struct cred *new) goto fail_free; ns->ns.ops = &userns_operations; - atomic_set(&ns->count, 1); + refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. 
*/ ns->parent = parent_ns; ns->level = parent_ns->level + 1; @@ -197,7 +197,7 @@ static void free_user_ns(struct work_struct *work) kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; - } while (atomic_dec_and_test(&parent->count)); + } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) -- cgit From f387882d8d3eda7c7b13e330c73907035569ce4a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:16:50 +0300 Subject: cgroup: Use generic ns_common::count Switch over cgroup namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespaces types. Namespaces may of course have additional unrelated counters and these are not altered. This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since its common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. 
Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644980994.604812.383801057081594972.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 5 ++--- kernel/cgroup/cgroup.c | 2 +- kernel/cgroup/namespace.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 618838c48313..451c2d26a5db 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -854,7 +854,6 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { - refcount_t count; struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; @@ -889,12 +888,12 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) - refcount_inc(&ns->count); + refcount_inc(&ns->ns.count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { - if (ns && refcount_dec_and_test(&ns->count)) + if (ns && refcount_dec_and_test(&ns->ns.count)) free_cgroup_ns(ns); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dd247747ec14..22e466926853 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -199,7 +199,7 @@ static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = REFCOUNT_INIT(2), + .ns.count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 812a61afd538..f5e8828c109c 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -32,7 +32,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) kfree(new_ns); return ERR_PTR(ret); } - refcount_set(&new_ns->count, 1); + refcount_set(&new_ns->ns.count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } -- cgit From 28c41efd08bf97fc64f75304035ee3943995b68e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:17:00 +0300 Subject: time: Use generic ns_common::count Switch over time namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespaces types. Namespaces may of course have additional unrelated counters and these are not altered. This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since its common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. 
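To make the pattern the series converges on concrete, here is a rough userspace analogue (an editor's sketch, not kernel code: plain C11 atomics stand in for the kernel's refcount_t, which additionally saturates instead of wrapping, and the uts_namespace fields are purely illustrative):

	#include <stdatomic.h>
	#include <stdlib.h>

	/* The shared part of every namespace, as in struct ns_common. */
	struct ns_common {
		atomic_uint count;	/* stand-in for the kernel's refcount_t */
	};

	/* A concrete namespace type embeds the common part. */
	struct uts_namespace {
		struct ns_common ns;
		char name[65];
	};

	static struct uts_namespace *create_uts_ns(void)
	{
		struct uts_namespace *ns = calloc(1, sizeof(*ns));

		if (ns)
			atomic_init(&ns->ns.count, 1);
		return ns;
	}

	static void get_uts_ns(struct uts_namespace *ns)
	{
		atomic_fetch_add_explicit(&ns->ns.count, 1, memory_order_relaxed);
	}

	static void put_uts_ns(struct uts_namespace *ns)
	{
		/* Dropping the last reference frees the object directly; there is
		 * no kref release callback and no container_of() to recover the
		 * enclosing namespace struct from the embedded counter. */
		if (atomic_fetch_sub_explicit(&ns->ns.count, 1,
					      memory_order_acq_rel) == 1)
			free(ns);
	}

With kref, put_uts_ns() instead had to pass a release callback that used container_of() to get from the embedded kref back to the uts_namespace; with the counter in ns_common, every namespace's put helper reduces to the same two-line dec-and-test, which is exactly what the patches above and below do in kernel terms.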
Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644982033.604812.9406853013011123238.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 9 ++++----- kernel/time/namespace.c | 9 +++------ 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 5b6031385db0..a51ffc089219 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -4,7 +4,6 @@ #include -#include #include #include #include @@ -18,7 +17,6 @@ struct timens_offsets { }; struct time_namespace { - struct kref kref; struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; @@ -37,20 +35,21 @@ extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { - kref_get(&ns->kref); + refcount_inc(&ns->ns.count); return ns; } struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns); -void free_time_ns(struct kref *kref); +void free_time_ns(struct time_namespace *ns); int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); struct vdso_data *arch_get_vdso_data(void *vvar_page); static inline void put_time_ns(struct time_namespace *ns) { - kref_put(&ns->kref, free_time_ns); + if (refcount_dec_and_test(&ns->ns.count)) + free_time_ns(ns); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index afc65e6be33e..c4c829eb3511 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -92,7 +92,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - kref_init(&ns->kref); + refcount_set(&ns->ns.count, 1); ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!ns->vvar_page) @@ -226,11 +226,8 @@ out: mutex_unlock(&offset_lock); } -void free_time_ns(struct kref *kref) +void free_time_ns(struct time_namespace *ns) { - struct time_namespace *ns; - - ns = container_of(kref, struct time_namespace, kref); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); @@ -464,7 +461,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .kref = KREF_INIT(3), + .ns.count = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = PROC_TIME_INIT_INO, .ns.ops = &timens_operations, -- cgit From 01fd30da0474d5528d96a154cb50ce3b0352fd19 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 25 Sep 2020 13:55:58 +0200 Subject: dma-buf: Add struct dma-buf-map for storing struct dma_buf.vaddr_ptr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new type struct dma_buf_map represents a mapping of dma-buf memory into kernel space. It contains a flag, is_iomem, that signals users to access the mapped memory with I/O operations instead of regular loads and stores. It was assumed that DMA buffer memory can be accessed with regular load and store operations. Some architectures, such as sparc64, require the use of I/O operations to access dma-map buffers that are located in I/O memory. Providing struct dma_buf_map allows drivers to implement this. This was specifically a problem when refreshing the graphics framebuffer on such systems. 
[1] As the first step, struct dma_buf stores an instance of struct dma_buf_map internally. Afterwards, dma-buf's vmap and vunmap interfaces are converted. Finally, affected drivers can be fixed. v3: * moved documentation into separate patch * test for NULL pointers with ! [1] https://lore.kernel.org/dri-devel/20200725191012.GA434957@ravnborg.org/ Signed-off-by: Thomas Zimmermann Reviewed-by: Christian König Reviewed-by: Daniel Vetter Acked-by: Sumit Semwal Acked-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20200925115601.23955-2-tzimmermann@suse.de --- drivers/dma-buf/dma-buf.c | 14 ++++---- include/linux/dma-buf-map.h | 82 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/dma-buf.h | 3 +- 3 files changed, 91 insertions(+), 8 deletions(-) create mode 100644 include/linux/dma-buf-map.h (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 58564d82a3a2..5e849ca241a0 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1207,12 +1207,12 @@ void *dma_buf_vmap(struct dma_buf *dmabuf) mutex_lock(&dmabuf->lock); if (dmabuf->vmapping_counter) { dmabuf->vmapping_counter++; - BUG_ON(!dmabuf->vmap_ptr); - ptr = dmabuf->vmap_ptr; + BUG_ON(dma_buf_map_is_null(&dmabuf->vmap_ptr)); + ptr = dmabuf->vmap_ptr.vaddr; goto out_unlock; } - BUG_ON(dmabuf->vmap_ptr); + BUG_ON(dma_buf_map_is_set(&dmabuf->vmap_ptr)); ptr = dmabuf->ops->vmap(dmabuf); if (WARN_ON_ONCE(IS_ERR(ptr))) @@ -1220,7 +1220,7 @@ void *dma_buf_vmap(struct dma_buf *dmabuf) if (!ptr) goto out_unlock; - dmabuf->vmap_ptr = ptr; + dmabuf->vmap_ptr.vaddr = ptr; dmabuf->vmapping_counter = 1; out_unlock: @@ -1239,15 +1239,15 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr) if (WARN_ON(!dmabuf)) return; - BUG_ON(!dmabuf->vmap_ptr); + BUG_ON(dma_buf_map_is_null(&dmabuf->vmap_ptr)); BUG_ON(dmabuf->vmapping_counter == 0); - BUG_ON(dmabuf->vmap_ptr != vaddr); + BUG_ON(!dma_buf_map_is_vaddr(&dmabuf->vmap_ptr, vaddr)); mutex_lock(&dmabuf->lock); if (--dmabuf->vmapping_counter == 0) { if (dmabuf->ops->vunmap) dmabuf->ops->vunmap(dmabuf, vaddr); - dmabuf->vmap_ptr = NULL; + dma_buf_map_clear(&dmabuf->vmap_ptr); } mutex_unlock(&dmabuf->lock); } diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h new file mode 100644 index 000000000000..00143c88feb6 --- /dev/null +++ b/include/linux/dma-buf-map.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Pointer to dma-buf-mapped memory, plus helpers. + */ + +#ifndef __DMA_BUF_MAP_H__ +#define __DMA_BUF_MAP_H__ + +#include <linux/types.h> + +/** + * struct dma_buf_map - Pointer to vmap'ed dma-buf memory. + * @vaddr_iomem: The buffer's address if in I/O memory + * @vaddr: The buffer's address if in system memory + * @is_iomem: True if the dma-buf memory is located in I/O + * memory, or false otherwise. + */ +struct dma_buf_map { + union { + void __iomem *vaddr_iomem; + void *vaddr; + }; + bool is_iomem; +}; + +/* API transition helper */ +static inline bool dma_buf_map_is_vaddr(const struct dma_buf_map *map, const void *vaddr) +{ + return !map->is_iomem && (map->vaddr == vaddr); +} + +/** + * dma_buf_map_is_null - Tests for a dma-buf mapping to be NULL + * @map: The dma-buf mapping structure + * + * Depending on the state of struct dma_buf_map.is_iomem, tests if the + * mapping is NULL. + * + * Returns: + * True if the mapping is NULL, or false otherwise.
+ */ +static inline bool dma_buf_map_is_null(const struct dma_buf_map *map) +{ + if (map->is_iomem) + return !map->vaddr_iomem; + return !map->vaddr; +} + +/** + * dma_buf_map_is_set - Tests if the dma-buf mapping has been set + * @map: The dma-buf mapping structure + * + * Depending on the state of struct dma_buf_map.is_iomem, tests if the + * mapping has been set. + * + * Returns: + * True if the mapping has been set, or false otherwise. + */ +static inline bool dma_buf_map_is_set(const struct dma_buf_map *map) +{ + return !dma_buf_map_is_null(map); +} + +/** + * dma_buf_map_clear - Clears a dma-buf mapping structure + * @map: The dma-buf mapping structure + * + * Clears all fields to zero, including struct dma_buf_map.is_iomem. So + * mapping structures that were set to point to I/O memory are reset for + * system memory. Pointers are cleared to NULL. This is the default. + */ +static inline void dma_buf_map_clear(struct dma_buf_map *map) +{ + if (map->is_iomem) { + map->vaddr_iomem = NULL; + map->is_iomem = false; + } else { + map->vaddr = NULL; + } +} + +#endif /* __DMA_BUF_MAP_H__ */ diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 957b398d30e5..fcc2ddfb6d18 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -13,6 +13,7 @@ #ifndef __DMA_BUF_H__ #define __DMA_BUF_H__ +#include <linux/dma-buf-map.h> #include #include #include @@ -309,7 +310,7 @@ struct dma_buf { const struct dma_buf_ops *ops; struct mutex lock; unsigned vmapping_counter; - void *vmap_ptr; + struct dma_buf_map vmap_ptr; const char *exp_name; const char *name; spinlock_t name_lock; -- cgit From 6619ccf1bb1d0ebb071f758111efa83918b216fc Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 25 Sep 2020 13:55:59 +0200 Subject: dma-buf: Use struct dma_buf_map in dma_buf_vmap() interfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch updates dma_buf_vmap() and dma-buf's vmap callback to use struct dma_buf_map. The interfaces used to return a buffer address. This address now gets stored in an instance of the structure that is given as an additional argument. The functions return an errno code on errors. Users of the functions are updated accordingly. This is only an interface change. It is currently expected that dma-buf memory can be accessed with system memory load/store operations.
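As a rough illustration of the converted callback shape (an editor's sketch: the my_buffer type and my_buffer_kmap() helper are made up for the example, while dma_buf_map_set_vaddr() is the helper introduced in the previous patch):

	static int my_exporter_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map)
	{
		struct my_buffer *buf = dmabuf->priv;	/* hypothetical exporter state */
		void *vaddr;

		vaddr = my_buffer_kmap(buf);		/* hypothetical mapping helper */
		if (!vaddr)
			return -ENOMEM;

		/* Record a system-memory mapping; is_iomem is cleared. */
		dma_buf_map_set_vaddr(map, vaddr);

		return 0;
	}

This mirrors what the conversions below do for fastrpc, tegra and the videobuf2 allocators: return an errno instead of a pointer-or-NULL, and report the address through the map argument.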
v3: * update fastrpc driver (kernel test robot) v2: * always clear map parameter in dma_buf_vmap() (Daniel) * include dma-buf-heaps and i915 selftests (kernel test robot) Signed-off-by: Thomas Zimmermann Acked-by: Sumit Semwal Acked-by: Christian König Acked-by: Daniel Vetter Acked-by: Tomasz Figa Acked-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20200925115601.23955-3-tzimmermann@suse.de --- drivers/dma-buf/dma-buf.c | 28 ++++++++++++---------- drivers/dma-buf/heaps/heap-helpers.c | 8 +++++-- drivers/gpu/drm/drm_gem_cma_helper.c | 13 +++++----- drivers/gpu/drm/drm_gem_shmem_helper.c | 14 +++++++---- drivers/gpu/drm/drm_prime.c | 9 ++++--- drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c | 8 ++++++- drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 11 +++++++-- .../gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 12 +++++++--- drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c | 10 ++++++-- drivers/gpu/drm/tegra/gem.c | 18 +++++++++----- .../media/common/videobuf2/videobuf2-dma-contig.c | 14 +++++++---- drivers/media/common/videobuf2/videobuf2-dma-sg.c | 16 +++++++++---- drivers/media/common/videobuf2/videobuf2-vmalloc.c | 15 ++++++++---- drivers/misc/fastrpc.c | 6 +++-- include/drm/drm_prime.h | 3 ++- include/linux/dma-buf-map.h | 13 ++++++++++ include/linux/dma-buf.h | 6 ++--- 17 files changed, 143 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 5e849ca241a0..61bd24d21b38 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1186,46 +1186,50 @@ EXPORT_SYMBOL_GPL(dma_buf_mmap); * dma_buf_vmap - Create virtual mapping for the buffer object into kernel * address space. Same restrictions as for vmap and friends apply. * @dmabuf: [in] buffer to vmap + * @map: [out] returns the vmap pointer * * This call may fail due to lack of virtual mapping address space. * These calls are optional in drivers. The intended use for them * is for mapping objects linear in kernel space for high use objects. * Please attempt to use kmap/kunmap before thinking about these interfaces. * - * Returns NULL on error. + * Returns 0 on success, or a negative errno code otherwise. 
*/ -void *dma_buf_vmap(struct dma_buf *dmabuf) +int dma_buf_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map) { - void *ptr; + struct dma_buf_map ptr; + int ret = 0; + + dma_buf_map_clear(map); if (WARN_ON(!dmabuf)) - return NULL; + return -EINVAL; if (!dmabuf->ops->vmap) - return NULL; + return -EINVAL; mutex_lock(&dmabuf->lock); if (dmabuf->vmapping_counter) { dmabuf->vmapping_counter++; BUG_ON(dma_buf_map_is_null(&dmabuf->vmap_ptr)); - ptr = dmabuf->vmap_ptr.vaddr; + *map = dmabuf->vmap_ptr; goto out_unlock; } BUG_ON(dma_buf_map_is_set(&dmabuf->vmap_ptr)); - ptr = dmabuf->ops->vmap(dmabuf); - if (WARN_ON_ONCE(IS_ERR(ptr))) - ptr = NULL; - if (!ptr) + ret = dmabuf->ops->vmap(dmabuf, &ptr); + if (WARN_ON_ONCE(ret)) goto out_unlock; - dmabuf->vmap_ptr.vaddr = ptr; + dmabuf->vmap_ptr = ptr; dmabuf->vmapping_counter = 1; + *map = dmabuf->vmap_ptr; + out_unlock: mutex_unlock(&dmabuf->lock); - return ptr; + return ret; } EXPORT_SYMBOL_GPL(dma_buf_vmap); diff --git a/drivers/dma-buf/heaps/heap-helpers.c b/drivers/dma-buf/heaps/heap-helpers.c index 9f964ca3f59c..27e54d9c120c 100644 --- a/drivers/dma-buf/heaps/heap-helpers.c +++ b/drivers/dma-buf/heaps/heap-helpers.c @@ -236,7 +236,7 @@ static int dma_heap_dma_buf_end_cpu_access(struct dma_buf *dmabuf, return 0; } -static void *dma_heap_dma_buf_vmap(struct dma_buf *dmabuf) +static int dma_heap_dma_buf_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map) { struct heap_helper_buffer *buffer = dmabuf->priv; void *vaddr; @@ -245,7 +245,11 @@ static void *dma_heap_dma_buf_vmap(struct dma_buf *dmabuf) vaddr = dma_heap_buffer_vmap_get(buffer); mutex_unlock(&buffer->lock); - return vaddr; + if (!vaddr) + return -ENOMEM; + dma_buf_map_set_vaddr(map, vaddr); + + return 0; } static void dma_heap_dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr) diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c index 1e5b389c435c..eb412e45f896 100644 --- a/drivers/gpu/drm/drm_gem_cma_helper.c +++ b/drivers/gpu/drm/drm_gem_cma_helper.c @@ -634,22 +634,23 @@ drm_gem_cma_prime_import_sg_table_vmap(struct drm_device *dev, { struct drm_gem_cma_object *cma_obj; struct drm_gem_object *obj; - void *vaddr; + struct dma_buf_map map; + int ret; - vaddr = dma_buf_vmap(attach->dmabuf); - if (!vaddr) { + ret = dma_buf_vmap(attach->dmabuf, &map); + if (ret) { DRM_ERROR("Failed to vmap PRIME buffer\n"); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } obj = drm_gem_cma_prime_import_sg_table(dev, attach, sgt); if (IS_ERR(obj)) { - dma_buf_vunmap(attach->dmabuf, vaddr); + dma_buf_vunmap(attach->dmabuf, map.vaddr); return obj; } cma_obj = to_drm_gem_cma_obj(obj); - cma_obj->vaddr = vaddr; + cma_obj->vaddr = map.vaddr; return obj; } diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index 0a952f27c184..ad10a57cfece 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -261,13 +261,16 @@ EXPORT_SYMBOL(drm_gem_shmem_unpin); static void *drm_gem_shmem_vmap_locked(struct drm_gem_shmem_object *shmem) { struct drm_gem_object *obj = &shmem->base; - int ret; + struct dma_buf_map map; + int ret = 0; if (shmem->vmap_use_count++ > 0) return shmem->vaddr; if (obj->import_attach) { - shmem->vaddr = dma_buf_vmap(obj->import_attach->dmabuf); + ret = dma_buf_vmap(obj->import_attach->dmabuf, &map); + if (!ret) + shmem->vaddr = map.vaddr; } else { pgprot_t prot = PAGE_KERNEL; @@ -279,11 +282,12 @@ static void *drm_gem_shmem_vmap_locked(struct drm_gem_shmem_object *shmem) prot = 
pgprot_writecombine(prot); shmem->vaddr = vmap(shmem->pages, obj->size >> PAGE_SHIFT, VM_MAP, prot); + if (!shmem->vaddr) + ret = -ENOMEM; } - if (!shmem->vaddr) { - DRM_DEBUG_KMS("Failed to vmap pages\n"); - ret = -ENOMEM; + if (ret) { + DRM_DEBUG_KMS("Failed to vmap pages, error %d\n", ret); goto err_put_pages; } diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index 194784a5b344..2d75e58d21fe 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -662,22 +662,25 @@ EXPORT_SYMBOL(drm_gem_unmap_dma_buf); /** * drm_gem_dmabuf_vmap - dma_buf vmap implementation for GEM * @dma_buf: buffer to be mapped + * @map: the virtual address of the buffer * * Sets up a kernel virtual mapping. This can be used as the &dma_buf_ops.vmap * callback. Calls into &drm_gem_object_funcs.vmap for device specific handling. * * Returns the kernel virtual address or NULL on failure. */ -void *drm_gem_dmabuf_vmap(struct dma_buf *dma_buf) +int drm_gem_dmabuf_vmap(struct dma_buf *dma_buf, struct dma_buf_map *map) { struct drm_gem_object *obj = dma_buf->priv; void *vaddr; vaddr = drm_gem_vmap(obj); if (IS_ERR(vaddr)) - vaddr = NULL; + return PTR_ERR(vaddr); - return vaddr; + dma_buf_map_set_vaddr(map, vaddr); + + return 0; } EXPORT_SYMBOL(drm_gem_dmabuf_vmap); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c index 4aa3426a9ba4..80a9fc143bbb 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c @@ -85,9 +85,15 @@ static void etnaviv_gem_prime_release(struct etnaviv_gem_object *etnaviv_obj) static void *etnaviv_gem_prime_vmap_impl(struct etnaviv_gem_object *etnaviv_obj) { + struct dma_buf_map map; + int ret; + lockdep_assert_held(&etnaviv_obj->lock); - return dma_buf_vmap(etnaviv_obj->base.import_attach->dmabuf); + ret = dma_buf_vmap(etnaviv_obj->base.import_attach->dmabuf, &map); + if (ret) + return NULL; + return map.vaddr; } static int etnaviv_gem_prime_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index 27fddc22a7c6..77b363d3000b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -82,11 +82,18 @@ static void i915_gem_unmap_dma_buf(struct dma_buf_attachment *attachment, i915_gem_object_unpin_pages(obj); } -static void *i915_gem_dmabuf_vmap(struct dma_buf *dma_buf) +static int i915_gem_dmabuf_vmap(struct dma_buf *dma_buf, struct dma_buf_map *map) { struct drm_i915_gem_object *obj = dma_buf_to_obj(dma_buf); + void *vaddr; - return i915_gem_object_pin_map(obj, I915_MAP_WB); + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB); + if (IS_ERR(vaddr)) + return PTR_ERR(vaddr); + + dma_buf_map_set_vaddr(map, vaddr); + + return 0; } static void i915_gem_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c index 0845ce1ae37c..3aad7643b0b7 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c @@ -82,6 +82,7 @@ static int igt_dmabuf_import(void *arg) struct drm_i915_gem_object *obj; struct dma_buf *dmabuf; void *obj_map, *dma_map; + struct dma_buf_map map; u32 pattern[] = { 0, 0xaa, 0xcc, 0x55, 0xff }; int err, i; @@ -110,7 +111,8 @@ static int igt_dmabuf_import(void *arg) goto out_obj; } - dma_map = dma_buf_vmap(dmabuf); + err = 
dma_buf_vmap(dmabuf, &map); + dma_map = err ? NULL : map.vaddr; if (!dma_map) { pr_err("dma_buf_vmap failed\n"); err = -ENOMEM; @@ -163,6 +165,7 @@ static int igt_dmabuf_import_ownership(void *arg) struct drm_i915_private *i915 = arg; struct drm_i915_gem_object *obj; struct dma_buf *dmabuf; + struct dma_buf_map map; void *ptr; int err; @@ -170,7 +173,8 @@ static int igt_dmabuf_import_ownership(void *arg) if (IS_ERR(dmabuf)) return PTR_ERR(dmabuf); - ptr = dma_buf_vmap(dmabuf); + err = dma_buf_vmap(dmabuf, &map); + ptr = err ? NULL : map.vaddr; if (!ptr) { pr_err("dma_buf_vmap failed\n"); err = -ENOMEM; @@ -212,6 +216,7 @@ static int igt_dmabuf_export_vmap(void *arg) struct drm_i915_private *i915 = arg; struct drm_i915_gem_object *obj; struct dma_buf *dmabuf; + struct dma_buf_map map; void *ptr; int err; @@ -228,7 +233,8 @@ static int igt_dmabuf_export_vmap(void *arg) } i915_gem_object_put(obj); - ptr = dma_buf_vmap(dmabuf); + err = dma_buf_vmap(dmabuf, &map); + ptr = err ? NULL : map.vaddr; if (!ptr) { pr_err("dma_buf_vmap failed\n"); err = -ENOMEM; diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c index debaf7b18ab5..dcae46441f7d 100644 --- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c @@ -62,11 +62,17 @@ static void mock_dmabuf_release(struct dma_buf *dma_buf) kfree(mock); } -static void *mock_dmabuf_vmap(struct dma_buf *dma_buf) +static int mock_dmabuf_vmap(struct dma_buf *dma_buf, struct dma_buf_map *map) { struct mock_dmabuf *mock = to_mock(dma_buf); + void *vaddr; - return vm_map_ram(mock->pages, mock->npages, 0); + vaddr = vm_map_ram(mock->pages, mock->npages, 0); + if (!vaddr) + return -ENOMEM; + dma_buf_map_set_vaddr(map, vaddr); + + return 0; } static void mock_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr) diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index d481dea4738d..2b6e06c7a2c5 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -132,14 +132,18 @@ static void tegra_bo_unpin(struct device *dev, struct sg_table *sgt) static void *tegra_bo_mmap(struct host1x_bo *bo) { struct tegra_bo *obj = host1x_to_tegra_bo(bo); + struct dma_buf_map map; + int ret; - if (obj->vaddr) + if (obj->vaddr) { return obj->vaddr; - else if (obj->gem.import_attach) - return dma_buf_vmap(obj->gem.import_attach->dmabuf); - else + } else if (obj->gem.import_attach) { + ret = dma_buf_vmap(obj->gem.import_attach->dmabuf, &map); + return ret ? 
NULL : map.vaddr; + } else { return vmap(obj->pages, obj->num_pages, VM_MAP, pgprot_writecombine(PAGE_KERNEL)); + } } static void tegra_bo_munmap(struct host1x_bo *bo, void *addr) @@ -649,12 +653,14 @@ static int tegra_gem_prime_mmap(struct dma_buf *buf, struct vm_area_struct *vma) return __tegra_gem_mmap(gem, vma); } -static void *tegra_gem_prime_vmap(struct dma_buf *buf) +static int tegra_gem_prime_vmap(struct dma_buf *buf, struct dma_buf_map *map) { struct drm_gem_object *gem = buf->priv; struct tegra_bo *bo = to_tegra_bo(gem); - return bo->vaddr; + dma_buf_map_set_vaddr(map, bo->vaddr); + + return 0; } static void tegra_gem_prime_vunmap(struct dma_buf *buf, void *vaddr) diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c index ec3446cc45b8..11428287bdf3 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c @@ -81,9 +81,13 @@ static void *vb2_dc_cookie(void *buf_priv) static void *vb2_dc_vaddr(void *buf_priv) { struct vb2_dc_buf *buf = buf_priv; + struct dma_buf_map map; + int ret; - if (!buf->vaddr && buf->db_attach) - buf->vaddr = dma_buf_vmap(buf->db_attach->dmabuf); + if (!buf->vaddr && buf->db_attach) { + ret = dma_buf_vmap(buf->db_attach->dmabuf, &map); + buf->vaddr = ret ? NULL : map.vaddr; + } return buf->vaddr; } @@ -365,11 +369,13 @@ vb2_dc_dmabuf_ops_end_cpu_access(struct dma_buf *dbuf, return 0; } -static void *vb2_dc_dmabuf_ops_vmap(struct dma_buf *dbuf) +static int vb2_dc_dmabuf_ops_vmap(struct dma_buf *dbuf, struct dma_buf_map *map) { struct vb2_dc_buf *buf = dbuf->priv; - return buf->vaddr; + dma_buf_map_set_vaddr(map, buf->vaddr); + + return 0; } static int vb2_dc_dmabuf_ops_mmap(struct dma_buf *dbuf, diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c index 0a40e00f0d7e..c51170e9c1b9 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c @@ -300,14 +300,18 @@ static void vb2_dma_sg_put_userptr(void *buf_priv) static void *vb2_dma_sg_vaddr(void *buf_priv) { struct vb2_dma_sg_buf *buf = buf_priv; + struct dma_buf_map map; + int ret; BUG_ON(!buf); if (!buf->vaddr) { - if (buf->db_attach) - buf->vaddr = dma_buf_vmap(buf->db_attach->dmabuf); - else + if (buf->db_attach) { + ret = dma_buf_vmap(buf->db_attach->dmabuf, &map); + buf->vaddr = ret ? 
NULL : map.vaddr; + } else { buf->vaddr = vm_map_ram(buf->pages, buf->num_pages, -1); + } } /* add offset in case userptr is not page-aligned */ @@ -489,11 +493,13 @@ vb2_dma_sg_dmabuf_ops_end_cpu_access(struct dma_buf *dbuf, return 0; } -static void *vb2_dma_sg_dmabuf_ops_vmap(struct dma_buf *dbuf) +static int vb2_dma_sg_dmabuf_ops_vmap(struct dma_buf *dbuf, struct dma_buf_map *map) { struct vb2_dma_sg_buf *buf = dbuf->priv; - return vb2_dma_sg_vaddr(buf); + dma_buf_map_set_vaddr(map, buf->vaddr); + + return 0; } static int vb2_dma_sg_dmabuf_ops_mmap(struct dma_buf *dbuf, diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c index c66fda4a65e4..7b68e2379c65 100644 --- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c +++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c @@ -318,11 +318,13 @@ static void vb2_vmalloc_dmabuf_ops_release(struct dma_buf *dbuf) vb2_vmalloc_put(dbuf->priv); } -static void *vb2_vmalloc_dmabuf_ops_vmap(struct dma_buf *dbuf) +static int vb2_vmalloc_dmabuf_ops_vmap(struct dma_buf *dbuf, struct dma_buf_map *map) { struct vb2_vmalloc_buf *buf = dbuf->priv; - return buf->vaddr; + dma_buf_map_set_vaddr(map, buf->vaddr); + + return 0; } static int vb2_vmalloc_dmabuf_ops_mmap(struct dma_buf *dbuf, @@ -374,10 +376,15 @@ static struct dma_buf *vb2_vmalloc_get_dmabuf(void *buf_priv, unsigned long flag static int vb2_vmalloc_map_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; + struct dma_buf_map map; + int ret; - buf->vaddr = dma_buf_vmap(buf->dbuf); + ret = dma_buf_vmap(buf->dbuf, &map); + if (ret) + return -EFAULT; + buf->vaddr = map.vaddr; - return buf->vaddr ? 0 : -EFAULT; + return 0; } static void vb2_vmalloc_unmap_dmabuf(void *mem_priv) diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 7939c55daceb..b39e198533f0 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -581,11 +581,13 @@ static void fastrpc_dma_buf_detatch(struct dma_buf *dmabuf, kfree(a); } -static void *fastrpc_vmap(struct dma_buf *dmabuf) +static int fastrpc_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map) { struct fastrpc_buf *buf = dmabuf->priv; - return buf->virt; + dma_buf_map_set_vaddr(map, buf->virt); + + return 0; } static int fastrpc_mmap(struct dma_buf *dmabuf, diff --git a/include/drm/drm_prime.h b/include/drm/drm_prime.h index bf141e74a1c2..3ee22639ff77 100644 --- a/include/drm/drm_prime.h +++ b/include/drm/drm_prime.h @@ -54,6 +54,7 @@ struct device; struct dma_buf_export_info; struct dma_buf; struct dma_buf_attachment; +struct dma_buf_map; enum dma_data_direction; @@ -82,7 +83,7 @@ struct sg_table *drm_gem_map_dma_buf(struct dma_buf_attachment *attach, void drm_gem_unmap_dma_buf(struct dma_buf_attachment *attach, struct sg_table *sgt, enum dma_data_direction dir); -void *drm_gem_dmabuf_vmap(struct dma_buf *dma_buf); +int drm_gem_dmabuf_vmap(struct dma_buf *dma_buf, struct dma_buf_map *map); void drm_gem_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr); int drm_gem_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma); diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h index 00143c88feb6..5ae732d373eb 100644 --- a/include/linux/dma-buf-map.h +++ b/include/linux/dma-buf-map.h @@ -23,6 +23,19 @@ struct dma_buf_map { bool is_iomem; }; +/** + * dma_buf_map_set_vaddr - Sets a dma-buf mapping structure to an address in system memory + * @map: The dma-buf mapping structure + * @vaddr: A system-memory address + * + * Sets the address and clears 
the I/O-memory flag. + */ +static inline void dma_buf_map_set_vaddr(struct dma_buf_map *map, void *vaddr) +{ + map->vaddr = vaddr; + map->is_iomem = false; +} + /* API transition helper */ static inline bool dma_buf_map_is_vaddr(const struct dma_buf_map *map, const void *vaddr) { diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index fcc2ddfb6d18..7237997cfa38 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -266,7 +266,7 @@ struct dma_buf_ops { */ int (*mmap)(struct dma_buf *, struct vm_area_struct *vma); - void *(*vmap)(struct dma_buf *); + int (*vmap)(struct dma_buf *dmabuf, struct dma_buf_map *map); void (*vunmap)(struct dma_buf *, void *vaddr); }; @@ -503,6 +503,6 @@ int dma_buf_end_cpu_access(struct dma_buf *dma_buf, int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *, unsigned long); -void *dma_buf_vmap(struct dma_buf *); -void dma_buf_vunmap(struct dma_buf *, void *vaddr); +int dma_buf_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map); +void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr); #endif /* __DMA_BUF_H__ */ -- cgit From 20e76f1a70596590dec32a5d1f598fba04859526 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 25 Sep 2020 13:56:00 +0200 Subject: dma-buf: Use struct dma_buf_map in dma_buf_vunmap() interfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch updates dma_buf_vunmap() and dma-buf's vunmap callback to use struct dma_buf_map. The interfaces used to receive a buffer address. This address is now given in an instance of the structure. Users of the functions are updated accordingly. This is only an interface change. It is currently expected that dma-buf memory can be accessed with system memory load/store operations. v2: * include dma-buf-heaps and i915 selftests (kernel test robot) * initialize cma_obj before using it in drm_gem_cma_free_object() (kernel test robot) Signed-off-by: Thomas Zimmermann Acked-by: Sumit Semwal Acked-by: Christian König Acked-by: Daniel Vetter Acked-by: Tomasz Figa Acked-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20200925115601.23955-4-tzimmermann@suse.de --- drivers/dma-buf/dma-buf.c | 8 +++--- drivers/dma-buf/heaps/heap-helpers.c | 2 +- drivers/gpu/drm/drm_gem_cma_helper.c | 9 +++--- drivers/gpu/drm/drm_gem_shmem_helper.c | 3 +- drivers/gpu/drm/drm_prime.c | 6 ++-- drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c | 5 ++-- drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 2 +- .../gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 6 ++-- drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c | 4 +-- drivers/gpu/drm/tegra/gem.c | 5 ++-- .../media/common/videobuf2/videobuf2-dma-contig.c | 3 +- drivers/media/common/videobuf2/videobuf2-dma-sg.c | 3 +- drivers/media/common/videobuf2/videobuf2-vmalloc.c | 6 ++-- include/drm/drm_prime.h | 2 +- include/linux/dma-buf-map.h | 32 ++++++++++++++++++++-- include/linux/dma-buf.h | 4 +-- 16 files changed, 66 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 61bd24d21b38..a6ba4d598f0e 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1236,21 +1236,21 @@ EXPORT_SYMBOL_GPL(dma_buf_vmap); /** * dma_buf_vunmap - Unmap a vmap obtained by dma_buf_vmap. 
* @dmabuf: [in] buffer to vunmap - * @vaddr: [in] vmap to vunmap + * @map: [in] vmap pointer to vunmap */ -void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr) +void dma_buf_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map) { if (WARN_ON(!dmabuf)) return; BUG_ON(dma_buf_map_is_null(&dmabuf->vmap_ptr)); BUG_ON(dmabuf->vmapping_counter == 0); - BUG_ON(!dma_buf_map_is_vaddr(&dmabuf->vmap_ptr, vaddr)); + BUG_ON(!dma_buf_map_is_equal(&dmabuf->vmap_ptr, map)); mutex_lock(&dmabuf->lock); if (--dmabuf->vmapping_counter == 0) { if (dmabuf->ops->vunmap) - dmabuf->ops->vunmap(dmabuf, vaddr); + dmabuf->ops->vunmap(dmabuf, map); dma_buf_map_clear(&dmabuf->vmap_ptr); } mutex_unlock(&dmabuf->lock); diff --git a/drivers/dma-buf/heaps/heap-helpers.c b/drivers/dma-buf/heaps/heap-helpers.c index 27e54d9c120c..e9fb2818f2d4 100644 --- a/drivers/dma-buf/heaps/heap-helpers.c +++ b/drivers/dma-buf/heaps/heap-helpers.c @@ -252,7 +252,7 @@ static int dma_heap_dma_buf_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map return 0; } -static void dma_heap_dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr) +static void dma_heap_dma_buf_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map) { struct heap_helper_buffer *buffer = dmabuf->priv; diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c index eb412e45f896..5e9d29961307 100644 --- a/drivers/gpu/drm/drm_gem_cma_helper.c +++ b/drivers/gpu/drm/drm_gem_cma_helper.c @@ -175,13 +175,12 @@ drm_gem_cma_create_with_handle(struct drm_file *file_priv, */ void drm_gem_cma_free_object(struct drm_gem_object *gem_obj) { - struct drm_gem_cma_object *cma_obj; - - cma_obj = to_drm_gem_cma_obj(gem_obj); + struct drm_gem_cma_object *cma_obj = to_drm_gem_cma_obj(gem_obj); + struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(cma_obj->vaddr); if (gem_obj->import_attach) { if (cma_obj->vaddr) - dma_buf_vunmap(gem_obj->import_attach->dmabuf, cma_obj->vaddr); + dma_buf_vunmap(gem_obj->import_attach->dmabuf, &map); drm_prime_gem_destroy(gem_obj, cma_obj->sgt); } else if (cma_obj->vaddr) { dma_free_wc(gem_obj->dev->dev, cma_obj->base.size, @@ -645,7 +644,7 @@ drm_gem_cma_prime_import_sg_table_vmap(struct drm_device *dev, obj = drm_gem_cma_prime_import_sg_table(dev, attach, sgt); if (IS_ERR(obj)) { - dma_buf_vunmap(attach->dmabuf, map.vaddr); + dma_buf_vunmap(attach->dmabuf, &map); return obj; } diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index ad10a57cfece..3c2e8cde69a8 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -337,6 +337,7 @@ EXPORT_SYMBOL(drm_gem_shmem_vmap); static void drm_gem_shmem_vunmap_locked(struct drm_gem_shmem_object *shmem) { struct drm_gem_object *obj = &shmem->base; + struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(shmem->vaddr); if (WARN_ON_ONCE(!shmem->vmap_use_count)) return; @@ -345,7 +346,7 @@ static void drm_gem_shmem_vunmap_locked(struct drm_gem_shmem_object *shmem) return; if (obj->import_attach) - dma_buf_vunmap(obj->import_attach->dmabuf, shmem->vaddr); + dma_buf_vunmap(obj->import_attach->dmabuf, &map); else vunmap(shmem->vaddr); diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index 2d75e58d21fe..4910c446db83 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -687,16 +687,16 @@ EXPORT_SYMBOL(drm_gem_dmabuf_vmap); /** * drm_gem_dmabuf_vunmap - dma_buf vunmap implementation for GEM * @dma_buf: buffer to be unmapped - * @vaddr: the virtual address of the buffer + * @map: 
the virtual address of the buffer * * Releases a kernel virtual mapping. This can be used as the * &dma_buf_ops.vunmap callback. Calls into &drm_gem_object_funcs.vunmap for device specific handling. */ -void drm_gem_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr) +void drm_gem_dmabuf_vunmap(struct dma_buf *dma_buf, struct dma_buf_map *map) { struct drm_gem_object *obj = dma_buf->priv; - drm_gem_vunmap(obj, vaddr); + drm_gem_vunmap(obj, map->vaddr); } EXPORT_SYMBOL(drm_gem_dmabuf_vunmap); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c index 80a9fc143bbb..135fbff6fecf 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c @@ -70,9 +70,10 @@ void etnaviv_gem_prime_unpin(struct drm_gem_object *obj) static void etnaviv_gem_prime_release(struct etnaviv_gem_object *etnaviv_obj) { + struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(etnaviv_obj->vaddr); + if (etnaviv_obj->vaddr) - dma_buf_vunmap(etnaviv_obj->base.import_attach->dmabuf, - etnaviv_obj->vaddr); + dma_buf_vunmap(etnaviv_obj->base.import_attach->dmabuf, &map); /* Don't drop the pages for imported dmabuf, as they are not * ours, just free the array we allocated: diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index 77b363d3000b..fec0e1e3dc3e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -96,7 +96,7 @@ static int i915_gem_dmabuf_vmap(struct dma_buf *dma_buf, struct dma_buf_map *map return 0; } -static void i915_gem_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr) +static void i915_gem_dmabuf_vunmap(struct dma_buf *dma_buf, struct dma_buf_map *map) { struct drm_i915_gem_object *obj = dma_buf_to_obj(dma_buf); diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c index 3aad7643b0b7..b6d43880b0c1 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c @@ -152,7 +152,7 @@ static int igt_dmabuf_import(void *arg) err = 0; out_dma_map: - dma_buf_vunmap(dmabuf, dma_map); + dma_buf_vunmap(dmabuf, &map); out_obj: i915_gem_object_put(obj); out_dmabuf: @@ -182,7 +182,7 @@ static int igt_dmabuf_import_ownership(void *arg) } memset(ptr, 0xc5, PAGE_SIZE); - dma_buf_vunmap(dmabuf, ptr); + dma_buf_vunmap(dmabuf, &map); obj = to_intel_bo(i915_gem_prime_import(&i915->drm, dmabuf)); if (IS_ERR(obj)) { @@ -250,7 +250,7 @@ static int igt_dmabuf_export_vmap(void *arg) memset(ptr, 0xc5, dmabuf->size); err = 0; - dma_buf_vunmap(dmabuf, ptr); + dma_buf_vunmap(dmabuf, &map); out: dma_buf_put(dmabuf); return err; diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c index dcae46441f7d..7dfef531f6d4 100644 --- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c @@ -75,11 +75,11 @@ static int mock_dmabuf_vmap(struct dma_buf *dma_buf, struct dma_buf_map *map) return 0; } -static void mock_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr) +static void mock_dmabuf_vunmap(struct dma_buf *dma_buf, struct dma_buf_map *map) { struct mock_dmabuf *mock = to_mock(dma_buf); - vm_unmap_ram(vaddr, mock->npages); + vm_unmap_ram(map->vaddr, mock->npages); } static int mock_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *vma) diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index 
2b6e06c7a2c5..5df577682878 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -149,11 +149,12 @@ static void *tegra_bo_mmap(struct host1x_bo *bo) static void tegra_bo_munmap(struct host1x_bo *bo, void *addr) { struct tegra_bo *obj = host1x_to_tegra_bo(bo); + struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(addr); if (obj->vaddr) return; else if (obj->gem.import_attach) - dma_buf_vunmap(obj->gem.import_attach->dmabuf, addr); + dma_buf_vunmap(obj->gem.import_attach->dmabuf, &map); else vunmap(addr); } @@ -663,7 +664,7 @@ static int tegra_gem_prime_vmap(struct dma_buf *buf, struct dma_buf_map *map) return 0; } -static void tegra_gem_prime_vunmap(struct dma_buf *buf, void *vaddr) +static void tegra_gem_prime_vunmap(struct dma_buf *buf, struct dma_buf_map *map) { } diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c index 11428287bdf3..a1eb8279b113 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c @@ -648,6 +648,7 @@ static void vb2_dc_unmap_dmabuf(void *mem_priv) { struct vb2_dc_buf *buf = mem_priv; struct sg_table *sgt = buf->dma_sgt; + struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(buf->vaddr); if (WARN_ON(!buf->db_attach)) { pr_err("trying to unpin a not attached buffer\n"); @@ -660,7 +661,7 @@ static void vb2_dc_unmap_dmabuf(void *mem_priv) } if (buf->vaddr) { - dma_buf_vunmap(buf->db_attach->dmabuf, buf->vaddr); + dma_buf_vunmap(buf->db_attach->dmabuf, &map); buf->vaddr = NULL; } dma_buf_unmap_attachment(buf->db_attach, sgt, buf->dma_dir); diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c index c51170e9c1b9..d5157e903e27 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c @@ -580,6 +580,7 @@ static void vb2_dma_sg_unmap_dmabuf(void *mem_priv) { struct vb2_dma_sg_buf *buf = mem_priv; struct sg_table *sgt = buf->dma_sgt; + struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(buf->vaddr); if (WARN_ON(!buf->db_attach)) { pr_err("trying to unpin a not attached buffer\n"); @@ -592,7 +593,7 @@ static void vb2_dma_sg_unmap_dmabuf(void *mem_priv) } if (buf->vaddr) { - dma_buf_vunmap(buf->db_attach->dmabuf, buf->vaddr); + dma_buf_vunmap(buf->db_attach->dmabuf, &map); buf->vaddr = NULL; } dma_buf_unmap_attachment(buf->db_attach, sgt, buf->dma_dir); diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c index 7b68e2379c65..11ba0eb1315b 100644 --- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c +++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c @@ -390,17 +390,19 @@ static int vb2_vmalloc_map_dmabuf(void *mem_priv) static void vb2_vmalloc_unmap_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; + struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(buf->vaddr); - dma_buf_vunmap(buf->dbuf, buf->vaddr); + dma_buf_vunmap(buf->dbuf, &map); buf->vaddr = NULL; } static void vb2_vmalloc_detach_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; + struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(buf->vaddr); if (buf->vaddr) - dma_buf_vunmap(buf->dbuf, buf->vaddr); + dma_buf_vunmap(buf->dbuf, &map); kfree(buf); } diff --git a/include/drm/drm_prime.h b/include/drm/drm_prime.h index 3ee22639ff77..093f760cc131 100644 --- a/include/drm/drm_prime.h +++ b/include/drm/drm_prime.h @@ -84,7 +84,7 @@ void 
drm_gem_unmap_dma_buf(struct dma_buf_attachment *attach, struct sg_table *sgt, enum dma_data_direction dir); int drm_gem_dmabuf_vmap(struct dma_buf *dma_buf, struct dma_buf_map *map); -void drm_gem_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr); +void drm_gem_dmabuf_vunmap(struct dma_buf *dma_buf, struct dma_buf_map *map); int drm_gem_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma); int drm_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *vma); diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h index 5ae732d373eb..c173a4abf4ba 100644 --- a/include/linux/dma-buf-map.h +++ b/include/linux/dma-buf-map.h @@ -23,6 +23,16 @@ struct dma_buf_map { bool is_iomem; }; +/** + * DMA_BUF_MAP_INIT_VADDR - Initializes struct dma_buf_map to an address in system memory + * @vaddr: A system-memory address + */ +#define DMA_BUF_MAP_INIT_VADDR(vaddr_) \ + { \ + .vaddr = (vaddr_), \ + .is_iomem = false, \ + } + /** * dma_buf_map_set_vaddr - Sets a dma-buf mapping structure to an address in system memory * @map: The dma-buf mapping structure @@ -36,10 +46,26 @@ static inline void dma_buf_map_set_vaddr(struct dma_buf_map *map, void *vaddr) map->is_iomem = false; } -/* API transition helper */ -static inline bool dma_buf_map_is_vaddr(const struct dma_buf_map *map, const void *vaddr) +/** + * dma_buf_map_is_equal - Compares two dma-buf mapping structures for equality + * @lhs: The dma-buf mapping structure + * @rhs: A dma-buf mapping structure to compare with + * + * Two dma-buf mapping structures are equal if they both refer to the same type of memory + * and to the same address within that memory. + * + * Returns: + * True if both structures are equal, or false otherwise. + */ +static inline bool dma_buf_map_is_equal(const struct dma_buf_map *lhs, + const struct dma_buf_map *rhs) { - return !map->is_iomem && (map->vaddr == vaddr); + if (lhs->is_iomem != rhs->is_iomem) + return false; + else if (lhs->is_iomem) + return lhs->vaddr_iomem == rhs->vaddr_iomem; + else + return lhs->vaddr == rhs->vaddr; } /** diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 7237997cfa38..cf77cc15f4ba 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -267,7 +267,7 @@ struct dma_buf_ops { int (*mmap)(struct dma_buf *, struct vm_area_struct *vma); int (*vmap)(struct dma_buf *dmabuf, struct dma_buf_map *map); - void (*vunmap)(struct dma_buf *, void *vaddr); + void (*vunmap)(struct dma_buf *dmabuf, struct dma_buf_map *map); }; /** @@ -504,5 +504,5 @@ int dma_buf_end_cpu_access(struct dma_buf *dma_buf, int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *, unsigned long); int dma_buf_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map); -void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr); +void dma_buf_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map); #endif /* __DMA_BUF_H__ */ -- cgit From ccc22d41bd9acec58cdc7c3b012e1887aba40af4 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 25 Sep 2020 13:56:01 +0200 Subject: dma-buf: Document struct dma_buf_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds struct dma_buf_map and its helpers to the documentation. A short tutorial is included. 
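As a taste of the tutorial's content, here is a minimal sketch built from the helpers added above (the wrapper function and the WARN_ON() checks are illustrative only, not part of the patch):

	#include <linux/dma-buf-map.h>

	static void dma_buf_map_example(void *vaddr)
	{
		struct dma_buf_map a = DMA_BUF_MAP_INIT_VADDR(vaddr);	/* static init */
		struct dma_buf_map b;

		dma_buf_map_set_vaddr(&b, vaddr);	/* runtime init */

		/* Same memory space, same address: the mappings compare equal. */
		WARN_ON(!dma_buf_map_is_equal(&a, &b));

		dma_buf_map_clear(&a);		/* cleared mappings test as NULL */
		WARN_ON(!dma_buf_map_is_null(&a));
	}
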
v3: * update documentation in a separate patch * expand docs (Daniel) * carry-over acks from patch 1 Signed-off-by: Thomas Zimmermann Reviewed-by: Christian König Reviewed-by: Daniel Vetter Acked-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20200925115601.23955-5-tzimmermann@suse.de --- Documentation/driver-api/dma-buf.rst | 9 +++++ include/linux/dma-buf-map.h | 72 ++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) (limited to 'include/linux') diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst index 13ea0cc0a3fa..6dbcc4714b0b 100644 --- a/Documentation/driver-api/dma-buf.rst +++ b/Documentation/driver-api/dma-buf.rst @@ -115,6 +115,15 @@ Kernel Functions and Structures Reference .. kernel-doc:: include/linux/dma-buf.h :internal: +Buffer Mapping Helpers +~~~~~~~~~~~~~~~~~~~~~~ + +.. kernel-doc:: include/linux/dma-buf-map.h + :doc: overview + +.. kernel-doc:: include/linux/dma-buf-map.h + :internal: + Reservation Objects ------------------- diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h index c173a4abf4ba..fd1aba545fdf 100644 --- a/include/linux/dma-buf-map.h +++ b/include/linux/dma-buf-map.h @@ -8,6 +8,78 @@ #include +/** + * DOC: overview + * + * Calling dma-buf's vmap operation returns a pointer to the buffer's memory. + * Depending on the location of the buffer, users may have to access it with + * I/O operations or memory load/store operations. For example, copying to + * system memory could be done with memcpy(), copying to I/O memory would be + * done with memcpy_toio(). + * + * .. code-block:: c + * + * void *vaddr = ...; // pointer to system memory + * memcpy(vaddr, src, len); + * + * void *vaddr_iomem = ...; // pointer to I/O memory + * memcpy_toio(vaddr_iomem, src, len); + * + * When using dma-buf's vmap operation, the returned pointer is encoded as + * :c:type:`struct dma_buf_map <dma_buf_map>`. + * :c:type:`struct dma_buf_map <dma_buf_map>` stores the buffer's address in + * system or I/O memory and a flag that signals the required method of + * accessing the buffer. Use the returned instance and the helper functions + * to access the buffer's memory in the correct way. + * + * Open-coding access to :c:type:`struct dma_buf_map <dma_buf_map>` is + * considered bad style. Rather than accessing its fields directly, use one + * of the provided helper functions, or implement your own. For example, + * instances of :c:type:`struct dma_buf_map <dma_buf_map>` can be initialized + * statically with DMA_BUF_MAP_INIT_VADDR(), or at runtime with + * dma_buf_map_set_vaddr(). These helpers will set an address in system memory. + * + * .. code-block:: c + * + * struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(0xdeadbeaf); + * + * dma_buf_map_set_vaddr(&map, 0xdeadbeaf); + * + * Test if a mapping is valid with either dma_buf_map_is_set() or + * dma_buf_map_is_null(). + * + * .. code-block:: c + * + * if (dma_buf_map_is_set(&map) != dma_buf_map_is_null(&map)) + * // always true + * + * Instances of :c:type:`struct dma_buf_map <dma_buf_map>` can be compared + * for equality with dma_buf_map_is_equal(). Mappings that point to different + * memory spaces, system or I/O, are never equal. That's even true if both + * spaces are located in the same address space, both mappings contain the + * same address value, or both mappings refer to NULL. + * + * .. 
code-block:: c + * + * struct dma_buf_map sys_map; // refers to system memory + * struct dma_buf_map io_map; // refers to I/O memory + * + * if (dma_buf_map_is_equal(&sys_map, &io_map)) + * // always false + * + * Instances of struct dma_buf_map do not have to be cleaned up, but + * can be cleared to NULL with dma_buf_map_clear(). Cleared mappings + * always refer to system memory. + * + * The type :c:type:`struct dma_buf_map <dma_buf_map>` and its helpers are + * actually independent of the dma-buf infrastructure. When sharing buffers + * among devices, drivers have to know the location of the memory to access + * the buffers in a safe way. :c:type:`struct dma_buf_map <dma_buf_map>` + * solves this problem for dma-buf and its users. If other drivers or + * sub-systems require similar functionality, the type could be generalized + * and moved to a more prominent header file. + */ + /** * struct dma_buf_map - Pointer to vmap'ed dma-buf memory. * @vaddr_iomem: The buffer's address if in I/O memory -- cgit From ead1c9f376dbb2805796098ed6d2a70921c77ee5 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Wed, 30 Sep 2020 16:50:48 +0300 Subject: iio: adc: at91_adc: remove platform data and move defs in driver file No mainline board still uses 'at91_add_device_adc' to pass platform data to the AT91 ADC driver, so the platform-data probe path is now dead code. This change removes the platform-data initialization from the driver. Some definitions from the platform-data header at91_adc.h have been moved into the driver file, since they are still needed there. Signed-off-by: Alexandru Ardelean Reviewed-by: Alexandre Belloni Link: https://lore.kernel.org/r/20200930135048.11530-5-alexandru.ardelean@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/at91_adc.c | 60 +++++++++++++--------------------- include/linux/platform_data/at91_adc.h | 49 --------------------------- 2 files changed, 22 insertions(+), 87 deletions(-) delete mode 100644 include/linux/platform_data/at91_adc.h (limited to 'include/linux') diff --git a/drivers/iio/adc/at91_adc.c b/drivers/iio/adc/at91_adc.c index 473bffe84fbd..601708168082 100644 --- a/drivers/iio/adc/at91_adc.c +++ b/drivers/iio/adc/at91_adc.c @@ -22,8 +22,6 @@ #include #include -#include - #include #include #include @@ -153,6 +151,25 @@ #define TOUCH_SHTIM 0xa #define TOUCH_SCTIM_US 10 /* 10us for the Touchscreen Switches Closure Time */ +enum atmel_adc_ts_type { + ATMEL_ADC_TOUCHSCREEN_NONE = 0, + ATMEL_ADC_TOUCHSCREEN_4WIRE = 4, + ATMEL_ADC_TOUCHSCREEN_5WIRE = 5, +}; + +/** + * struct at91_adc_trigger - description of triggers + * @name: name of the trigger advertised to the user + * @value: value to set in the ADC's trigger setup register + * to enable the trigger + * @is_external: Does the trigger rely on an external pin? 
+ */ +struct at91_adc_trigger { + const char *name; + u8 value; + bool is_external; +}; + /** * struct at91_adc_reg_desc - Various informations relative to registers * @channel_base: Base offset for the channel data registers @@ -873,9 +890,6 @@ static int at91_adc_probe_dt(struct iio_dev *idev, int i = 0, ret; u32 prop; - if (!node) - return -EINVAL; - st->caps = of_device_get_match_data(&pdev->dev); st->use_external = of_property_read_bool(node, "atmel,adc-use-external-triggers"); @@ -957,30 +971,6 @@ error_ret: return ret; } -static int at91_adc_probe_pdata(struct at91_adc_state *st, - struct platform_device *pdev) -{ - struct at91_adc_data *pdata = pdev->dev.platform_data; - - if (!pdata) - return -EINVAL; - - st->caps = (struct at91_adc_caps *) - platform_get_device_id(pdev)->driver_data; - - st->use_external = pdata->use_external_triggers; - st->vref_mv = pdata->vref; - st->channels_mask = pdata->channels_used; - st->num_channels = st->caps->num_channels; - st->startup_time = pdata->startup_time; - st->trigger_number = pdata->trigger_number; - st->trigger_list = pdata->trigger_list; - st->registers = &st->caps->registers; - st->touchscreen_type = pdata->touchscreen_type; - - return 0; -} - static const struct iio_info at91_adc_info = { .read_raw = &at91_adc_read_raw, }; @@ -1157,15 +1147,9 @@ static int at91_adc_probe(struct platform_device *pdev) st = iio_priv(idev); - if (pdev->dev.of_node) - ret = at91_adc_probe_dt(idev, pdev); - else - ret = at91_adc_probe_pdata(st, pdev); - - if (ret) { - dev_err(&pdev->dev, "No platform data available.\n"); - return -EINVAL; - } + ret = at91_adc_probe_dt(idev, pdev); + if (ret) + return ret; platform_set_drvdata(pdev, idev); diff --git a/include/linux/platform_data/at91_adc.h b/include/linux/platform_data/at91_adc.h deleted file mode 100644 index f20eaeb827ce..000000000000 --- a/include/linux/platform_data/at91_adc.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2011 Free Electrons - */ - -#ifndef _AT91_ADC_H_ -#define _AT91_ADC_H_ - -enum atmel_adc_ts_type { - ATMEL_ADC_TOUCHSCREEN_NONE = 0, - ATMEL_ADC_TOUCHSCREEN_4WIRE = 4, - ATMEL_ADC_TOUCHSCREEN_5WIRE = 5, -}; - -/** - * struct at91_adc_trigger - description of triggers - * @name: name of the trigger advertised to the user - * @value: value to set in the ADC's trigger setup register - to enable the trigger - * @is_external: Does the trigger rely on an external pin? 
- */ -struct at91_adc_trigger { - const char *name; - u8 value; - bool is_external; -}; - -/** - * struct at91_adc_data - platform data for ADC driver - * @channels_used: channels in use on the board as a bitmask - * @startup_time: startup time of the ADC in microseconds - * @trigger_list: Triggers available in the ADC - * @trigger_number: Number of triggers available in the ADC - * @use_external_triggers: does the board has external triggers availables - * @vref: Reference voltage for the ADC in millivolts - * @touchscreen_type: If a touchscreen is connected, its type (4 or 5 wires) - */ -struct at91_adc_data { - unsigned long channels_used; - u8 startup_time; - struct at91_adc_trigger *trigger_list; - u8 trigger_number; - bool use_external_triggers; - u16 vref; - enum atmel_adc_ts_type touchscreen_type; -}; - -extern void __init at91_add_device_adc(struct at91_adc_data *data); -#endif -- cgit From 5483b8d5015bb366c372870cfe4448742082e41f Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Fri, 2 Oct 2020 11:27:23 +0300 Subject: iio: adc: ad7887: invert/rework external ref logic This change inverts/reworks the logic to use an external reference via a provided regulator. Now the driver tries to obtain a regulator. If one is found, then it is used. The rest of the driver logic already checks if there is a non-NULL reference to a regulator, so it should be fine. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20201002082723.184810-1-alexandru.ardelean@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7887.c | 12 ++++++++---- include/linux/platform_data/ad7887.h | 4 ---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/adc/ad7887.c b/drivers/iio/adc/ad7887.c index 037bcb47693c..99a480ad3985 100644 --- a/drivers/iio/adc/ad7887.c +++ b/drivers/iio/adc/ad7887.c @@ -246,11 +246,15 @@ static int ad7887_probe(struct spi_device *spi) st = iio_priv(indio_dev); - if (!pdata || !pdata->use_onchip_ref) { - st->reg = devm_regulator_get(&spi->dev, "vref"); - if (IS_ERR(st->reg)) + st->reg = devm_regulator_get_optional(&spi->dev, "vref"); + if (IS_ERR(st->reg)) { + if (PTR_ERR(st->reg) != -ENODEV) return PTR_ERR(st->reg); + st->reg = NULL; + } + + if (st->reg) { ret = regulator_enable(st->reg); if (ret) return ret; @@ -269,7 +273,7 @@ static int ad7887_probe(struct spi_device *spi) /* Setup default message */ mode = AD7887_PM_MODE4; - if (!pdata || !pdata->use_onchip_ref) + if (!st->reg) mode |= AD7887_REF_DIS; if (pdata && pdata->en_dual) mode |= AD7887_DUAL; diff --git a/include/linux/platform_data/ad7887.h b/include/linux/platform_data/ad7887.h index 732af46b2d16..9b4dca6ae70b 100644 --- a/include/linux/platform_data/ad7887.h +++ b/include/linux/platform_data/ad7887.h @@ -13,13 +13,9 @@ * second input channel, and Vref is internally connected to Vdd. If set to * false the device is used in single channel mode and AIN1/Vref is used as * VREF input. - * @use_onchip_ref: Whether to use the onchip reference. If set to true the - * internal 2.5V reference is used. If set to false a external reference is - * used. */ struct ad7887_platform_data { bool en_dual; - bool use_onchip_ref; }; #endif /* IIO_ADC_AD7887_H_ */ -- cgit From 28963f2f6b46d75bda8fed15bd5ce9923427a40d Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 1 Oct 2020 17:10:48 +0300 Subject: iio: adc: ad7298: rework external ref setup & remove platform data This change removes the old platform data for ad7298. 
It was only used to indicate whether an external regulator should be used as the reference, so the logic is inverted a bit: the driver now tries to obtain a regulator, and if one is provided, the external reference is used. The rest of the logic should work as before. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20201001141048.69050-1-alexandru.ardelean@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7298.c | 17 +++++++++-------- include/linux/platform_data/ad7298.h | 19 ------------------- 2 files changed, 9 insertions(+), 27 deletions(-) delete mode 100644 include/linux/platform_data/ad7298.h (limited to 'include/linux') diff --git a/drivers/iio/adc/ad7298.c b/drivers/iio/adc/ad7298.c index 48d43cb0f932..fa1047f74a1f 100644 --- a/drivers/iio/adc/ad7298.c +++ b/drivers/iio/adc/ad7298.c @@ -23,8 +23,6 @@ #include #include -#include - #define AD7298_WRITE BIT(15) /* write to the control register */ #define AD7298_REPEAT BIT(14) /* repeated conversion enable */ #define AD7298_CH(x) BIT(13 - (x)) /* channel select */ @@ -283,7 +281,6 @@ static const struct iio_info ad7298_info = { static int ad7298_probe(struct spi_device *spi) { - struct ad7298_platform_data *pdata = spi->dev.platform_data; struct ad7298_state *st; struct iio_dev *indio_dev; int ret; @@ -294,14 +291,18 @@ static int ad7298_probe(struct spi_device *spi) st = iio_priv(indio_dev); - if (pdata && pdata->ext_ref) + st->reg = devm_regulator_get_optional(&spi->dev, "vref"); + if (!IS_ERR(st->reg)) { st->ext_ref = AD7298_EXTREF; + } else { + ret = PTR_ERR(st->reg); + if (ret != -ENODEV) + return ret; - if (st->ext_ref) { - st->reg = devm_regulator_get(&spi->dev, "vref"); - if (IS_ERR(st->reg)) - return PTR_ERR(st->reg); + st->reg = NULL; + } + if (st->reg) { ret = regulator_enable(st->reg); if (ret) return ret; diff --git a/include/linux/platform_data/ad7298.h b/include/linux/platform_data/ad7298.h deleted file mode 100644 index 3e0ffe2d5d3d..000000000000 --- a/include/linux/platform_data/ad7298.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * AD7298 SPI ADC driver - * - * Copyright 2011 Analog Devices Inc. - */ - -#ifndef __LINUX_PLATFORM_DATA_AD7298_H__ -#define __LINUX_PLATFORM_DATA_AD7298_H__ - -/** - * struct ad7298_platform_data - Platform data for the ad7298 ADC driver - * @ext_ref: Whether to use an external reference voltage. - **/ -struct ad7298_platform_data { - bool ext_ref; -}; - -#endif /* IIO_ADC_AD7298_H_ */ -- cgit From 223f4d9517f8d97721647297b1cde41241a45233 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 1 Oct 2020 17:10:04 +0300 Subject: iio: dac: ad7303: remove platform data header The information in the ad7303 platform_data header is unused, so it's dead code. This change removes it and its inclusion from the driver. 
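The ad7887 and ad7298 conversions above share the same optional-regulator pattern; factored into a hypothetical helper for illustration (the function name is made up, the calls are the ones used in the diffs):

	#include <linux/err.h>
	#include <linux/regulator/consumer.h>

	/*
	 * Returns the external "vref" regulator, NULL when none is provided
	 * (meaning: use the internal reference), or an ERR_PTR() on a real
	 * error; -ENODEV from devm_regulator_get_optional() just means the
	 * supply is not wired up.
	 */
	static struct regulator *example_get_optional_vref(struct device *dev)
	{
		struct regulator *reg = devm_regulator_get_optional(dev, "vref");

		if (IS_ERR(reg) && PTR_ERR(reg) == -ENODEV)
			return NULL;

		return reg;
	}
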
Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20201001141004.53846-1-alexandru.ardelean@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/dac/ad7303.c | 2 -- include/linux/platform_data/ad7303.h | 20 -------------------- 2 files changed, 22 deletions(-) delete mode 100644 include/linux/platform_data/ad7303.h (limited to 'include/linux') diff --git a/drivers/iio/dac/ad7303.c b/drivers/iio/dac/ad7303.c index 2e46def9d8ee..dbb4645ab6b1 100644 --- a/drivers/iio/dac/ad7303.c +++ b/drivers/iio/dac/ad7303.c @@ -17,8 +17,6 @@ #include #include -#include - #define AD7303_CFG_EXTERNAL_VREF BIT(15) #define AD7303_CFG_POWER_DOWN(ch) BIT(11 + (ch)) #define AD7303_CFG_ADDR_OFFSET 10 diff --git a/include/linux/platform_data/ad7303.h b/include/linux/platform_data/ad7303.h deleted file mode 100644 index c2bd0a13bea1..000000000000 --- a/include/linux/platform_data/ad7303.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Analog Devices AD7303 DAC driver - * - * Copyright 2013 Analog Devices Inc. - */ - -#ifndef __IIO_ADC_AD7303_H__ -#define __IIO_ADC_AD7303_H__ - -/** - * struct ad7303_platform_data - AD7303 platform data - * @use_external_ref: If set to true use an external voltage reference connected - * to the REF pin, otherwise use the internal reference derived from Vdd. - */ -struct ad7303_platform_data { - bool use_external_ref; -}; - -#endif -- cgit From 476c5818c37a7828d558f34ae01f0c32f8bfadde Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:13 +0900 Subject: llist: Add nonatomic __llist_add() and __llist_del_all() We'll use these in the new, lockless kretprobes code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870619318.1229682.13027387548510028721.stgit@devnote2 --- drivers/gpu/drm/i915/i915_request.c | 6 ------ include/linux/llist.h | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 0b2fe55e6194..0e851b925c8c 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -357,12 +357,6 @@ void i915_request_retire_upto(struct i915_request *rq) } while (i915_request_retire(tmp) && tmp != rq); } -static void __llist_add(struct llist_node *node, struct llist_head *head) -{ - node->next = head->first; - head->first = node; -} - static struct i915_request * const * __engine_active(struct intel_engine_cs *engine) { diff --git a/include/linux/llist.h b/include/linux/llist.h index 2e9c7215882b..24f207b0190b 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -197,6 +197,16 @@ static inline struct llist_node *llist_next(struct llist_node *node) extern bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head); + +static inline bool __llist_add_batch(struct llist_node *new_first, + struct llist_node *new_last, + struct llist_head *head) +{ + new_last->next = head->first; + head->first = new_first; + return new_last->next == NULL; +} + /** * llist_add - add a new entry * @new: new entry to be added @@ -209,6 +219,11 @@ static inline bool llist_add(struct llist_node *new, struct llist_head *head) return llist_add_batch(new, new, head); } +static inline bool __llist_add(struct llist_node *new, struct llist_head *head) +{ + return __llist_add_batch(new, new, head); +} + /** * llist_del_all - delete 
all entries from lock-less list * @head: the head of lock-less list to delete all entries @@ -222,6 +237,14 @@ static inline struct llist_node *llist_del_all(struct llist_head *head) return xchg(&head->first, NULL); } +static inline struct llist_node *__llist_del_all(struct llist_head *head) +{ + struct llist_node *first = head->first; + + head->first = NULL; + return first; +} + extern struct llist_node *llist_del_first(struct llist_head *head); struct llist_node *llist_reverse_order(struct llist_node *head); -- cgit From d741bf41d7c7db4898bacfcb020353cddc032fd8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:24 +0900 Subject: kprobes: Remove kretprobe hash The kretprobe hash is mostly superfluous, replace it with a per-task variable. This gets rid of the task hash and its related locking. Note that this may change the kprobes module-exported API for kretprobe handlers. If any out-of-tree kretprobe user uses ri->rp, use get_kretprobe(ri) instead. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870620431.1229682.16325792502413731312.stgit@devnote2 --- include/linux/kprobes.h | 19 +++- include/linux/sched.h | 4 + kernel/fork.c | 4 + kernel/kprobes.c | 236 +++++++++++++------------------------------- kernel/trace/trace_kprobe.c | 3 +- 5 files changed, 97 insertions(+), 169 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 5c8c271fa1e9..00cf4421efd5 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #ifdef CONFIG_KPROBES @@ -144,6 +145,11 @@ static inline int kprobe_ftrace(struct kprobe *p) * ignored, due to maxactive being too low. * */ +struct kretprobe_holder { + struct kretprobe *rp; + refcount_t ref; +}; + struct kretprobe { struct kprobe kp; kretprobe_handler_t handler; @@ -152,17 +158,18 @@ struct kretprobe { int nmissed; size_t data_size; struct hlist_head free_instances; + struct kretprobe_holder *rph; raw_spinlock_t lock; }; struct kretprobe_instance { union { + struct llist_node llist; struct hlist_node hlist; struct rcu_head rcu; }; - struct kretprobe *rp; + struct kretprobe_holder *rph; kprobe_opcode_t *ret_addr; - struct task_struct *task; void *fp; char data[]; }; @@ -221,6 +228,14 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs, return ret; } +static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), + "Kretprobe is accessed from instance under preemptive context"); + + return READ_ONCE(ri->rph->rp); +} + #else /* CONFIG_KRETPROBES */ static inline void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) diff --git a/include/linux/sched.h b/include/linux/sched.h index afe01e232935..5911805cafde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1315,6 +1315,10 @@ struct task_struct { struct callback_head mce_kill_me; #endif +#ifdef CONFIG_KRETPROBES + struct llist_head kretprobe_instances; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. 
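For the out-of-tree users mentioned in the message above, the conversion is mechanical; a hypothetical return handler using the get_kretprobe() helper from the kprobes.h hunk (sketch only):

	#include <linux/kprobes.h>

	static int example_ret_handler(struct kretprobe_instance *ri,
				       struct pt_regs *regs)
	{
		/*
		 * ri->rp is gone; resolve the kretprobe through the holder.
		 * Only valid from handler context (RCU-protected), and it
		 * can be NULL while the kretprobe is being unregistered.
		 */
		struct kretprobe *rp = get_kretprobe(ri);

		if (rp)
			pr_info("returning from %ps\n", rp->kp.addr);

		return 0;
	}
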
diff --git a/kernel/fork.c b/kernel/fork.c index 49677d668de4..53a1f508a097 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2161,6 +2161,10 @@ static __latent_entropy struct task_struct *copy_process( INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; +#ifdef CONFIG_KRETPROBES + p->kretprobe_instances.first = NULL; +#endif + /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3b61ae8ff5da..850ee36a4051 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -53,7 +53,6 @@ static int kprobes_initialized; * - RCU hlist traversal under disabling preempt (breakpoint handlers) */ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; -static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobes_all_disarmed; @@ -61,9 +60,6 @@ static bool kprobes_all_disarmed; /* This protects kprobe_table and optimizing_list */ static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; -static struct { - raw_spinlock_t lock ____cacheline_aligned_in_smp; -} kretprobe_table_locks[KPROBE_TABLE_SIZE]; kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, unsigned int __unused) @@ -71,11 +67,6 @@ kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); } -static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) -{ - return &(kretprobe_table_locks[hash].lock); -} - /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1223,65 +1214,30 @@ void kprobes_inc_nmissed_count(struct kprobe *p) } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); +static void free_rp_inst_rcu(struct rcu_head *head) +{ + struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); + + if (refcount_dec_and_test(&ri->rph->ref)) + kfree(ri->rph); + kfree(ri); +} +NOKPROBE_SYMBOL(free_rp_inst_rcu); + static void recycle_rp_inst(struct kretprobe_instance *ri) { - struct kretprobe *rp = ri->rp; + struct kretprobe *rp = get_kretprobe(ri); - /* remove rp inst off the rprobe_inst_table */ - hlist_del(&ri->hlist); INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { raw_spin_lock(&rp->lock); hlist_add_head(&ri->hlist, &rp->free_instances); raw_spin_unlock(&rp->lock); } else - kfree_rcu(ri, rcu); + call_rcu(&ri->rcu, free_rp_inst_rcu); } NOKPROBE_SYMBOL(recycle_rp_inst); -static void kretprobe_hash_lock(struct task_struct *tsk, - struct hlist_head **head, unsigned long *flags) -__acquires(hlist_lock) -{ - unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - raw_spinlock_t *hlist_lock; - - *head = &kretprobe_inst_table[hash]; - hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); -} -NOKPROBE_SYMBOL(kretprobe_hash_lock); - -static void kretprobe_table_lock(unsigned long hash, - unsigned long *flags) -__acquires(hlist_lock) -{ - raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); -} -NOKPROBE_SYMBOL(kretprobe_table_lock); - -static void kretprobe_hash_unlock(struct task_struct *tsk, - unsigned long *flags) -__releases(hlist_lock) -{ - unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - raw_spinlock_t *hlist_lock; - - hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_unlock_irqrestore(hlist_lock, *flags); -} 
-NOKPROBE_SYMBOL(kretprobe_hash_unlock); - -static void kretprobe_table_unlock(unsigned long hash, - unsigned long *flags) -__releases(hlist_lock) -{ - raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_unlock_irqrestore(hlist_lock, *flags); -} -NOKPROBE_SYMBOL(kretprobe_table_unlock); - static struct kprobe kprobe_busy = { .addr = (void *) get_kprobe, }; @@ -1311,24 +1267,21 @@ void kprobe_busy_end(void) void kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; - struct hlist_head *head; - struct hlist_node *tmp; - unsigned long hash, flags = 0; + struct llist_node *node; + /* Early boot, not yet initialized. */ if (unlikely(!kprobes_initialized)) - /* Early boot. kretprobe_table_locks not yet initialized. */ return; kprobe_busy_begin(); - hash = hash_ptr(tk, KPROBE_HASH_BITS); - head = &kretprobe_inst_table[hash]; - kretprobe_table_lock(hash, &flags); - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task == tk) - recycle_rp_inst(ri); + node = __llist_del_all(&tk->kretprobe_instances); + while (node) { + ri = container_of(node, struct kretprobe_instance, llist); + node = node->next; + + recycle_rp_inst(ri); } - kretprobe_table_unlock(hash, &flags); kprobe_busy_end(); } @@ -1338,36 +1291,19 @@ static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; struct hlist_node *next; + int count = 0; hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { hlist_del(&ri->hlist); kfree(ri); + count++; } -} - -static void cleanup_rp_inst(struct kretprobe *rp) -{ - unsigned long flags, hash; - struct kretprobe_instance *ri; - struct hlist_node *next; - struct hlist_head *head; - /* To avoid recursive kretprobe by NMI, set kprobe busy here */ - kprobe_busy_begin(); - for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { - kretprobe_table_lock(hash, &flags); - head = &kretprobe_inst_table[hash]; - hlist_for_each_entry_safe(ri, next, head, hlist) { - if (ri->rp == rp) - ri->rp = NULL; - } - kretprobe_table_unlock(hash, &flags); + if (refcount_sub_and_test(count, &rp->rph->ref)) { + kfree(rp->rph); + rp->rph = NULL; } - kprobe_busy_end(); - - free_rp_inst(rp); } -NOKPROBE_SYMBOL(cleanup_rp_inst); /* Add the new probe to ap->list */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) @@ -1928,88 +1864,56 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, void *trampoline_address, void *frame_pointer) { - struct kretprobe_instance *ri = NULL, *last = NULL; - struct hlist_head *head; - struct hlist_node *tmp; - unsigned long flags; kprobe_opcode_t *correct_ret_addr = NULL; - bool skipped = false; + struct kretprobe_instance *ri = NULL; + struct llist_node *first, *node; + struct kretprobe *rp; - kretprobe_hash_lock(current, &head, &flags); + /* Find all nodes for this frame. */ + first = node = current->kretprobe_instances.first; + while (node) { + ri = container_of(node, struct kretprobe_instance, llist); - /* - * It is possible to have multiple instances associated with a given - * task either because multiple functions in the call path have - * return probes installed on them, and/or more than one - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always pushed into the head of the list - * - when multiple return probes are registered for the same - * function, the (chronologically) first instance's ret_addr - * will be the real return address, and all the rest will - * point to kretprobe_trampoline. 
- */ - hlist_for_each_entry(ri, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - /* - * Return probes must be pushed on this hash list correct - * order (same as return order) so that it can be popped - * correctly. However, if we find it is pushed it incorrect - * order, this means we find a function which should not be - * probed, because the wrong order entry is pushed on the - * path of processing other kretprobe itself. - */ - if (ri->fp != frame_pointer) { - if (!skipped) - pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); - skipped = true; - continue; - } + BUG_ON(ri->fp != frame_pointer); - correct_ret_addr = ri->ret_addr; - if (skipped) - pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", - ri->rp->kp.addr); - - if (correct_ret_addr != trampoline_address) + if (ri->ret_addr != trampoline_address) { + correct_ret_addr = ri->ret_addr; /* * This is the real return address. Any other * instances associated with this task are for * other calls deeper on the call stack */ - break; + goto found; + } + + node = node->next; } + pr_err("Oops! Kretprobe fails to find correct return address.\n"); + BUG_ON(1); - BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address)); - last = ri; +found: + /* Unlink all nodes for this frame. */ + current->kretprobe_instances.first = node->next; + node->next = NULL; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - if (ri->fp != frame_pointer) - continue; + /* Run them.. */ + while (first) { + ri = container_of(first, struct kretprobe_instance, llist); + first = first->next; - if (ri->rp && ri->rp->handler) { + rp = get_kretprobe(ri); + if (rp && rp->handler) { struct kprobe *prev = kprobe_running(); - __this_cpu_write(current_kprobe, &ri->rp->kp); + __this_cpu_write(current_kprobe, &rp->kp); ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); + rp->handler(ri, regs); __this_cpu_write(current_kprobe, prev); } recycle_rp_inst(ri); - - if (ri == last) - break; } - kretprobe_hash_unlock(current, &flags); - return (unsigned long)correct_ret_addr; } NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) @@ -2021,11 +1925,10 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long hash, flags = 0; + unsigned long flags = 0; struct kretprobe_instance *ri; /* TODO: consider to only swap the RA after the last pre_handler fired */ - hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, @@ -2033,9 +1936,6 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) hlist_del(&ri->hlist); raw_spin_unlock_irqrestore(&rp->lock, flags); - ri->rp = rp; - ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) { raw_spin_lock_irqsave(&rp->lock, flags); hlist_add_head(&ri->hlist, &rp->free_instances); @@ -2045,11 +1945,8 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) arch_prepare_kretprobe(ri, regs); - /* XXX(hch): why is there no hlist_move_head? 
*/ - INIT_HLIST_NODE(&ri->hlist); - kretprobe_table_lock(hash, &flags); - hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]); - kretprobe_table_unlock(hash, &flags); + __llist_add(&ri->llist, ¤t->kretprobe_instances); + } else { rp->nmissed++; raw_spin_unlock_irqrestore(&rp->lock, flags); @@ -2112,16 +2009,24 @@ int register_kretprobe(struct kretprobe *rp) } raw_spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); + rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL); + if (!rp->rph) + return -ENOMEM; + + rp->rph->rp = rp; for (i = 0; i < rp->maxactive; i++) { - inst = kmalloc(sizeof(struct kretprobe_instance) + + inst = kzalloc(sizeof(struct kretprobe_instance) + rp->data_size, GFP_KERNEL); if (inst == NULL) { + refcount_set(&rp->rph->ref, i); free_rp_inst(rp); return -ENOMEM; } + inst->rph = rp->rph; INIT_HLIST_NODE(&inst->hlist); hlist_add_head(&inst->hlist, &rp->free_instances); } + refcount_set(&rp->rph->ref, i); rp->nmissed = 0; /* Establish function entry probe point */ @@ -2163,16 +2068,18 @@ void unregister_kretprobes(struct kretprobe **rps, int num) if (num <= 0) return; mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) + for (i = 0; i < num; i++) { if (__unregister_kprobe_top(&rps[i]->kp) < 0) rps[i]->kp.addr = NULL; + rps[i]->rph->rp = NULL; + } mutex_unlock(&kprobe_mutex); synchronize_rcu(); for (i = 0; i < num; i++) { if (rps[i]->kp.addr) { __unregister_kprobe_bottom(&rps[i]->kp); - cleanup_rp_inst(rps[i]); + free_rp_inst(rps[i]); } } } @@ -2535,11 +2442,8 @@ static int __init init_kprobes(void) /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + for (i = 0; i < KPROBE_TABLE_SIZE; i++) INIT_HLIST_HEAD(&kprobe_table[i]); - INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); - } err = populate_kprobe_blacklist(__start_kprobe_blacklist, __stop_kprobe_blacklist); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aefb6065b508..07baf6f6cecc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1714,7 +1714,8 @@ NOKPROBE_SYMBOL(kprobe_dispatcher); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { - struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); + struct kretprobe *rp = get_kretprobe(ri); + struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); -- cgit From 29f006fdefe6f88abde973a0b0f20d2704e93fd4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:35 +0900 Subject: asm-generic/atomic: Add try_cmpxchg() fallbacks Only x86 provides try_cmpxchg() outside of the atomic_t interfaces, provide generic fallbacks to create this interface from the widely available cmpxchg() function. 
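What the generic interface buys over a bare cmpxchg() loop, as a minimal sketch (hypothetical helper; the semantics match the fallbacks added below, i.e. true on success, and *oldp is updated with the observed value on failure):

	#include <linux/atomic.h>

	static inline unsigned long example_fetch_inc(unsigned long *p)
	{
		unsigned long old = READ_ONCE(*p);

		/*
		 * On failure, try_cmpxchg() refreshes 'old' for us, so the
		 * loop needs no explicit re-read of *p; on x86 the compiler
		 * can also branch on the CAS result flags directly.
		 */
		while (!try_cmpxchg(p, &old, old + 1))
			;

		return old;
	}
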
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Acked-by: Will Deacon Link: https://lore.kernel.org/r/159870621515.1229682.15506193091065001742.stgit@devnote2 --- arch/x86/include/asm/atomic.h | 2 +- arch/x86/include/asm/atomic64_64.h | 2 +- arch/x86/include/asm/cmpxchg.h | 2 +- include/asm-generic/atomic-instrumented.h | 216 ++++++++++++++++++------------ include/linux/atomic-arch-fallback.h | 90 +++++++++++-- include/linux/atomic-fallback.h | 90 +++++++++++-- scripts/atomic/gen-atomic-fallback.sh | 63 ++++++++- scripts/atomic/gen-atomic-instrumented.sh | 29 +++- 8 files changed, 372 insertions(+), 122 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index b6cac6e9bb70..f732741ad7c7 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -199,7 +199,7 @@ static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { - return try_cmpxchg(&v->counter, old, new); + return arch_try_cmpxchg(&v->counter, old, new); } #define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 809bd010a751..7886d0578fc9 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -187,7 +187,7 @@ static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { - return try_cmpxchg(&v->counter, old, new); + return arch_try_cmpxchg(&v->counter, old, new); } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index a8bfac131256..4d4ec5cbdc51 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -221,7 +221,7 @@ extern void __add_wrong_size(void) #define __try_cmpxchg(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) -#define try_cmpxchg(ptr, pold, new) \ +#define arch_try_cmpxchg(ptr, pold, new) \ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) /* diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h index 379986e40159..2cab3fdaae3b 100644 --- a/include/asm-generic/atomic-instrumented.h +++ b/include/asm-generic/atomic-instrumented.h @@ -1642,148 +1642,192 @@ atomic64_dec_if_positive(atomic64_t *v) #endif #if !defined(arch_xchg_relaxed) || defined(arch_xchg) -#define xchg(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_xchg(__ai_ptr, __VA_ARGS__); \ +#define xchg(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_xchg(__ai_ptr, __VA_ARGS__); \ }) #endif #if defined(arch_xchg_acquire) -#define xchg_acquire(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_xchg_acquire(__ai_ptr, __VA_ARGS__); \ +#define xchg_acquire(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_xchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #endif #if defined(arch_xchg_release) -#define xchg_release(ptr, ...) 
\ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_xchg_release(__ai_ptr, __VA_ARGS__); \ +#define xchg_release(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_xchg_release(__ai_ptr, __VA_ARGS__); \ }) #endif #if defined(arch_xchg_relaxed) -#define xchg_relaxed(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_xchg_relaxed(__ai_ptr, __VA_ARGS__); \ +#define xchg_relaxed(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_xchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #endif #if !defined(arch_cmpxchg_relaxed) || defined(arch_cmpxchg) -#define cmpxchg(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_cmpxchg(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #endif #if defined(arch_cmpxchg_acquire) -#define cmpxchg_acquire(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg_acquire(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #endif #if defined(arch_cmpxchg_release) -#define cmpxchg_release(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_cmpxchg_release(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg_release(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_cmpxchg_release(__ai_ptr, __VA_ARGS__); \ }) #endif #if defined(arch_cmpxchg_relaxed) -#define cmpxchg_relaxed(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg_relaxed(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #endif #if !defined(arch_cmpxchg64_relaxed) || defined(arch_cmpxchg64) -#define cmpxchg64(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_cmpxchg64(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg64(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_cmpxchg64(__ai_ptr, __VA_ARGS__); \ }) #endif #if defined(arch_cmpxchg64_acquire) -#define cmpxchg64_acquire(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg64_acquire(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \ }) #endif #if defined(arch_cmpxchg64_release) -#define cmpxchg64_release(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg64_release(ptr, ...) 
\ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \ }) #endif #if defined(arch_cmpxchg64_relaxed) -#define cmpxchg64_relaxed(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg64_relaxed(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \ }) #endif -#define cmpxchg_local(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_cmpxchg_local(__ai_ptr, __VA_ARGS__); \ +#if !defined(arch_try_cmpxchg_relaxed) || defined(arch_try_cmpxchg) +#define try_cmpxchg(ptr, oldp, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + typeof(oldp) __ai_oldp = (oldp); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ + arch_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \ +}) +#endif + +#if defined(arch_try_cmpxchg_acquire) +#define try_cmpxchg_acquire(ptr, oldp, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + typeof(oldp) __ai_oldp = (oldp); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ + arch_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ +}) +#endif + +#if defined(arch_try_cmpxchg_release) +#define try_cmpxchg_release(ptr, oldp, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + typeof(oldp) __ai_oldp = (oldp); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ + arch_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ +}) +#endif + +#if defined(arch_try_cmpxchg_relaxed) +#define try_cmpxchg_relaxed(ptr, oldp, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + typeof(oldp) __ai_oldp = (oldp); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ + arch_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ +}) +#endif + +#define cmpxchg_local(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_cmpxchg_local(__ai_ptr, __VA_ARGS__); \ }) -#define cmpxchg64_local(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg64_local(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \ }) -#define sync_cmpxchg(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ - arch_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \ +#define sync_cmpxchg(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + arch_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) -#define cmpxchg_double(ptr, ...) \ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \ - arch_cmpxchg_double(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg_double(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \ + arch_cmpxchg_double(__ai_ptr, __VA_ARGS__); \ }) -#define cmpxchg_double_local(ptr, ...) 
\ -({ \ - typeof(ptr) __ai_ptr = (ptr); \ - instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \ - arch_cmpxchg_double_local(__ai_ptr, __VA_ARGS__); \ +#define cmpxchg_double_local(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \ + arch_cmpxchg_double_local(__ai_ptr, __VA_ARGS__); \ }) #endif /* _ASM_GENERIC_ATOMIC_INSTRUMENTED_H */ -// 89bf97f3a7509b740845e51ddf31055b48a81f40 +// ff0fe7f81ee97f01f13bb78b0e3ce800bc56d9dd diff --git a/include/linux/atomic-arch-fallback.h b/include/linux/atomic-arch-fallback.h index bcb6aa27cfa6..a3dba31df01e 100644 --- a/include/linux/atomic-arch-fallback.h +++ b/include/linux/atomic-arch-fallback.h @@ -9,9 +9,9 @@ #include #ifndef arch_xchg_relaxed -#define arch_xchg_relaxed arch_xchg -#define arch_xchg_acquire arch_xchg -#define arch_xchg_release arch_xchg +#define arch_xchg_acquire arch_xchg +#define arch_xchg_release arch_xchg +#define arch_xchg_relaxed arch_xchg #else /* arch_xchg_relaxed */ #ifndef arch_xchg_acquire @@ -32,9 +32,9 @@ #endif /* arch_xchg_relaxed */ #ifndef arch_cmpxchg_relaxed -#define arch_cmpxchg_relaxed arch_cmpxchg -#define arch_cmpxchg_acquire arch_cmpxchg -#define arch_cmpxchg_release arch_cmpxchg +#define arch_cmpxchg_acquire arch_cmpxchg +#define arch_cmpxchg_release arch_cmpxchg +#define arch_cmpxchg_relaxed arch_cmpxchg #else /* arch_cmpxchg_relaxed */ #ifndef arch_cmpxchg_acquire @@ -55,9 +55,9 @@ #endif /* arch_cmpxchg_relaxed */ #ifndef arch_cmpxchg64_relaxed -#define arch_cmpxchg64_relaxed arch_cmpxchg64 -#define arch_cmpxchg64_acquire arch_cmpxchg64 -#define arch_cmpxchg64_release arch_cmpxchg64 +#define arch_cmpxchg64_acquire arch_cmpxchg64 +#define arch_cmpxchg64_release arch_cmpxchg64 +#define arch_cmpxchg64_relaxed arch_cmpxchg64 #else /* arch_cmpxchg64_relaxed */ #ifndef arch_cmpxchg64_acquire @@ -77,6 +77,76 @@ #endif /* arch_cmpxchg64_relaxed */ +#ifndef arch_try_cmpxchg_relaxed +#ifdef arch_try_cmpxchg +#define arch_try_cmpxchg_acquire arch_try_cmpxchg +#define arch_try_cmpxchg_release arch_try_cmpxchg +#define arch_try_cmpxchg_relaxed arch_try_cmpxchg +#endif /* arch_try_cmpxchg */ + +#ifndef arch_try_cmpxchg +#define arch_try_cmpxchg(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg */ + +#ifndef arch_try_cmpxchg_acquire +#define arch_try_cmpxchg_acquire(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg_acquire((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg_acquire */ + +#ifndef arch_try_cmpxchg_release +#define arch_try_cmpxchg_release(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg_release((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg_release */ + +#ifndef arch_try_cmpxchg_relaxed +#define arch_try_cmpxchg_relaxed(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg_relaxed((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg_relaxed */ + +#else /* arch_try_cmpxchg_relaxed */ + +#ifndef arch_try_cmpxchg_acquire +#define 
arch_try_cmpxchg_acquire(...) \ + __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__) +#endif + +#ifndef arch_try_cmpxchg_release +#define arch_try_cmpxchg_release(...) \ + __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__) +#endif + +#ifndef arch_try_cmpxchg +#define arch_try_cmpxchg(...) \ + __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__) +#endif + +#endif /* arch_try_cmpxchg_relaxed */ + #ifndef arch_atomic_read_acquire static __always_inline int arch_atomic_read_acquire(const atomic_t *v) @@ -2288,4 +2358,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v) #endif #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 90cd26cfd69d2250303d654955a0cc12620fb91b +// cca554917d7ea73d5e3e7397dd70c484cad9b2c4 diff --git a/include/linux/atomic-fallback.h b/include/linux/atomic-fallback.h index fd525c71d676..2a3f55d98be9 100644 --- a/include/linux/atomic-fallback.h +++ b/include/linux/atomic-fallback.h @@ -9,9 +9,9 @@ #include #ifndef xchg_relaxed -#define xchg_relaxed xchg -#define xchg_acquire xchg -#define xchg_release xchg +#define xchg_acquire xchg +#define xchg_release xchg +#define xchg_relaxed xchg #else /* xchg_relaxed */ #ifndef xchg_acquire @@ -32,9 +32,9 @@ #endif /* xchg_relaxed */ #ifndef cmpxchg_relaxed -#define cmpxchg_relaxed cmpxchg -#define cmpxchg_acquire cmpxchg -#define cmpxchg_release cmpxchg +#define cmpxchg_acquire cmpxchg +#define cmpxchg_release cmpxchg +#define cmpxchg_relaxed cmpxchg #else /* cmpxchg_relaxed */ #ifndef cmpxchg_acquire @@ -55,9 +55,9 @@ #endif /* cmpxchg_relaxed */ #ifndef cmpxchg64_relaxed -#define cmpxchg64_relaxed cmpxchg64 -#define cmpxchg64_acquire cmpxchg64 -#define cmpxchg64_release cmpxchg64 +#define cmpxchg64_acquire cmpxchg64 +#define cmpxchg64_release cmpxchg64 +#define cmpxchg64_relaxed cmpxchg64 #else /* cmpxchg64_relaxed */ #ifndef cmpxchg64_acquire @@ -77,6 +77,76 @@ #endif /* cmpxchg64_relaxed */ +#ifndef try_cmpxchg_relaxed +#ifdef try_cmpxchg +#define try_cmpxchg_acquire try_cmpxchg +#define try_cmpxchg_release try_cmpxchg +#define try_cmpxchg_relaxed try_cmpxchg +#endif /* try_cmpxchg */ + +#ifndef try_cmpxchg +#define try_cmpxchg(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = cmpxchg((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* try_cmpxchg */ + +#ifndef try_cmpxchg_acquire +#define try_cmpxchg_acquire(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = cmpxchg_acquire((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* try_cmpxchg_acquire */ + +#ifndef try_cmpxchg_release +#define try_cmpxchg_release(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = cmpxchg_release((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* try_cmpxchg_release */ + +#ifndef try_cmpxchg_relaxed +#define try_cmpxchg_relaxed(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = cmpxchg_relaxed((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* try_cmpxchg_relaxed */ + +#else /* try_cmpxchg_relaxed */ + +#ifndef try_cmpxchg_acquire +#define try_cmpxchg_acquire(...) \ + __atomic_op_acquire(try_cmpxchg, __VA_ARGS__) +#endif + +#ifndef try_cmpxchg_release +#define try_cmpxchg_release(...) 
\ + __atomic_op_release(try_cmpxchg, __VA_ARGS__) +#endif + +#ifndef try_cmpxchg +#define try_cmpxchg(...) \ + __atomic_op_fence(try_cmpxchg, __VA_ARGS__) +#endif + +#endif /* try_cmpxchg_relaxed */ + #define arch_atomic_read atomic_read #define arch_atomic_read_acquire atomic_read_acquire @@ -2522,4 +2592,4 @@ atomic64_dec_if_positive(atomic64_t *v) #endif #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 9d95b56f98d82a2a26c7b79ccdd0c47572d50a6f +// d78e6c293c661c15188f0ec05bce45188c8d5892 diff --git a/scripts/atomic/gen-atomic-fallback.sh b/scripts/atomic/gen-atomic-fallback.sh index 693dfa1de430..317a6cec76e1 100755 --- a/scripts/atomic/gen-atomic-fallback.sh +++ b/scripts/atomic/gen-atomic-fallback.sh @@ -144,15 +144,11 @@ gen_proto_order_variants() printf "#endif /* ${basename}_relaxed */\n\n" } -gen_xchg_fallbacks() +gen_order_fallbacks() { local xchg="$1"; shift + cat < Date: Sat, 29 Aug 2020 22:03:46 +0900 Subject: freelist: Implement lockless freelist A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly speedy under low contention. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870622579.1229682.16729440870040944993.stgit@devnote2 --- include/linux/freelist.h | 129 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 include/linux/freelist.h (limited to 'include/linux') diff --git a/include/linux/freelist.h b/include/linux/freelist.h new file mode 100644 index 000000000000..fc1842b96469 --- /dev/null +++ b/include/linux/freelist.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +#ifndef FREELIST_H +#define FREELIST_H + +#include + +/* + * Copyright: cameron@moodycamel.com + * + * A simple CAS-based lock-free free list. Not the fastest thing in the world + * under heavy contention, but simple and correct (assuming nodes are never + * freed until after the free list is destroyed), and fairly speedy under low + * contention. + * + * Adapted from: https://moodycamel.com/blog/2014/solving-the-aba-problem-for-lock-free-free-lists + */ + +struct freelist_node { + atomic_t refs; + struct freelist_node *next; +}; + +struct freelist_head { + struct freelist_node *head; +}; + +#define REFS_ON_FREELIST 0x80000000 +#define REFS_MASK 0x7FFFFFFF + +static inline void __freelist_add(struct freelist_node *node, struct freelist_head *list) +{ + /* + * Since the refcount is zero, and nobody can increase it once it's + * zero (except us, and we run only one copy of this method per node at + * a time, i.e. the single thread case), then we know we can safely + * change the next pointer of the node; however, once the refcount is + * back above zero, then other threads could increase it (happens under + * heavy contention, when the refcount goes to zero in between a load + * and a refcount increment of a node in try_get, then back up to + * something non-zero, then the refcount increment is done by the other + * thread) -- so if the CAS to add the node to the actual list fails, + * decrese the refcount and leave the add operation to the next thread + * who puts the refcount back to zero (which could be us, hence the + * loop). 
+ */ + struct freelist_node *head = READ_ONCE(list->head); + + for (;;) { + WRITE_ONCE(node->next, head); + atomic_set_release(&node->refs, 1); + + if (!try_cmpxchg_release(&list->head, &head, node)) { + /* + * Hmm, the add failed, but we can only try again when + * the refcount goes back to zero. + */ + if (atomic_fetch_add_release(REFS_ON_FREELIST - 1, &node->refs) == 1) + continue; + } + return; + } +} + +static inline void freelist_add(struct freelist_node *node, struct freelist_head *list) +{ + /* + * We know that the should-be-on-freelist bit is 0 at this point, so + * it's safe to set it using a fetch_add. + */ + if (!atomic_fetch_add_release(REFS_ON_FREELIST, &node->refs)) { + /* + * Oh look! We were the last ones referencing this node, and we + * know we want to add it to the free list, so let's do it! + */ + __freelist_add(node, list); + } +} + +static inline struct freelist_node *freelist_try_get(struct freelist_head *list) +{ + struct freelist_node *prev, *next, *head = smp_load_acquire(&list->head); + unsigned int refs; + + while (head) { + prev = head; + refs = atomic_read(&head->refs); + if ((refs & REFS_MASK) == 0 || + !atomic_try_cmpxchg_acquire(&head->refs, &refs, refs+1)) { + head = smp_load_acquire(&list->head); + continue; + } + + /* + * Good, reference count has been incremented (it wasn't at + * zero), which means we can read the next and not worry about + * it changing between now and the time we do the CAS. + */ + next = READ_ONCE(head->next); + if (try_cmpxchg_acquire(&list->head, &head, next)) { + /* + * Yay, got the node. This means it was on the list, + * which means should-be-on-freelist must be false no + * matter the refcount (because nobody else knows it's + * been taken off yet, it can't have been put back on). + */ + WARN_ON_ONCE(atomic_read(&head->refs) & REFS_ON_FREELIST); + + /* + * Decrease refcount twice, once for our ref, and once + * for the list's ref. + */ + atomic_fetch_add(-2, &head->refs); + + return head; + } + + /* + * OK, the head must have changed on us, but we still need to decrement + * the refcount we increased. + */ + refs = atomic_fetch_add(-1, &prev->refs); + if (refs == REFS_ON_FREELIST + 1) + __freelist_add(prev, list); + } + + return NULL; +} + +#endif /* FREELIST_H */ -- cgit From 6e426e0fcd20ce144bb93e00b70df51e9f2e08c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:56 +0900 Subject: kprobes: Replace rp->free_instance with freelist Gets rid of rp->lock, and as a result kretprobes are now fully lockless. 
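As a quick illustration of the API this conversion builds on (a minimal sketch, not part of either patch: "my_obj", "my_pool", "my_pool_fill" and "my_pool_get" are invented names, while freelist_add(), freelist_try_get() and the embedded freelist_node are the real interface from <linux/freelist.h>):

#include <linux/freelist.h>
#include <linux/slab.h>
#include <linux/kernel.h>	/* container_of() */

struct my_obj {
	struct freelist_node fnode;	/* must outlive the free list itself */
	int payload;
};

static struct freelist_head my_pool;	/* zero-initialized: .head == NULL */

/* Pre-populate the pool; kzalloc'ed nodes start with refs == 0 as required. */
static int my_pool_fill(unsigned int n)
{
	while (n--) {
		struct my_obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

		if (!o)
			return -ENOMEM;
		freelist_add(&o->fnode, &my_pool);	/* lock-free push */
	}
	return 0;
}

/* Lock-free pop; returns NULL when the pool is (momentarily) empty. */
static struct my_obj *my_pool_get(void)
{
	struct freelist_node *fn = freelist_try_get(&my_pool);

	return fn ? container_of(fn, struct my_obj, fnode) : NULL;
}

This is exactly the shape of use pre_handler_kretprobe() and register_kretprobe() adopt in the diff below.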
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870623583.1229682.17472357584134058687.stgit@devnote2 --- include/linux/kprobes.h | 8 +++---- kernel/kprobes.c | 56 +++++++++++++++++++++---------------------------- 2 files changed, 28 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 00cf4421efd5..b7824e3f1ef5 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #ifdef CONFIG_KPROBES @@ -157,17 +158,16 @@ struct kretprobe { int maxactive; int nmissed; size_t data_size; - struct hlist_head free_instances; + struct freelist_head freelist; struct kretprobe_holder *rph; - raw_spinlock_t lock; }; struct kretprobe_instance { union { - struct llist_node llist; - struct hlist_node hlist; + struct freelist_node freelist; struct rcu_head rcu; }; + struct llist_node llist; struct kretprobe_holder *rph; kprobe_opcode_t *ret_addr; void *fp; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 850ee36a4051..30b8fe7d571d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1228,11 +1228,8 @@ static void recycle_rp_inst(struct kretprobe_instance *ri) { struct kretprobe *rp = get_kretprobe(ri); - INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { - raw_spin_lock(&rp->lock); - hlist_add_head(&ri->hlist, &rp->free_instances); - raw_spin_unlock(&rp->lock); + freelist_add(&ri->freelist, &rp->freelist); } else call_rcu(&ri->rcu, free_rp_inst_rcu); } @@ -1290,11 +1287,14 @@ NOKPROBE_SYMBOL(kprobe_flush_task); static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; - struct hlist_node *next; + struct freelist_node *node; int count = 0; - hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { - hlist_del(&ri->hlist); + node = rp->freelist.head; + while (node) { + ri = container_of(node, struct kretprobe_instance, freelist); + node = node->next; + kfree(ri); count++; } @@ -1925,32 +1925,26 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long flags = 0; struct kretprobe_instance *ri; + struct freelist_node *fn; - /* TODO: consider to only swap the RA after the last pre_handler fired */ - raw_spin_lock_irqsave(&rp->lock, flags); - if (!hlist_empty(&rp->free_instances)) { - ri = hlist_entry(rp->free_instances.first, - struct kretprobe_instance, hlist); - hlist_del(&ri->hlist); - raw_spin_unlock_irqrestore(&rp->lock, flags); - - if (rp->entry_handler && rp->entry_handler(ri, regs)) { - raw_spin_lock_irqsave(&rp->lock, flags); - hlist_add_head(&ri->hlist, &rp->free_instances); - raw_spin_unlock_irqrestore(&rp->lock, flags); - return 0; - } - - arch_prepare_kretprobe(ri, regs); + fn = freelist_try_get(&rp->freelist); + if (!fn) { + rp->nmissed++; + return 0; + } - __llist_add(&ri->llist, ¤t->kretprobe_instances); + ri = container_of(fn, struct kretprobe_instance, freelist); - } else { - rp->nmissed++; - raw_spin_unlock_irqrestore(&rp->lock, flags); + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + freelist_add(&ri->freelist, &rp->freelist); + return 0; } + + arch_prepare_kretprobe(ri, regs); + + __llist_add(&ri->llist, ¤t->kretprobe_instances); + return 0; } NOKPROBE_SYMBOL(pre_handler_kretprobe); @@ -2007,8 +2001,7 @@ int register_kretprobe(struct kretprobe *rp) rp->maxactive = 
num_possible_cpus(); #endif } - raw_spin_lock_init(&rp->lock); - INIT_HLIST_HEAD(&rp->free_instances); + rp->freelist.head = NULL; rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL); if (!rp->rph) return -ENOMEM; @@ -2023,8 +2016,7 @@ int register_kretprobe(struct kretprobe *rp) return -ENOMEM; } inst->rph = rp->rph; - INIT_HLIST_NODE(&inst->hlist); - hlist_add_head(&inst->hlist, &rp->free_instances); + freelist_add(&inst->freelist, &rp->freelist); } refcount_set(&rp->rph->ref, i); -- cgit From 8bca49e43fb5994bd2a03f3f114cf89bc6f87a14 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 22 Apr 2020 15:51:55 +0300 Subject: drm: shmobile: Reduce include dependencies This file doesn't need anything provided by . All it needs are some types, which are provided by . Drop unneeded completely. Signed-off-by: Andy Shevchenko Reviewed-by: Laurent Pinchart Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20200422125201.37618-1-andriy.shevchenko@linux.intel.com --- include/linux/platform_data/shmob_drm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/shmob_drm.h b/include/linux/platform_data/shmob_drm.h index fe815d7d9f58..d661399b217d 100644 --- a/include/linux/platform_data/shmob_drm.h +++ b/include/linux/platform_data/shmob_drm.h @@ -10,8 +10,6 @@ #ifndef __SHMOB_DRM_H__ #define __SHMOB_DRM_H__ -#include - #include enum shmob_drm_clk_source { -- cgit From ac80cd17a615e472e3dcff0a5446a58540e64e6d Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Wed, 14 Oct 2020 09:16:01 -0700 Subject: dma-buf: Clarify that dma-buf sg lists are page aligned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dma-buf API have been used under the assumption that the sg lists returned from dma_buf_map_attachment() are fully page aligned. Lots of stuff can break otherwise all over the place. Clarify this in the documentation and add a check when DMA API debug is enabled. Signed-off-by: Jianxin Xiong Reviewed-by: Christian König Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/1602692161-107096-1-git-send-email-jianxin.xiong@intel.com --- drivers/dma-buf/dma-buf.c | 21 +++++++++++++++++++++ include/linux/dma-buf.h | 3 ++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index a6ba4d598f0e..9db211a2b6cb 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -849,6 +849,9 @@ EXPORT_SYMBOL_GPL(dma_buf_unpin); * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR * on error. May return -EINTR if it is interrupted by a signal. * + * On success, the DMA addresses and lengths in the returned scatterlist are + * PAGE_SIZE aligned. + * * A mapping must be unmapped by using dma_buf_unmap_attachment(). 
Note that * the underlying backing storage is pinned for as long as a mapping exists, * therefore users/importers should not hold onto a mapping for undue amounts of @@ -902,6 +905,24 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, attach->dir = direction; } +#ifdef CONFIG_DMA_API_DEBUG + { + struct scatterlist *sg; + u64 addr; + int len; + int i; + + for_each_sgtable_dma_sg(sg_table, sg, i) { + addr = sg_dma_address(sg); + len = sg_dma_len(sg); + if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(len)) { + pr_debug("%s: addr %llx or len %x is not page aligned!\n", + __func__, addr, len); + } + } + } +#endif /* CONFIG_DMA_API_DEBUG */ + return sg_table; } EXPORT_SYMBOL_GPL(dma_buf_map_attachment); diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index cf77cc15f4ba..03875eaed51a 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -146,7 +146,8 @@ struct dma_buf_ops { * * A &sg_table scatter list of or the backing storage of the DMA buffer, * already mapped into the device address space of the &device attached - * with the provided &dma_buf_attachment. + * with the provided &dma_buf_attachment. The addresses and lengths in + * the scatter list are PAGE_SIZE aligned. * * On failure, returns a negative error value wrapped into a pointer. * May also return -EINTR when a signal was received while being -- cgit From 44cdc1d952e3f7aa9944c1bbf38fc23f49885017 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Sep 2020 11:18:30 -0400 Subject: convert ->f_ep_links/->fllink to hlist we don't care about the order of elements there Signed-off-by: Al Viro --- fs/eventpoll.c | 18 +++++++++--------- include/linux/eventpoll.h | 4 ++-- include/linux/fs.h | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 680f4ada53d4..c40576e62728 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -160,7 +160,7 @@ struct epitem { struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ - struct list_head fllink; + struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; @@ -642,7 +642,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); - list_del_rcu(&epi->fllink); + hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); rb_erase_cached(&epi->rbn, &ep->rbr); @@ -835,7 +835,8 @@ static const struct file_operations eventpoll_fops = { void eventpoll_release_file(struct file *file) { struct eventpoll *ep; - struct epitem *epi, *next; + struct epitem *epi; + struct hlist_node *next; /* * We don't want to get "file->f_lock" because it is not @@ -851,7 +852,7 @@ void eventpoll_release_file(struct file *file) * Besides, ep_remove() acquires the lock, so we can't hold it here. 
*/ mutex_lock(&epmutex); - list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { + hlist_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { ep = epi->ep; mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); @@ -1257,11 +1258,11 @@ static int reverse_path_check_proc(struct file *file, int depth) /* CTL_DEL can remove links here, but that can't increase our count */ rcu_read_lock(); - list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { + hlist_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { struct file *recepient = epi->ep->file; if (WARN_ON(!is_file_epoll(recepient))) continue; - if (list_empty(&recepient->f_ep_links)) + if (hlist_empty(&recepient->f_ep_links)) error = path_count_inc(depth); else error = reverse_path_check_proc(recepient, depth + 1); @@ -1361,7 +1362,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); - INIT_LIST_HEAD(&epi->fllink); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; @@ -1373,7 +1373,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ spin_lock(&tfile->f_lock); - list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); + hlist_add_head_rcu(&epi->fllink, &tfile->f_ep_links); spin_unlock(&tfile->f_lock); /* @@ -1999,7 +1999,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { - if (!list_empty(&f.file->f_ep_links) || + if (!hlist_empty(&f.file->f_ep_links) || ep->gen == loop_check_gen || is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 8f000fada5a4..4e215ccfa792 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -25,7 +25,7 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) { - INIT_LIST_HEAD(&file->f_ep_links); + INIT_HLIST_HEAD(&file->f_ep_links); INIT_LIST_HEAD(&file->f_tfile_llink); } @@ -50,7 +50,7 @@ static inline void eventpoll_release(struct file *file) * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ - if (likely(list_empty(&file->f_ep_links))) + if (likely(hlist_empty(&file->f_ep_links))) return; /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 0bd126418bb6..b484ba0e474a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -946,7 +946,7 @@ struct file { #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ - struct list_head f_ep_links; + struct hlist_head f_ep_links; struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; -- cgit From 319c15174757aaedacc89a6e55c965416f130e64 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 Oct 2020 20:45:51 -0400 Subject: epoll: take epitem list out of struct file Move the head of epitem list out of struct file; for epoll ones it's moved into struct eventpoll (->refs there), for non-epoll - into the new object (struct epitem_head). In place of ->f_ep_links we leave a pointer to the list head (->f_ep). ->f_ep is protected by ->f_lock and it's zeroed as soon as the list of epitems becomes empty (that can happen only in ep_remove() by now). 
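In shape, the new per-file bookkeeping looks like this (condensed from the diff below; the struct and field names are the real ones, the comments are paraphrase):

#include <linux/list.h>		/* hlist_head */

struct epitems_head {
	struct hlist_head epitems;	/* every epitem watching one file */
	struct epitems_head *next;	/* reverse path check list linkage */
};

struct file then carries only a single pointer, struct hlist_head *f_ep, aimed at either epitems_head.epitems (plain files) or eventpoll.refs (epoll files), NULL once the set of epitems empties, protected by ->f_lock.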
The list of files for reverse path check is *not* going through struct file now - it's a single-linked list going through epitem_head instances. It's terminated by ERR_PTR(-1) (== EP_UNACTIVE_POINTER), so the elements of list can be distinguished by head->next != NULL. epitem_head instances are allocated at ep_insert() time (by attach_epitem()) and freed either by ep_remove() (if it empties the set of epitems *and* epitem_head does not belong to the reverse path check list) or by clear_tfile_check_list() when the list is emptied (if the set of epitems is empty by that point). Allocations are done from a separate slab - minimal kmalloc() size is too large on some architectures. As the result, we trim struct file _and_ get rid of the games with temporary file references. Locking and barriers are interesting (aren't they always); see unlist_file() and ep_remove() for details. The non-obvious part is that ep_remove() needs to decide if it will be the one to free the damn thing *before* actually storing NULL to head->epitems.first - that's what smp_load_acquire is for in there. unlist_file() lockless path is safe, since we hit it only if we observe NULL in head->epitems.first and whoever had done that store is guaranteed to have observed non-NULL in head->next. IOW, their last access had been the store of NULL into ->epitems.first and we can safely free the sucker. OTOH, we are under rcu_read_lock() and both epitem and epitem->file have their freeing RCU-delayed. So if we see non-NULL ->epitems.first, we can grab ->f_lock (all epitems in there share the same struct file) and safely recheck the emptiness of ->epitems; again, ->next is still non-NULL, so ep_remove() couldn't have freed head yet. ->f_lock serializes us wrt ep_remove(); the rest is trivial. Note that once head->epitems becomes NULL, nothing can get inserted into it - the only remaining reference to head after that point is from the reverse path check list. Signed-off-by: Al Viro --- fs/eventpoll.c | 168 ++++++++++++++++++++++++++++++++++------------ fs/file_table.c | 1 - include/linux/eventpoll.h | 11 +-- include/linux/fs.h | 5 +- 4 files changed, 129 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8f68095c5ce9..d89aead356df 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -215,6 +215,7 @@ struct eventpoll { /* used to optimize loop detection check */ u64 gen; + struct hlist_head refs; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ @@ -259,7 +260,45 @@ static struct kmem_cache *pwq_cache __read_mostly; * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epmutex. 
*/ -static LIST_HEAD(tfile_check_list); +struct epitems_head { + struct hlist_head epitems; + struct epitems_head *next; +}; +static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; + +static struct kmem_cache *ephead_cache __read_mostly; + +static inline void free_ephead(struct epitems_head *head) +{ + if (head) + kmem_cache_free(ephead_cache, head); +} + +static void list_file(struct file *file) +{ + struct epitems_head *head; + + head = container_of(file->f_ep, struct epitems_head, epitems); + if (!head->next) { + head->next = tfile_check_list; + tfile_check_list = head; + } +} + +static void unlist_file(struct epitems_head *head) +{ + struct epitems_head *to_free = head; + struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); + if (p) { + struct epitem *epi= container_of(p, struct epitem, fllink); + spin_lock(&epi->ffd.file->f_lock); + if (!hlist_empty(&head->epitems)) + to_free = NULL; + head->next = NULL; + spin_unlock(&epi->ffd.file->f_lock); + } + free_ephead(to_free); +} #ifdef CONFIG_SYSCTL @@ -632,6 +671,8 @@ static void epi_rcu_free(struct rcu_head *head) static int ep_remove(struct eventpoll *ep, struct epitem *epi) { struct file *file = epi->ffd.file; + struct epitems_head *to_free; + struct hlist_head *head; lockdep_assert_irqs_enabled(); @@ -642,8 +683,20 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); + to_free = NULL; + head = file->f_ep; + if (head->first == &epi->fllink && !epi->fllink.next) { + file->f_ep = NULL; + if (!is_file_epoll(file)) { + struct epitems_head *v; + v = container_of(head, struct epitems_head, epitems); + if (!smp_load_acquire(&v->next)) + to_free = v; + } + } hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); + free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); @@ -852,7 +905,11 @@ void eventpoll_release_file(struct file *file) * Besides, ep_remove() acquires the lock, so we can't hold it here. */ mutex_lock(&epmutex); - hlist_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { + if (unlikely(!file->f_ep)) { + mutex_unlock(&epmutex); + return; + } + hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) { ep = epi->ep; mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); @@ -1248,7 +1305,7 @@ static void path_count_init(void) path_count[i] = 0; } -static int reverse_path_check_proc(struct file *file, int depth) +static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; @@ -1257,14 +1314,12 @@ static int reverse_path_check_proc(struct file *file, int depth) return -1; /* CTL_DEL can remove links here, but that can't increase our count */ - hlist_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { - struct file *recepient = epi->ep->file; - if (WARN_ON(!is_file_epoll(recepient))) - continue; - if (hlist_empty(&recepient->f_ep_links)) + hlist_for_each_entry_rcu(epi, refs, fllink) { + struct hlist_head *refs = &epi->ep->refs; + if (hlist_empty(refs)) error = path_count_inc(depth); else - error = reverse_path_check_proc(recepient, depth + 1); + error = reverse_path_check_proc(refs, depth + 1); if (error != 0) break; } @@ -1272,7 +1327,7 @@ static int reverse_path_check_proc(struct file *file, int depth) } /** - * reverse_path_check - The tfile_check_list is list of file *, which have + * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. 
We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up @@ -1283,19 +1338,18 @@ static int reverse_path_check_proc(struct file *file, int depth) */ static int reverse_path_check(void) { - int error = 0; - struct file *current_file; + struct epitems_head *p; - /* let's call this for all tfiles */ - list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { + int error; path_count_init(); rcu_read_lock(); - error = reverse_path_check_proc(current_file, 0); + error = reverse_path_check_proc(&p->epitems, 0); rcu_read_unlock(); if (error) - break; + return error; } - return error; + return 0; } static int ep_create_wakeup_source(struct epitem *epi) @@ -1336,6 +1390,39 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi) wakeup_source_unregister(ws); } +static int attach_epitem(struct file *file, struct epitem *epi) +{ + struct epitems_head *to_free = NULL; + struct hlist_head *head = NULL; + struct eventpoll *ep = NULL; + + if (is_file_epoll(file)) + ep = file->private_data; + + if (ep) { + head = &ep->refs; + } else if (!READ_ONCE(file->f_ep)) { +allocate: + to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); + if (!to_free) + return -ENOMEM; + head = &to_free->epitems; + } + spin_lock(&file->f_lock); + if (!file->f_ep) { + if (unlikely(!head)) { + spin_unlock(&file->f_lock); + goto allocate; + } + file->f_ep = head; + to_free = NULL; + } + hlist_add_head_rcu(&epi->fllink, file->f_ep); + spin_unlock(&file->f_lock); + free_ephead(to_free); + return 0; +} + /* * Must be called with "mtx" held. */ @@ -1367,19 +1454,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, epi->event = *event; epi->next = EP_UNACTIVE_PTR; - atomic_long_inc(&ep->user->epoll_watches); - if (tep) mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ - spin_lock(&tfile->f_lock); - hlist_add_head_rcu(&epi->fllink, &tfile->f_ep_links); - spin_unlock(&tfile->f_lock); - if (full_check && !tep) { - get_file(tfile); - list_add(&tfile->f_tfile_llink, &tfile_check_list); + if (unlikely(attach_epitem(tfile, epi) < 0)) { + kmem_cache_free(epi_cache, epi); + if (tep) + mutex_unlock(&tep->mtx); + return -ENOMEM; } + if (full_check && !tep) + list_file(tfile); + + atomic_long_inc(&ep->user->epoll_watches); + /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. @@ -1813,11 +1902,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) * not already there, and calling reverse_path_check() * during ep_insert(). 
*/ - if (list_empty(&epi->ffd.file->f_tfile_llink)) { - if (get_file_rcu(epi->ffd.file)) - list_add(&epi->ffd.file->f_tfile_llink, - &tfile_check_list); - } + list_file(epi->ffd.file); } } mutex_unlock(&ep->mtx); @@ -1844,16 +1929,13 @@ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) static void clear_tfile_check_list(void) { - struct file *file; - - /* first clear the tfile_check_list */ - while (!list_empty(&tfile_check_list)) { - file = list_first_entry(&tfile_check_list, struct file, - f_tfile_llink); - list_del_init(&file->f_tfile_llink); - fput(file); + rcu_read_lock(); + while (tfile_check_list != EP_UNACTIVE_PTR) { + struct epitems_head *head = tfile_check_list; + tfile_check_list = head->next; + unlist_file(head); } - INIT_LIST_HEAD(&tfile_check_list); + rcu_read_unlock(); } /* @@ -2003,9 +2085,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { - if (!hlist_empty(&f.file->f_ep_links) || - ep->gen == loop_check_gen || - is_file_epoll(tf.file)) { + if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || + is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epmutex, 0, nonblock); if (error) @@ -2216,6 +2297,9 @@ static int __init eventpoll_init(void) pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); + ephead_cache = kmem_cache_create("ep_head", + sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); + return 0; } fs_initcall(eventpoll_init); diff --git a/fs/file_table.c b/fs/file_table.c index 709ada3151da..45437f8e1003 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -113,7 +113,6 @@ static struct file *__alloc_file(int flags, const struct cred *cred) rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); - eventpoll_init_file(f); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 4e215ccfa792..0350393465d4 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -22,14 +22,6 @@ struct file; struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif -/* Used to initialize the epoll bits inside the "struct file" */ -static inline void eventpoll_init_file(struct file *file) -{ - INIT_HLIST_HEAD(&file->f_ep_links); - INIT_LIST_HEAD(&file->f_tfile_llink); -} - - /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); @@ -50,7 +42,7 @@ static inline void eventpoll_release(struct file *file) * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ - if (likely(hlist_empty(&file->f_ep_links))) + if (likely(!file->f_ep)) return; /* @@ -72,7 +64,6 @@ static inline int ep_op_has_event(int op) #else -static inline void eventpoll_init_file(struct file *file) {} static inline void eventpoll_release(struct file *file) {} #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index b484ba0e474a..993c540e3b77 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -923,7 +923,7 @@ struct file { const struct file_operations *f_op; /* - * Protects f_ep_links, f_flags. + * Protects f_ep, f_flags. * Must not be taken from IRQ context. 
*/ spinlock_t f_lock; @@ -946,8 +946,7 @@ struct file { #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ - struct hlist_head f_ep_links; - struct list_head f_tfile_llink; + struct hlist_head *f_ep; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; -- cgit From 96ffcdf239de6f9970178bb7d643e16fd9e68ab9 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Tue, 20 Oct 2020 15:12:12 +0900 Subject: PM / devfreq: Remove redundant governor_name from struct devfreq The devfreq structure instance contains the governor_name and a governor instance. When need to show the governor name, better to use the name of devfreq_governor structure. So, governor_name variable in struct devfreq is a redundant and unneeded variable. Remove the redundant governor_name of struct devfreq and then use the name of devfreq_governor instance. Signed-off-by: Chanwoo Choi --- drivers/devfreq/devfreq.c | 18 +++++++----------- drivers/devfreq/governor.h | 2 ++ include/linux/devfreq.h | 4 ---- 3 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 02cfc6552913..6aa10de792b3 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -811,7 +811,6 @@ struct devfreq *devfreq_add_device(struct device *dev, devfreq->dev.release = devfreq_dev_release; INIT_LIST_HEAD(&devfreq->node); devfreq->profile = profile; - strscpy(devfreq->governor_name, governor_name, DEVFREQ_NAME_LEN); devfreq->previous_freq = profile->initial_freq; devfreq->last_status.current_frequency = profile->initial_freq; devfreq->data = data; @@ -907,7 +906,7 @@ struct devfreq *devfreq_add_device(struct device *dev, mutex_lock(&devfreq_list_lock); - governor = try_then_request_governor(devfreq->governor_name); + governor = try_then_request_governor(governor_name); if (IS_ERR(governor)) { dev_err(dev, "%s: Unable to find governor for the device\n", __func__); @@ -1249,7 +1248,7 @@ int devfreq_add_governor(struct devfreq_governor *governor) int ret = 0; struct device *dev = devfreq->dev.parent; - if (!strncmp(devfreq->governor_name, governor->name, + if (!strncmp(devfreq->governor->name, governor->name, DEVFREQ_NAME_LEN)) { /* The following should never occur */ if (devfreq->governor) { @@ -1311,7 +1310,7 @@ int devfreq_remove_governor(struct devfreq_governor *governor) int ret; struct device *dev = devfreq->dev.parent; - if (!strncmp(devfreq->governor_name, governor->name, + if (!strncmp(devfreq->governor->name, governor->name, DEVFREQ_NAME_LEN)) { /* we should have a devfreq governor! 
*/ if (!devfreq->governor) { @@ -1406,7 +1405,6 @@ static ssize_t governor_store(struct device *dev, struct device_attribute *attr, */ prev_governor = df->governor; df->governor = governor; - strncpy(df->governor_name, governor->name, DEVFREQ_NAME_LEN); ret = df->governor->event_handler(df, DEVFREQ_GOV_START, NULL); if (ret) { dev_warn(dev, "%s: Governor %s not started(%d)\n", @@ -1414,13 +1412,11 @@ static ssize_t governor_store(struct device *dev, struct device_attribute *attr, /* Restore previous governor */ df->governor = prev_governor; - strncpy(df->governor_name, prev_governor->name, - DEVFREQ_NAME_LEN); ret = df->governor->event_handler(df, DEVFREQ_GOV_START, NULL); if (ret) { dev_err(dev, "%s: reverting to Governor %s failed (%d)\n", - __func__, df->governor_name, ret); + __func__, prev_governor->name, ret); df->governor = NULL; goto out; } @@ -1459,7 +1455,7 @@ static ssize_t available_governors_show(struct device *d, */ if (IS_SUPPORTED_FLAG(df->governor->flags, IMMUTABLE)) { count = scnprintf(&buf[count], DEVFREQ_NAME_LEN, - "%s ", df->governor_name); + "%s ", df->governor->name); /* * The devfreq device shows the registered governor except for * immutable governors such as passive governor . @@ -1902,7 +1898,7 @@ static int devfreq_summary_show(struct seq_file *s, void *data) list_for_each_entry_reverse(devfreq, &devfreq_list, node) { #if IS_ENABLED(CONFIG_DEVFREQ_GOV_PASSIVE) - if (!strncmp(devfreq->governor_name, DEVFREQ_GOV_PASSIVE, + if (!strncmp(devfreq->governor->name, DEVFREQ_GOV_PASSIVE, DEVFREQ_NAME_LEN)) { struct devfreq_passive_data *data = devfreq->data; @@ -1928,7 +1924,7 @@ static int devfreq_summary_show(struct seq_file *s, void *data) "%-30s %-30s %-15s %-10s %10d %12ld %12ld %12ld\n", dev_name(&devfreq->dev), p_devfreq ? dev_name(&p_devfreq->dev) : "null", - devfreq->governor_name, + devfreq->governor->name, polling_ms ? timer_name[timer] : "null", polling_ms, cur_freq, diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h index df413b851bb2..2a52f97b542d 100644 --- a/drivers/devfreq/governor.h +++ b/drivers/devfreq/governor.h @@ -13,6 +13,8 @@ #include +#define DEVFREQ_NAME_LEN 16 + #define to_devfreq(DEV) container_of((DEV), struct devfreq, dev) /* Devfreq events */ diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 121a2430d7f7..b6d3bae1c74d 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -15,8 +15,6 @@ #include #include -#define DEVFREQ_NAME_LEN 16 - /* DEVFREQ governor name */ #define DEVFREQ_GOV_SIMPLE_ONDEMAND "simple_ondemand" #define DEVFREQ_GOV_PERFORMANCE "performance" @@ -139,7 +137,6 @@ struct devfreq_stats { * using devfreq. * @profile: device-specific devfreq profile * @governor: method how to choose frequency based on the usage. - * @governor_name: devfreq governor name for use with this devfreq * @nb: notifier block used to notify devfreq object that it should * reevaluate operable frequencies. Devfreq users may use * devfreq.nb to the corresponding register notifier call chain. 
@@ -176,7 +173,6 @@ struct devfreq { struct device dev; struct devfreq_dev_profile *profile; const struct devfreq_governor *governor; - char governor_name[DEVFREQ_NAME_LEN]; struct notifier_block nb; struct delayed_work work; -- cgit From e275d2109cdaea8b4554b9eb8a828bdb8f8ba068 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 26 Oct 2020 10:08:47 +0200 Subject: bus: ti-sysc: Fix reset status check for modules with quirks Commit d46f9fbec719 ("bus: ti-sysc: Use optional clocks on for enable and wait for softreset bit") started showing a "OCP softreset timed out" warning on enable if the interconnect target module is not out of reset. This caused the warning to be often triggered for i2c and hdq while the devices are working properly. Turns out that some interconnect target modules seem to have an unusable reset status bits unless the module specific reset quirks are activated. Let's just skip the reset status check for those modules as we only want to activate the reset quirks when doing a reset, and not on enable. This way we don't see the bogus "OCP softreset timed out" warnings during boot. Fixes: d46f9fbec719 ("bus: ti-sysc: Use optional clocks on for enable and wait for softreset bit") Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 24 +++++++++++++++--------- include/linux/platform_data/ti-sysc.h | 1 + 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index efb088df1276..88a751c11677 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -970,9 +970,15 @@ static int sysc_enable_module(struct device *dev) return error; } } - error = sysc_wait_softreset(ddata); - if (error) - dev_warn(ddata->dev, "OCP softreset timed out\n"); + /* + * Some modules like i2c and hdq1w have unusable reset status unless + * the module reset quirk is enabled. Skip status check on enable. 
+ */ + if (!(ddata->cfg.quirks & SYSC_MODULE_QUIRK_ENA_RESETDONE)) { + error = sysc_wait_softreset(ddata); + if (error) + dev_warn(ddata->dev, "OCP softreset timed out\n"); + } if (ddata->cfg.quirks & SYSC_QUIRK_OPT_CLKS_IN_RESET) sysc_disable_opt_clocks(ddata); @@ -1373,17 +1379,17 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { SYSC_QUIRK("hdmi", 0, 0, 0x10, -ENODEV, 0x50030200, 0xffffffff, SYSC_QUIRK_OPT_CLKS_NEEDED), SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff, - SYSC_MODULE_QUIRK_HDQ1W), + SYSC_MODULE_QUIRK_HDQ1W | SYSC_MODULE_QUIRK_ENA_RESETDONE), SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x0000000a, 0xffffffff, - SYSC_MODULE_QUIRK_HDQ1W), + SYSC_MODULE_QUIRK_HDQ1W | SYSC_MODULE_QUIRK_ENA_RESETDONE), SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x00000036, 0x000000ff, - SYSC_MODULE_QUIRK_I2C), + SYSC_MODULE_QUIRK_I2C | SYSC_MODULE_QUIRK_ENA_RESETDONE), SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x0000003c, 0x000000ff, - SYSC_MODULE_QUIRK_I2C), + SYSC_MODULE_QUIRK_I2C | SYSC_MODULE_QUIRK_ENA_RESETDONE), SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x00000040, 0x000000ff, - SYSC_MODULE_QUIRK_I2C), + SYSC_MODULE_QUIRK_I2C | SYSC_MODULE_QUIRK_ENA_RESETDONE), SYSC_QUIRK("i2c", 0, 0, 0x10, 0x90, 0x5040000a, 0xfffff0f0, - SYSC_MODULE_QUIRK_I2C), + SYSC_MODULE_QUIRK_I2C | SYSC_MODULE_QUIRK_ENA_RESETDONE), SYSC_QUIRK("gpu", 0x50000000, 0x14, -ENODEV, -ENODEV, 0x00010201, 0xffffffff, 0), SYSC_QUIRK("gpu", 0x50000000, 0xfe00, 0xfe10, -ENODEV, 0x40000000 , 0xffffffff, SYSC_MODULE_QUIRK_SGX), diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index c59999ce044e..240dce553a0b 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -50,6 +50,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_MODULE_QUIRK_ENA_RESETDONE BIT(25) #define SYSC_MODULE_QUIRK_PRUSS BIT(24) #define SYSC_MODULE_QUIRK_DSS_RESET BIT(23) #define SYSC_MODULE_QUIRK_RTC_UNLOCK BIT(22) -- cgit From bc3d7bf61a9eaecccc84dc2ecc2a9a3fa4f5ec47 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Sat, 3 Oct 2020 23:25:31 -0400 Subject: elf: Expose ELF header in compat_start_thread() Like it is done for SET_PERSONALITY with x86, which requires the ELF header to select correct personality parameters, x86 requires the headers on compat_start_thread() to choose starting CS for ELF32 binaries, instead of relying on the going-away TIF_IA32/X32 flags. Add an indirection macro to ELF invocations of START_THREAD, that x86 can reimplement to receive the extra parameter just for ELF files. This requires no changes to other architectures who don't need the header information, they can continue to use the original start_thread for ELF and non-ELF binaries, and it prevents affecting non-ELF code paths for x86. 
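For illustration, an architecture that wants the header could reimplement the macro form roughly like this (a hedged sketch of the kind of override this enables, not part of this patch; the extra boolean parameter and its use are assumptions here):

/* e.g. in the arch's asm/elf.h, hypothetically: */
#define COMPAT_START_THREAD(ex, regs, new_ip, new_sp)		\
	compat_start_thread(regs, new_ip, new_sp,		\
			    (ex)->e_machine == EM_X86_64)

Architectures that define neither compat_start_thread() nor COMPAT_START_THREAD keep using plain start_thread() through the default START_THREAD() added to <linux/elf.h> below.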
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201004032536.1229030-6-krisman@collabora.com --- fs/binfmt_elf.c | 2 +- fs/compat_binfmt_elf.c | 9 +++++++-- include/linux/elf.h | 5 +++++ 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b6b3d052ca86..b23f7553fe9b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1307,7 +1307,7 @@ out_free_interp: #endif finalize_exec(bprm); - start_thread(regs, elf_entry, bprm->p); + START_THREAD(elf_ex, regs, elf_entry, bprm->p); retval = 0; out: return retval; diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 2d24c765cbd7..12b991368f0a 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -106,8 +106,13 @@ #endif #ifdef compat_start_thread -#undef start_thread -#define start_thread compat_start_thread +#define COMPAT_START_THREAD(ex, regs, new_ip, new_sp) \ + compat_start_thread(regs, new_ip, new_sp) +#endif + +#ifdef COMPAT_START_THREAD +#undef START_THREAD +#define START_THREAD COMPAT_START_THREAD #endif #ifdef compat_arch_setup_additional_pages diff --git a/include/linux/elf.h b/include/linux/elf.h index 5d5b0321da0b..6dbcfe7a3fd7 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -22,6 +22,11 @@ SET_PERSONALITY(ex) #endif +#ifndef START_THREAD +#define START_THREAD(elf_ex, regs, elf_entry, start_stack) \ + start_thread(regs, elf_entry, start_stack) +#endif + #define ELF32_GNU_PROPERTY_ALIGN 4 #define ELF64_GNU_PROPERTY_ALIGN 8 -- cgit From 9a29a671902c2be05d636045a4dd365219ca716c Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Sat, 3 Oct 2020 23:25:33 -0400 Subject: elf: Expose ELF header on arch_setup_additional_pages() Like it is done for SET_PERSONALITY with ARM, which requires the ELF header to select correct personality parameters, x86 requires the headers when selecting which VDSO to load, instead of relying on the going-away TIF_IA32/X32 flags. Add an indirection macro to arch_setup_additional_pages(), that x86 can reimplement to receive the extra parameter just for ELF files. This requires no changes to other architectures, who can continue to use the original arch_setup_additional_pages for ELF and non-ELF binaries. 
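The intended override mirrors the START_THREAD case from the previous patch (again a hedged sketch, not part of this patch; the extra parameter and its use are assumptions):

/* e.g. in the arch's asm/elf.h, hypothetically: */
#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter)	\
	compat_arch_setup_additional_pages(bprm, interpreter,		\
					   (ex)->e_machine == EM_X86_64)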
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201004032536.1229030-8-krisman@collabora.com --- fs/binfmt_elf.c | 2 +- fs/compat_binfmt_elf.c | 11 ++++++++--- include/linux/elf.h | 5 +++++ 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b23f7553fe9b..aabc11f099cf 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1246,7 +1246,7 @@ out_free_interp: set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = arch_setup_additional_pages(bprm, !!interpreter); + retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 12b991368f0a..2c557229696a 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -115,11 +115,16 @@ #define START_THREAD COMPAT_START_THREAD #endif -#ifdef compat_arch_setup_additional_pages +#ifdef compat_arch_setup_additional_pages +#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ + compat_arch_setup_additional_pages(bprm, interpreter) +#endif + +#ifdef COMPAT_ARCH_SETUP_ADDITIONAL_PAGES #undef ARCH_HAS_SETUP_ADDITIONAL_PAGES #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -#undef arch_setup_additional_pages -#define arch_setup_additional_pages compat_arch_setup_additional_pages +#undef ARCH_SETUP_ADDITIONAL_PAGES +#define ARCH_SETUP_ADDITIONAL_PAGES COMPAT_ARCH_SETUP_ADDITIONAL_PAGES #endif #ifdef compat_elf_read_implies_exec diff --git a/include/linux/elf.h b/include/linux/elf.h index 6dbcfe7a3fd7..c9a46c4e183b 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -27,6 +27,11 @@ start_thread(regs, elf_entry, start_stack) #endif +#if defined(ARCH_HAS_SETUP_ADDITIONAL_PAGES) && !defined(ARCH_SETUP_ADDITIONAL_PAGES) +#define ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ + arch_setup_additional_pages(bprm, interpreter) +#endif + #define ELF32_GNU_PROPERTY_ALIGN 4 #define ELF64_GNU_PROPERTY_ALIGN 8 -- cgit From ab589bac553f79d559952aa088480a72258ac5bc Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 19 Oct 2020 13:53:13 +0300 Subject: ASoC: adau1977: remove platform data and move micbias bindings include The change removes the platform_data include/definition. It only contains some values for the MICBIAS. These are moved into 'dt-bindings/sound/adi,adau1977.h' so that they can be used inside device-trees. When moving then, they need to be converted to pre-compiler defines, so that the DT compiler can understand them. The driver then, also needs to include the new 'dt-bindings/sound/adi,adau1977.h' file. 
Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20201019105313.24862-1-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- include/dt-bindings/sound/adi,adau1977.h | 15 +++++++++++ include/linux/platform_data/adau1977.h | 44 -------------------------------- sound/soc/codecs/adau1977.c | 9 +++---- 3 files changed, 18 insertions(+), 50 deletions(-) create mode 100644 include/dt-bindings/sound/adi,adau1977.h delete mode 100644 include/linux/platform_data/adau1977.h (limited to 'include/linux') diff --git a/include/dt-bindings/sound/adi,adau1977.h b/include/dt-bindings/sound/adi,adau1977.h new file mode 100644 index 000000000000..8eebec6570f2 --- /dev/null +++ b/include/dt-bindings/sound/adi,adau1977.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __DT_BINDINGS_ADI_ADAU1977_H__ +#define __DT_BINDINGS_ADI_ADAU1977_H__ + +#define ADAU1977_MICBIAS_5V0 0x0 +#define ADAU1977_MICBIAS_5V5 0x1 +#define ADAU1977_MICBIAS_6V0 0x2 +#define ADAU1977_MICBIAS_6V5 0x3 +#define ADAU1977_MICBIAS_7V0 0x4 +#define ADAU1977_MICBIAS_7V5 0x5 +#define ADAU1977_MICBIAS_8V0 0x6 +#define ADAU1977_MICBIAS_8V5 0x7 +#define ADAU1977_MICBIAS_9V0 0x8 + +#endif /* __DT_BINDINGS_ADI_ADAU1977_H__ */ diff --git a/include/linux/platform_data/adau1977.h b/include/linux/platform_data/adau1977.h deleted file mode 100644 index 86667235077a..000000000000 --- a/include/linux/platform_data/adau1977.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * ADAU1977/ADAU1978/ADAU1979 driver - * - * Copyright 2014 Analog Devices Inc. - * Author: Lars-Peter Clausen - */ - -#ifndef __LINUX_PLATFORM_DATA_ADAU1977_H__ -#define __LINUX_PLATFORM_DATA_ADAU1977_H__ - -/** - * enum adau1977_micbias - ADAU1977 MICBIAS pin voltage setting - * @ADAU1977_MICBIAS_5V0: MICBIAS is set to 5.0 V - * @ADAU1977_MICBIAS_5V5: MICBIAS is set to 5.5 V - * @ADAU1977_MICBIAS_6V0: MICBIAS is set to 6.0 V - * @ADAU1977_MICBIAS_6V5: MICBIAS is set to 6.5 V - * @ADAU1977_MICBIAS_7V0: MICBIAS is set to 7.0 V - * @ADAU1977_MICBIAS_7V5: MICBIAS is set to 7.5 V - * @ADAU1977_MICBIAS_8V0: MICBIAS is set to 8.0 V - * @ADAU1977_MICBIAS_8V5: MICBIAS is set to 8.5 V - * @ADAU1977_MICBIAS_9V0: MICBIAS is set to 9.0 V - */ -enum adau1977_micbias { - ADAU1977_MICBIAS_5V0 = 0x0, - ADAU1977_MICBIAS_5V5 = 0x1, - ADAU1977_MICBIAS_6V0 = 0x2, - ADAU1977_MICBIAS_6V5 = 0x3, - ADAU1977_MICBIAS_7V0 = 0x4, - ADAU1977_MICBIAS_7V5 = 0x5, - ADAU1977_MICBIAS_8V0 = 0x6, - ADAU1977_MICBIAS_8V5 = 0x7, - ADAU1977_MICBIAS_9V0 = 0x8, -}; - -/** - * struct adau1977_platform_data - Platform configuration data for the ADAU1977 - * @micbias: Specifies the voltage for the MICBIAS pin - */ -struct adau1977_platform_data { - enum adau1977_micbias micbias; -}; - -#endif diff --git a/sound/soc/codecs/adau1977.c b/sound/soc/codecs/adau1977.c index 0a36e523584c..8260f49caa24 100644 --- a/sound/soc/codecs/adau1977.c +++ b/sound/soc/codecs/adau1977.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -24,6 +23,8 @@ #include #include +#include + #include "adau1977.h" #define ADAU1977_REG_POWER 0x00 @@ -881,13 +882,9 @@ static const struct snd_soc_component_driver adau1977_component_driver = { static int adau1977_setup_micbias(struct adau1977 *adau1977) { - struct adau1977_platform_data *pdata = adau1977->dev->platform_data; unsigned int micbias; - if (pdata) - micbias = pdata->micbias; - else if (device_property_read_u32(adau1977->dev, "adi,micbias", - &micbias)) + if 
(device_property_read_u32(adau1977->dev, "adi,micbias", &micbias)) micbias = ADAU1977_MICBIAS_8V5; if (micbias > ADAU1977_MICBIAS_9V0) { -- cgit From 6e1e90ec027509a7e8d4efbd77a65b32b5a8b3ec Mon Sep 17 00:00:00 2001 From: Adrian Ratiu Date: Wed, 14 Oct 2020 23:30:24 +0300 Subject: regmap: mmio: add config option to allow relaxed MMIO accesses On some platforms (eg armv7 due to the CONFIG_ARM_DMA_MEM_BUFFERABLE) MMIO R/W operations always add memory barriers which can increase load, decrease battery life or in general reduce performance unnecessarily on devices which access a lot of configuration registers and where ordering does not matter (eg. media accelerators like the Verisilicon / Hantro video decoders). Drivers used to call the relaxed MMIO variants directly but since they are now accessing the MMIO registers via regmaps (to compensate for different VPU HW reg layouts via regmap fields), there is a need for a relaxed API / config to preserve existing behaviour. Cc: Mark Brown Signed-off-by: Adrian Ratiu Link: https://lore.kernel.org/r/20201014203024.954369-1-adrian.ratiu@collabora.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-mmio.c | 90 +++++++++++++++++++++++++++++++++++---- include/linux/regmap.h | 5 +++ 2 files changed, 87 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c index af967d8f975e..f9cd51afb9d2 100644 --- a/drivers/base/regmap/regmap-mmio.c +++ b/drivers/base/regmap/regmap-mmio.c @@ -16,6 +16,7 @@ struct regmap_mmio_context { void __iomem *regs; unsigned val_bytes; + bool relaxed_mmio; bool attached_clk; struct clk *clk; @@ -75,6 +76,13 @@ static void regmap_mmio_write8(struct regmap_mmio_context *ctx, writeb(val, ctx->regs + reg); } +static void regmap_mmio_write8_relaxed(struct regmap_mmio_context *ctx, + unsigned int reg, + unsigned int val) +{ + writeb_relaxed(val, ctx->regs + reg); +} + static void regmap_mmio_write16le(struct regmap_mmio_context *ctx, unsigned int reg, unsigned int val) @@ -82,6 +90,13 @@ static void regmap_mmio_write16le(struct regmap_mmio_context *ctx, writew(val, ctx->regs + reg); } +static void regmap_mmio_write16le_relaxed(struct regmap_mmio_context *ctx, + unsigned int reg, + unsigned int val) +{ + writew_relaxed(val, ctx->regs + reg); +} + static void regmap_mmio_write16be(struct regmap_mmio_context *ctx, unsigned int reg, unsigned int val) @@ -96,6 +111,13 @@ static void regmap_mmio_write32le(struct regmap_mmio_context *ctx, writel(val, ctx->regs + reg); } +static void regmap_mmio_write32le_relaxed(struct regmap_mmio_context *ctx, + unsigned int reg, + unsigned int val) +{ + writel_relaxed(val, ctx->regs + reg); +} + static void regmap_mmio_write32be(struct regmap_mmio_context *ctx, unsigned int reg, unsigned int val) @@ -110,6 +132,13 @@ static void regmap_mmio_write64le(struct regmap_mmio_context *ctx, { writeq(val, ctx->regs + reg); } + +static void regmap_mmio_write64le_relaxed(struct regmap_mmio_context *ctx, + unsigned int reg, + unsigned int val) +{ + writeq_relaxed(val, ctx->regs + reg); +} #endif static int regmap_mmio_write(void *context, unsigned int reg, unsigned int val) @@ -137,12 +166,24 @@ static unsigned int regmap_mmio_read8(struct regmap_mmio_context *ctx, return readb(ctx->regs + reg); } +static unsigned int regmap_mmio_read8_relaxed(struct regmap_mmio_context *ctx, + unsigned int reg) +{ + return readb_relaxed(ctx->regs + reg); +} + static unsigned int regmap_mmio_read16le(struct regmap_mmio_context *ctx, unsigned 
int reg) { return readw(ctx->regs + reg); } +static unsigned int regmap_mmio_read16le_relaxed(struct regmap_mmio_context *ctx, + unsigned int reg) +{ + return readw_relaxed(ctx->regs + reg); +} + static unsigned int regmap_mmio_read16be(struct regmap_mmio_context *ctx, unsigned int reg) { @@ -155,6 +196,12 @@ static unsigned int regmap_mmio_read32le(struct regmap_mmio_context *ctx, return readl(ctx->regs + reg); } +static unsigned int regmap_mmio_read32le_relaxed(struct regmap_mmio_context *ctx, + unsigned int reg) +{ + return readl_relaxed(ctx->regs + reg); +} + static unsigned int regmap_mmio_read32be(struct regmap_mmio_context *ctx, unsigned int reg) { @@ -167,6 +214,12 @@ static unsigned int regmap_mmio_read64le(struct regmap_mmio_context *ctx, { return readq(ctx->regs + reg); } + +static unsigned int regmap_mmio_read64le_relaxed(struct regmap_mmio_context *ctx, + unsigned int reg) +{ + return readq_relaxed(ctx->regs + reg); +} #endif static int regmap_mmio_read(void *context, unsigned int reg, unsigned int *val) @@ -237,6 +290,7 @@ static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, ctx->regs = regs; ctx->val_bytes = config->val_bits / 8; + ctx->relaxed_mmio = config->use_relaxed_mmio; ctx->clk = ERR_PTR(-ENODEV); switch (regmap_get_val_endian(dev, ®map_mmio, config)) { @@ -247,21 +301,41 @@ static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, #endif switch (config->val_bits) { case 8: - ctx->reg_read = regmap_mmio_read8; - ctx->reg_write = regmap_mmio_write8; + if (ctx->relaxed_mmio) { + ctx->reg_read = regmap_mmio_read8_relaxed; + ctx->reg_write = regmap_mmio_write8_relaxed; + } else { + ctx->reg_read = regmap_mmio_read8; + ctx->reg_write = regmap_mmio_write8; + } break; case 16: - ctx->reg_read = regmap_mmio_read16le; - ctx->reg_write = regmap_mmio_write16le; + if (ctx->relaxed_mmio) { + ctx->reg_read = regmap_mmio_read16le_relaxed; + ctx->reg_write = regmap_mmio_write16le_relaxed; + } else { + ctx->reg_read = regmap_mmio_read16le; + ctx->reg_write = regmap_mmio_write16le; + } break; case 32: - ctx->reg_read = regmap_mmio_read32le; - ctx->reg_write = regmap_mmio_write32le; + if (ctx->relaxed_mmio) { + ctx->reg_read = regmap_mmio_read32le_relaxed; + ctx->reg_write = regmap_mmio_write32le_relaxed; + } else { + ctx->reg_read = regmap_mmio_read32le; + ctx->reg_write = regmap_mmio_write32le; + } break; #ifdef CONFIG_64BIT case 64: - ctx->reg_read = regmap_mmio_read64le; - ctx->reg_write = regmap_mmio_write64le; + if (ctx->relaxed_mmio) { + ctx->reg_read = regmap_mmio_read64le_relaxed; + ctx->reg_write = regmap_mmio_write64le_relaxed; + } else { + ctx->reg_read = regmap_mmio_read64le; + ctx->reg_write = regmap_mmio_write64le; + } break; #endif default: diff --git a/include/linux/regmap.h b/include/linux/regmap.h index e7834d98207f..126fe700d1d8 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -315,6 +315,10 @@ typedef void (*regmap_unlock)(void *); * masks are used. * @zero_flag_mask: If set, read_flag_mask and write_flag_mask are used even * if they are both empty. + * @use_relaxed_mmio: If set, MMIO R/W operations will not use memory barriers. + * This can avoid load on devices which don't require strict + * orderings, but drivers should carefully add any explicit + * memory barriers when they may require them. * @use_single_read: If set, converts the bulk read operation into a series of * single read operations. This is useful for a device that * does not support bulk read. 
@@ -388,6 +392,7 @@ struct regmap_config { bool use_single_read; bool use_single_write; + bool use_relaxed_mmio; bool can_multi_write; enum regmap_endian reg_format_endian; -- cgit From ab7cffb8d2367e5b088c7c14452724e719a10eba Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Mon, 26 Oct 2020 20:20:40 +0100 Subject: MIPS: ingenic: remove unused platform_data header file There are no users of this header file in the kernel. All users have likely been migrated to device tree, which is a good thing. Signed-off-by: Sam Ravnborg Cc: Thomas Bogendoerfer Cc: Paul Cercueil Cc: Harvey Hunt Cc: linux-mips@vger.kernel.org Reviewed-by: Paul Cercueil Signed-off-by: Thomas Bogendoerfer --- include/linux/platform_data/jz4740/jz4740_nand.h | 25 ------------------------ 1 file changed, 25 deletions(-) delete mode 100644 include/linux/platform_data/jz4740/jz4740_nand.h (limited to 'include/linux') diff --git a/include/linux/platform_data/jz4740/jz4740_nand.h b/include/linux/platform_data/jz4740/jz4740_nand.h deleted file mode 100644 index b3f066d63059..000000000000 --- a/include/linux/platform_data/jz4740/jz4740_nand.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2009-2010, Lars-Peter Clausen - * JZ4740 SoC NAND controller driver - */ - -#ifndef __JZ4740_NAND_H__ -#define __JZ4740_NAND_H__ - -#include -#include - -#define JZ_NAND_NUM_BANKS 4 - -struct jz_nand_platform_data { - int num_partitions; - struct mtd_partition *partitions; - - unsigned char banks[JZ_NAND_NUM_BANKS]; - - void (*ident_callback)(struct platform_device *, struct mtd_info *, - struct mtd_partition **, int *num_partitions); -}; - -#endif -- cgit From 343a3e8bc635bd4c58d45a4fe67f9c3a78fbd191 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:20:50 +0100 Subject: bpf: Fix -Wshadow warnings There are thousands of warnings about one macro in a W=2 build: include/linux/filter.h:561:6: warning: declaration of 'ret' shadows a previous local [-Wshadow] Prefix all the locals in that macro with __ to avoid most of these warnings.
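For illustration (a minimal sketch, not the kernel macro itself; RUN(), RUN_FIXED() and do_thing() are made up for this example), the hazard with GNU C statement-expression macros looks like this:

	extern int do_thing(int x);

	/* A macro-local named "ret" collides with a "ret" at the call site. */
	#define RUN(x) ({ int ret = do_thing(x); ret; })

	int foo(void)
	{
		int ret = 0;

		ret = RUN(1);	/* macro-internal "ret" shadows foo()'s: -Wshadow */
		return ret;
	}

	/* Prefixing the macro-internal locals sidesteps the collision. */
	#define RUN_FIXED(x) ({ int __ret = do_thing(x); __ret; })

The __ prefix follows the usual kernel convention of reserving such names for macro and implementation internals.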
Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201026162110.3710415-1-arnd@kernel.org --- include/linux/filter.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 72d62cbc1578..1b62397bd124 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -558,21 +558,21 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); #define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ - u32 ret; \ + u32 __ret; \ cant_migrate(); \ if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *stats; \ - u64 start = sched_clock(); \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - stats = this_cpu_ptr(prog->aux->stats); \ - u64_stats_update_begin(&stats->syncp); \ - stats->cnt++; \ - stats->nsecs += sched_clock() - start; \ - u64_stats_update_end(&stats->syncp); \ + struct bpf_prog_stats *__stats; \ + u64 __start = sched_clock(); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&__stats->syncp); \ + __stats->cnt++; \ + __stats->nsecs += sched_clock() - __start; \ + u64_stats_update_end(&__stats->syncp); \ } else { \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ } \ - ret; }) + __ret; }) #define BPF_PROG_RUN(prog, ctx) \ __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) -- cgit From 6d915476e67d99b73a57bceb83cff1cf153d8bf6 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Sep 2020 08:44:50 -0400 Subject: audit: trigger accompanying records when no rules present When there are no audit rules registered, mandatory records (config, etc.) are missing their accompanying records (syscall, proctitle, etc.). This is because the audit context's dummy flag is set on syscall entry, based on the absence of rules, to signal that no other records are to be printed. Clear the dummy bit if any record is generated, open coding this in audit_log_start(). The proctitle context and dummy checks are pointless since the proctitle record will not be printed if no syscall records are printed. The fds array is reset to -1 after the first syscall to indicate it isn't valid any more, but was never set to -1 when the context was allocated to indicate it wasn't yet valid. Check ctx->pwd in audit_log_name(). The audit_inode* functions can be called without going through getname_flags() or getname_kernel() that sets audit_names and cwd, so set the cwd in audit_alloc_name() if it has not already been done (due to audit_names already being valid) and purge all other audit_getcwd() calls. Revert the LSM dump_common_audit_data() LSM_AUDIT_DATA_* cases from the ghak96 patch since they are no longer necessary due to cwd coverage in audit_alloc_name(). Thanks to bauen1 for reporting LSM situations in which context->cwd is not valid, inadvertently fixed by the ghak96 patch.
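As a rough sketch of the gating described above (simplified and illustrative only; the real logic lives in audit_log_start() and kernel/auditsc.c):

	/* Hypothetical, condensed model of the change: */
	struct ctx_sketch {
		int dummy;	/* set at syscall entry when no rules match */
	};

	void log_start_sketch(struct ctx_sketch *ctx)
	{
		if (ctx)
			ctx->dummy = 0;	/* generating any record cancels the
					 * dummy state, so the accompanying
					 * syscall record is emitted at exit */
	}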
Please see upstream github issue https://github.com/linux-audit/audit-kernel/issues/120 This is also related to upstream github issue https://github.com/linux-audit/audit-kernel/issues/96 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 8 -------- kernel/audit.c | 3 +++ kernel/auditsc.c | 27 +++++++-------------------- security/lsm_audit.c | 5 ----- 4 files changed, 10 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index b3d859831a31..82b7c1116a85 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -292,7 +292,6 @@ extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); -extern void __audit_getcwd(void); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); @@ -351,11 +350,6 @@ static inline void audit_getname(struct filename *name) if (unlikely(!audit_dummy_context())) __audit_getname(name); } -static inline void audit_getcwd(void) -{ - if (unlikely(audit_context())) - __audit_getcwd(); -} static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { @@ -584,8 +578,6 @@ static inline struct filename *audit_reusename(const __user char *name) } static inline void audit_getname(struct filename *name) { } -static inline void audit_getcwd(void) -{ } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) diff --git a/kernel/audit.c b/kernel/audit.c index 0be42cac086b..ac0aeaa99937 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1865,6 +1865,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); + /* cancel dummy context to enable supporting records */ + if (ctx) + ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8dba8f0983b5..183d79cc2e12 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -929,6 +929,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) context->prio = state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); + context->fds[0] = -1; return context; } @@ -1367,7 +1368,10 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); + if (context->pwd.dentry && context->pwd.mnt) + audit_log_d_path(ab, " name=", &context->pwd); + else + audit_log_format(ab, " name=(null)"); break; default: /* log the name's directory component */ @@ -1435,9 +1439,6 @@ static void audit_log_proctitle(void) struct audit_context *context = audit_context(); struct audit_buffer *ab; - if (!context || context->dummy) - return; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ @@ -1866,6 +1867,8 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, list_add_tail(&aname->list, &context->names_list); context->name_count++; + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); return aname; } @@ -1894,20 +1897,6 @@ __audit_reusename(const __user char *uptr) return NULL; } -inline void _audit_getcwd(struct audit_context *context) -{ - if (!context->pwd.dentry) - get_fs_pwd(current->fs, &context->pwd); -} - -void __audit_getcwd(void) -{ - struct audit_context *context = audit_context(); - - if (context->in_syscall) - _audit_getcwd(context); -} - /** * __audit_getname - add a name to the list * @name: name to add @@ -1931,8 +1920,6 @@ void __audit_getname(struct filename *name) n->name_len = AUDIT_NAME_FULL; name->aname = n; name->refcnt++; - - _audit_getcwd(context); } static inline int audit_copy_fcaps(struct audit_names *name, diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 53d0d183db8f..221370794d14 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -241,7 +241,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } - audit_getcwd(); break; } case LSM_AUDIT_DATA_FILE: { @@ -255,7 +254,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } - audit_getcwd(); break; } case LSM_AUDIT_DATA_IOCTL_OP: { @@ -271,7 +269,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); - audit_getcwd(); break; } case LSM_AUDIT_DATA_DENTRY: { @@ -286,7 +283,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } - audit_getcwd(); break; } case LSM_AUDIT_DATA_INODE: { @@ -304,7 +300,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); - audit_getcwd(); break; } case LSM_AUDIT_DATA_TASK: { -- cgit From 95de5094f5ac50b6f355f4e7dffcb6f34bd5dada Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 22 Sep 2020 18:24:29 +0800 Subject: firmware: imx: add dummy functions add dummy functions to avoid build failure when header files are included, but drivers are not built. 
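The general shape of the fix (a generic sketch; the "foo" names are placeholders, not part of this patch) is to pair each declaration with a static inline stub under the same Kconfig guard, so the header always compiles and links:

	#include <linux/errno.h>

	struct foo_ipc;

	#ifdef CONFIG_FOO
	int foo_call_rpc(struct foo_ipc *ipc, void *msg);
	#else
	static inline int foo_call_rpc(struct foo_ipc *ipc, void *msg)
	{
		return -ENOTSUPP;	/* callers see "unsupported", no link error */
	}
	#endif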
Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/ipc.h | 13 +++++++++++++ include/linux/firmware/imx/sci.h | 27 +++++++++++++++++++++++++++ include/linux/firmware/imx/svc/misc.h | 19 +++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/imx/ipc.h b/include/linux/firmware/imx/ipc.h index 891057434858..0b4643571625 100644 --- a/include/linux/firmware/imx/ipc.h +++ b/include/linux/firmware/imx/ipc.h @@ -34,6 +34,7 @@ struct imx_sc_rpc_msg { uint8_t func; }; +#ifdef CONFIG_IMX_SCU /* * This is an function to send an RPC message over an IPC channel. * It is called by client-side SCFW API function shims. @@ -55,4 +56,16 @@ int imx_scu_call_rpc(struct imx_sc_ipc *ipc, void *msg, bool have_resp); * @return Returns an error code (0 = success, failed if < 0) */ int imx_scu_get_handle(struct imx_sc_ipc **ipc); +#else +static inline int imx_scu_call_rpc(struct imx_sc_ipc *ipc, void *msg, + bool have_resp) +{ + return -ENOTSUPP; +} + +static inline int imx_scu_get_handle(struct imx_sc_ipc **ipc) +{ + return -ENOTSUPP; +} +#endif #endif /* _SC_IPC_H */ diff --git a/include/linux/firmware/imx/sci.h b/include/linux/firmware/imx/sci.h index 22c76571a294..5cc63fe7e84d 100644 --- a/include/linux/firmware/imx/sci.h +++ b/include/linux/firmware/imx/sci.h @@ -16,9 +16,36 @@ #include #include +#if IS_ENABLED(CONFIG_IMX_SCU) int imx_scu_enable_general_irq_channel(struct device *dev); int imx_scu_irq_register_notifier(struct notifier_block *nb); int imx_scu_irq_unregister_notifier(struct notifier_block *nb); int imx_scu_irq_group_enable(u8 group, u32 mask, u8 enable); int imx_scu_soc_init(struct device *dev); +#else +static inline int imx_scu_soc_init(struct device *dev) +{ + return -ENOTSUPP; +} + +static inline int imx_scu_enable_general_irq_channel(struct device *dev) +{ + return -ENOTSUPP; +} + +static inline int imx_scu_irq_register_notifier(struct notifier_block *nb) +{ + return -ENOTSUPP; +} + +static inline int imx_scu_irq_unregister_notifier(struct notifier_block *nb) +{ + return -ENOTSUPP; +} + +static inline int imx_scu_irq_group_enable(u8 group, u32 mask, u8 enable) +{ + return -ENOTSUPP; +} +#endif #endif /* _SC_SCI_H */ diff --git a/include/linux/firmware/imx/svc/misc.h b/include/linux/firmware/imx/svc/misc.h index 031dd4d3c766..760db08a67fc 100644 --- a/include/linux/firmware/imx/svc/misc.h +++ b/include/linux/firmware/imx/svc/misc.h @@ -46,6 +46,7 @@ enum imx_misc_func { * Control Functions */ +#ifdef CONFIG_IMX_SCU int imx_sc_misc_set_control(struct imx_sc_ipc *ipc, u32 resource, u8 ctrl, u32 val); @@ -54,5 +55,23 @@ int imx_sc_misc_get_control(struct imx_sc_ipc *ipc, u32 resource, int imx_sc_pm_cpu_start(struct imx_sc_ipc *ipc, u32 resource, bool enable, u64 phys_addr); +#else +static inline int imx_sc_misc_set_control(struct imx_sc_ipc *ipc, + u32 resource, u8 ctrl, u32 val) +{ + return -ENOTSUPP; +} +static inline int imx_sc_misc_get_control(struct imx_sc_ipc *ipc, + u32 resource, u8 ctrl, u32 *val) +{ + return -ENOTSUPP; +} + +static inline int imx_sc_pm_cpu_start(struct imx_sc_ipc *ipc, u32 resource, + bool enable, u64 phys_addr) +{ + return -ENOTSUPP; +} +#endif #endif /* _SC_MISC_API_H */ -- cgit From ea856ec266c1e6aecd2b107032d5b5d661f0686d Mon Sep 17 00:00:00 2001 From: Samuel Čavoj Date: Wed, 21 Oct 2020 00:09:44 +0200 Subject: platform/x86: asus-wmi: Add support for SW_TABLET_MODE on UX360 The UX360CA
has a WMI device id 0x00060062, which reports whether the lid is flipped in tablet mode (1) or in normal laptop mode (0). Add a quirk (quirk_asus_use_lid_flip_devid) for devices on which this WMI device should be used to figure out the SW_TABLET_MODE state, as opposed to the quirk_asus_use_kbd_dock_devid. Additionally, the device needs to be queried on resume and restore because the firmware does not generate an event if the laptop is put to sleep while in tablet mode, flipped to normal mode, and later awoken. It is assumed other UX360* models have the same WMI device. As such, the quirk is applied to devices with DMI_MATCH(DMI_PRODUCT_NAME, "UX360"). More devices with this feature need to be tested and added accordingly. The reason for using an allowlist via the quirk mechanism is that the new WMI device (0x00060062) is also present on some models which do not have a 360 degree hinge (at least FX503VD and GL503VD from Hans' DSTS collection) and therefore its presence cannot be relied on. Signed-off-by: Samuel Čavoj Cc: Hans de Goede Link: https://lore.kernel.org/r/20201020220944.1075530-1-samuel@cavoj.net Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/asus-nb-wmi.c | 15 +++++++++++ drivers/platform/x86/asus-wmi.c | 40 ++++++++++++++++++++++++++++++ drivers/platform/x86/asus-wmi.h | 1 + include/linux/platform_data/x86/asus-wmi.h | 1 + 4 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index 1d9fbabd02fb..d41d7ad14be0 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -119,6 +119,11 @@ static struct quirk_entry quirk_asus_use_kbd_dock_devid = { .use_kbd_dock_devid = true, }; +static struct quirk_entry quirk_asus_use_lid_flip_devid = { + .wmi_backlight_set_devstate = true, + .use_lid_flip_devid = true, +}; + static int dmi_matched(const struct dmi_system_id *dmi) { pr_info("Identified laptop model '%s'\n", dmi->ident); @@ -520,6 +525,16 @@ static const struct dmi_system_id asus_quirks[] = { }, .driver_data = &quirk_asus_use_kbd_dock_devid, }, + { + .callback = dmi_matched, + .ident = "ASUS ZenBook Flip UX360", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + /* Match UX360* */ + DMI_MATCH(DMI_PRODUCT_NAME, "UX360"), + }, + .driver_data = &quirk_asus_use_lid_flip_devid, + }, {}, }; diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 39e1a6396e08..fa884c418f4e 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -63,6 +63,7 @@ MODULE_LICENSE("GPL"); #define NOTIFY_KBD_BRTTOGGLE 0xc7 #define NOTIFY_KBD_FBM 0x99 #define NOTIFY_KBD_TTP 0xae +#define NOTIFY_LID_FLIP 0xfa #define ASUS_WMI_FNLOCK_BIOS_DISABLED BIT(0) @@ -375,6 +376,20 @@ static int asus_wmi_input_init(struct asus_wmi *asus) } } + if (asus->driver->quirks->use_lid_flip_devid) { + result = asus_wmi_get_devstate_simple(asus, ASUS_WMI_DEVID_LID_FLIP); + if (result < 0) + asus->driver->quirks->use_lid_flip_devid = 0; + if (result >= 0) { + input_set_capability(asus->inputdev, EV_SW, SW_TABLET_MODE); + input_report_switch(asus->inputdev, SW_TABLET_MODE, result); + } else if (result == -ENODEV) { + pr_err("This device has lid_flip quirk but got ENODEV checking it. 
This is a bug."); + } else { + pr_err("Error checking for lid-flip: %d\n", result); + } + } + err = input_register_device(asus->inputdev); if (err) goto err_free_dev; @@ -394,6 +409,18 @@ static void asus_wmi_input_exit(struct asus_wmi *asus) asus->inputdev = NULL; } +/* Tablet mode ****************************************************************/ + +static void lid_flip_tablet_mode_get_state(struct asus_wmi *asus) +{ + int result = asus_wmi_get_devstate_simple(asus, ASUS_WMI_DEVID_LID_FLIP); + + if (result >= 0) { + input_report_switch(asus->inputdev, SW_TABLET_MODE, result); + input_sync(asus->inputdev); + } +} + /* Battery ********************************************************************/ /* The battery maximum charging percentage */ @@ -2128,6 +2155,11 @@ static void asus_wmi_handle_event_code(int code, struct asus_wmi *asus) return; } + if (asus->driver->quirks->use_lid_flip_devid && code == NOTIFY_LID_FLIP) { + lid_flip_tablet_mode_get_state(asus); + return; + } + if (asus->fan_boost_mode_available && code == NOTIFY_KBD_FBM) { fan_boost_mode_switch_next(asus); return; @@ -2719,6 +2751,10 @@ static int asus_hotk_resume(struct device *device) if (asus_wmi_has_fnlock_key(asus)) asus_wmi_fnlock_update(asus); + + if (asus->driver->quirks->use_lid_flip_devid) + lid_flip_tablet_mode_get_state(asus); + return 0; } @@ -2757,6 +2793,10 @@ static int asus_hotk_restore(struct device *device) if (asus_wmi_has_fnlock_key(asus)) asus_wmi_fnlock_update(asus); + + if (asus->driver->quirks->use_lid_flip_devid) + lid_flip_tablet_mode_get_state(asus); + return 0; } diff --git a/drivers/platform/x86/asus-wmi.h b/drivers/platform/x86/asus-wmi.h index 1a95c172f94b..b302415bf1d9 100644 --- a/drivers/platform/x86/asus-wmi.h +++ b/drivers/platform/x86/asus-wmi.h @@ -34,6 +34,7 @@ struct quirk_entry { bool wmi_backlight_set_devstate; bool wmi_force_als_set; bool use_kbd_dock_devid; + bool use_lid_flip_devid; int wapf; /* * For machines with AMD graphic chips, it will send out WMI event diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 897b8332a39f..2f274cf52805 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -62,6 +62,7 @@ /* Misc */ #define ASUS_WMI_DEVID_CAMERA 0x00060013 +#define ASUS_WMI_DEVID_LID_FLIP 0x00060062 /* Storage */ #define ASUS_WMI_DEVID_CARDREADER 0x00080013 -- cgit From da31de35cd2fb78f75788873e0b61e56d4bb1a90 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Sat, 10 Oct 2020 02:47:49 +0200 Subject: tty: goldfish: use __raw_writel()/__raw_readl() gf_early_console_putchar() uses __raw_writel() but the standard driver uses writel()/readl(). This means we can't use both on the same machine as the device is either big-endian, little-endian or native-endian. 
As the Android implementation defines the endianness of the device to be that of the architecture, replace all writel()/readl() calls with __raw_writel()/__raw_readl(): https://android.googlesource.com/platform/external/qemu/+/refs/heads/emu-master-dev/hw/char/goldfish_tty.c#222 Signed-off-by: Laurent Vivier Link: https://lore.kernel.org/r/20201010004749.1201695-1-laurent@vivier.eu Signed-off-by: Greg Kroah-Hartman --- drivers/tty/goldfish.c | 18 +++++++++--------- include/linux/goldfish.h | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c index c8c5cdfc5e19..cd23a4b05c8f 100644 --- a/drivers/tty/goldfish.c +++ b/drivers/tty/goldfish.c @@ -61,13 +61,13 @@ static void do_rw_io(struct goldfish_tty *qtty, spin_lock_irqsave(&qtty->lock, irq_flags); gf_write_ptr((void *)address, base + GOLDFISH_TTY_REG_DATA_PTR, base + GOLDFISH_TTY_REG_DATA_PTR_HIGH); - writel(count, base + GOLDFISH_TTY_REG_DATA_LEN); + __raw_writel(count, base + GOLDFISH_TTY_REG_DATA_LEN); if (is_write) - writel(GOLDFISH_TTY_CMD_WRITE_BUFFER, + __raw_writel(GOLDFISH_TTY_CMD_WRITE_BUFFER, base + GOLDFISH_TTY_REG_CMD); else - writel(GOLDFISH_TTY_CMD_READ_BUFFER, + __raw_writel(GOLDFISH_TTY_CMD_READ_BUFFER, base + GOLDFISH_TTY_REG_CMD); spin_unlock_irqrestore(&qtty->lock, irq_flags); @@ -142,7 +142,7 @@ static irqreturn_t goldfish_tty_interrupt(int irq, void *dev_id) unsigned char *buf; u32 count; - count = readl(base + GOLDFISH_TTY_REG_BYTES_READY); + count = __raw_readl(base + GOLDFISH_TTY_REG_BYTES_READY); if (count == 0) return IRQ_NONE; @@ -159,7 +159,7 @@ static int goldfish_tty_activate(struct tty_port *port, struct tty_struct *tty) { struct goldfish_tty *qtty = container_of(port, struct goldfish_tty, port); - writel(GOLDFISH_TTY_CMD_INT_ENABLE, qtty->base + GOLDFISH_TTY_REG_CMD); + __raw_writel(GOLDFISH_TTY_CMD_INT_ENABLE, qtty->base + GOLDFISH_TTY_REG_CMD); return 0; } @@ -167,7 +167,7 @@ static void goldfish_tty_shutdown(struct tty_port *port) { struct goldfish_tty *qtty = container_of(port, struct goldfish_tty, port); - writel(GOLDFISH_TTY_CMD_INT_DISABLE, qtty->base + GOLDFISH_TTY_REG_CMD); + __raw_writel(GOLDFISH_TTY_CMD_INT_DISABLE, qtty->base + GOLDFISH_TTY_REG_CMD); } static int goldfish_tty_open(struct tty_struct *tty, struct file *filp) @@ -202,7 +202,7 @@ static int goldfish_tty_chars_in_buffer(struct tty_struct *tty) { struct goldfish_tty *qtty = &goldfish_ttys[tty->index]; void __iomem *base = qtty->base; - return readl(base + GOLDFISH_TTY_REG_BYTES_READY); + return __raw_readl(base + GOLDFISH_TTY_REG_BYTES_READY); } static void goldfish_tty_console_write(struct console *co, const char *b, @@ -357,7 +357,7 @@ static int goldfish_tty_probe(struct platform_device *pdev) * on Ranchu emulator (qemu2) returns 1 here and * driver will use physical addresses.
*/ - qtty->version = readl(base + GOLDFISH_TTY_REG_VERSION); + qtty->version = __raw_readl(base + GOLDFISH_TTY_REG_VERSION); /* * Goldfish TTY device on Ranchu emulator (qemu2) @@ -376,7 +376,7 @@ static int goldfish_tty_probe(struct platform_device *pdev) } } - writel(GOLDFISH_TTY_CMD_INT_DISABLE, base + GOLDFISH_TTY_REG_CMD); + __raw_writel(GOLDFISH_TTY_CMD_INT_DISABLE, base + GOLDFISH_TTY_REG_CMD); ret = request_irq(irq, goldfish_tty_interrupt, IRQF_SHARED, "goldfish_tty", qtty); diff --git a/include/linux/goldfish.h b/include/linux/goldfish.h index 265a099cd3b8..12be1601fd84 100644 --- a/include/linux/goldfish.h +++ b/include/linux/goldfish.h @@ -13,9 +13,9 @@ static inline void gf_write_ptr(const void *ptr, void __iomem *portl, { const unsigned long addr = (unsigned long)ptr; - writel(lower_32_bits(addr), portl); + __raw_writel(lower_32_bits(addr), portl); #ifdef CONFIG_64BIT - writel(upper_32_bits(addr), porth); + __raw_writel(upper_32_bits(addr), porth); #endif } @@ -23,9 +23,9 @@ static inline void gf_write_dma_addr(const dma_addr_t addr, void __iomem *portl, void __iomem *porth) { - writel(lower_32_bits(addr), portl); + __raw_writel(lower_32_bits(addr), portl); #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT - writel(upper_32_bits(addr), porth); + __raw_writel(upper_32_bits(addr), porth); #endif } -- cgit From caabdd0f59a9771ed095efe3ad5a08867b976ab2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 19 Oct 2020 09:35:39 +0200 Subject: ctype.h: remove duplicate isdigit() helper gcc warns a few thousand times about the isdigit() shadow: include/linux/ctype.h:26:19: warning: declaration of 'isdigit' shadows a built-in function [-Wshadow] As there is already a compiler builtin, just use that, and make it clear we do that by defining a macro. Unfortunately, clang does not have the isdigit() builtin, so this has to be conditional. Signed-off-by: Arnd Bergmann --- include/linux/compiler_types.h | 11 +++++++++++ include/linux/ctype.h | 15 +++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6e390d58a9f8..5d41dff3f79c 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -64,6 +64,17 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } /* Attributes */ #include +/* Builtins */ + +/* + * __has_builtin is supported on gcc >= 10, clang >= 3 and icc >= 21. + * In the meantime, to support gcc < 10, we implement __has_builtin + * by hand. + */ +#ifndef __has_builtin +#define __has_builtin(x) (0) +#endif + /* Compiler specific macros. */ #ifdef __clang__ #include diff --git a/include/linux/ctype.h b/include/linux/ctype.h index 363b004426db..bc95aef2219c 100644 --- a/include/linux/ctype.h +++ b/include/linux/ctype.h @@ -2,6 +2,8 @@ #ifndef _LINUX_CTYPE_H #define _LINUX_CTYPE_H +#include + /* * NOTE! This ctype does not handle EOF like the standard C * library is required to. 
@@ -23,10 +25,6 @@ extern const unsigned char _ctype[]; #define isalnum(c) ((__ismask(c)&(_U|_L|_D)) != 0) #define isalpha(c) ((__ismask(c)&(_U|_L)) != 0) #define iscntrl(c) ((__ismask(c)&(_C)) != 0) -static inline int isdigit(int c) -{ - return '0' <= c && c <= '9'; -} #define isgraph(c) ((__ismask(c)&(_P|_U|_L|_D)) != 0) #define islower(c) ((__ismask(c)&(_L)) != 0) #define isprint(c) ((__ismask(c)&(_P|_U|_L|_D|_SP)) != 0) @@ -39,6 +37,15 @@ static inline int isdigit(int c) #define isascii(c) (((unsigned char)(c))<=0x7f) #define toascii(c) (((unsigned char)(c))&0x7f) +#if __has_builtin(__builtin_isdigit) +#define isdigit(c) __builtin_isdigit(c) +#else +static inline int isdigit(int c) +{ + return '0' <= c && c <= '9'; +} +#endif + static inline unsigned char __tolower(unsigned char c) { if (isupper(c)) -- cgit From 39613eaad3ceff320da344427a70c655e783475e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 28 Oct 2020 12:16:10 +0530 Subject: qcom-geni-se: remove has_opp_table has_opp_table isn't used anymore, remove it. Signed-off-by: Viresh Kumar Link: https://lore.kernel.org/r/08ec1ee1d4252a266956abb5f1e0e0026d753564.1603867487.git.viresh.kumar@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/qcom-geni-se.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h index f7bbea3f09ca..ec2ad4b0fe14 100644 --- a/include/linux/qcom-geni-se.h +++ b/include/linux/qcom-geni-se.h @@ -48,7 +48,6 @@ struct geni_icc_path { * @clk_perf_tbl: Table of clock frequency input to serial engine clock * @icc_paths: Array of ICC paths for SE * @opp_table: Pointer to the OPP table - * @has_opp_table: Specifies if the SE has an OPP table */ struct geni_se { void __iomem *base; @@ -59,7 +58,6 @@ struct geni_se { unsigned long *clk_perf_tbl; struct geni_icc_path icc_paths[3]; struct opp_table *opp_table; - bool has_opp_table; }; /* Common SE registers */ -- cgit From f1f37abbe6fc2b1242f78157db76e48dbf9518ee Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 19 Oct 2020 15:40:46 +0200 Subject: gpio: Retire the explicit gpio irqchip code Now that all gpiolib irqchip users have moved over to using the irqchip template, we can finally retire the old code path and leave just one way into the irqchip: set up the template when registering the gpio_chip. For a while we had two code paths for this which was a bit confusing. This brings the work to a conclusion: there is now one way of doing this. Signed-off-by: Linus Walleij Reviewed-by: Andy Shevchenko Cc: Thierry Reding Link: https://lore.kernel.org/r/20201019134046.65101-1-linus.walleij@linaro.org --- Documentation/driver-api/gpio/driver.rst | 63 ++++++++----- drivers/gpio/TODO | 49 ---------- drivers/gpio/gpiolib.c | 153 ------------------------------- include/linux/gpio/driver.h | 71 -------------- 4 files changed, 42 insertions(+), 294 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst index 072a7455044e..65d708093b71 100644 --- a/Documentation/driver-api/gpio/driver.rst +++ b/Documentation/driver-api/gpio/driver.rst @@ -416,7 +416,8 @@ The preferred way to set up the helpers is to fill in the struct gpio_irq_chip inside struct gpio_chip before adding the gpio_chip. If you do this, the additional irq_chip will be set up by gpiolib at the same time as setting up the rest of the GPIO functionality.
The following -is a typical example of a cascaded interrupt handler using gpio_irq_chip: +is a typical example of a chained cascaded interrupt handler using +the gpio_irq_chip: .. code-block:: c @@ -452,7 +453,46 @@ is a typical example of a cascaded interrupt handler using gpio_irq_chip: return devm_gpiochip_add_data(dev, &g->gc, g); -The helper support using hierarchical interrupt controllers as well. +The helper supports using threaded interrupts as well. Then you just request +the interrupt separately and go with it: + +.. code-block:: c + + /* Typical state container with dynamic irqchip */ + struct my_gpio { + struct gpio_chip gc; + struct irq_chip irq; + }; + + int irq; /* from platform etc */ + struct my_gpio *g; + struct gpio_irq_chip *girq; + + /* Set up the irqchip dynamically */ + g->irq.name = "my_gpio_irq"; + g->irq.irq_ack = my_gpio_ack_irq; + g->irq.irq_mask = my_gpio_mask_irq; + g->irq.irq_unmask = my_gpio_unmask_irq; + g->irq.irq_set_type = my_gpio_set_irq_type; + + ret = devm_request_threaded_irq(dev, irq, NULL, + irq_thread_fn, IRQF_ONESHOT, "my-chip", g); + if (ret < 0) + return ret; + + /* Get a pointer to the gpio_irq_chip */ + girq = &g->gc.irq; + girq->chip = &g->irq; + /* This will let us handle the parent IRQ in the driver */ + girq->parent_handler = NULL; + girq->num_parents = 0; + girq->parents = NULL; + girq->default_type = IRQ_TYPE_NONE; + girq->handler = handle_bad_irq; + + return devm_gpiochip_add_data(dev, &g->gc, g); + +The helper supports using hierarchical interrupt controllers as well. In this case the typical set-up will look like this: .. code-block:: c @@ -493,25 +533,6 @@ the parent hardware irq from a child (i.e. this gpio chip) hardware irq. As always it is good to look at examples in the kernel tree for advice on how to find the required pieces. -The old way of adding irqchips to gpiochips after registration is also still -available but we try to move away from this: - -- DEPRECATED: gpiochip_irqchip_add(): adds a chained cascaded irqchip to a - gpiochip. It will pass the struct gpio_chip* for the chip to all IRQ - callbacks, so the callbacks need to embed the gpio_chip in its state - container and obtain a pointer to the container using container_of(). - (See Documentation/driver-api/driver-model/design-patterns.rst) - -- gpiochip_irqchip_add_nested(): adds a nested cascaded irqchip to a gpiochip, - as discussed above regarding different types of cascaded irqchips. The - cascaded irq has to be handled by a threaded interrupt handler. - Apart from that it works exactly like the chained irqchip. - -- gpiochip_set_nested_irqchip(): sets up a nested cascaded irq handler for a - gpio_chip from a parent IRQ. As the parent IRQ has usually been - explicitly requested by the driver, this does very little more than - mark all the child IRQs as having the other IRQ as parent. - If there is a need to exclude certain GPIO lines from the IRQ domain handled by these helpers, we can set .irq.need_valid_mask of the gpiochip before devm_gpiochip_add_data() or gpiochip_add_data() is called. This allocates an diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index e560e45e84f8..cd04e0b60159 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -129,58 +129,9 @@ GPIOLIB irqchip The GPIOLIB irqchip is a helper irqchip for "simple cases" that should try to cover any generic kind of irqchip cascaded from a GPIO. -- Convert all the GPIOLIB_IRQCHIP users to pass an irqchip template, - parent and flags before calling [devm_]gpiochip_add[_data](). 
- Currently we set up the irqchip after setting up the gpiochip - using gpiochip_irqchip_add() and gpiochip_set_[chained|nested]_irqchip(). - This is too complex, so convert all users over to just set up - the irqchip before registering the gpio_chip, typical example: - - /* Typical state container with dynamic irqchip */ - struct my_gpio { - struct gpio_chip gc; - struct irq_chip irq; - }; - - int irq; /* from platform etc */ - struct my_gpio *g; - struct gpio_irq_chip *girq; - - /* Set up the irqchip dynamically */ - g->irq.name = "my_gpio_irq"; - g->irq.irq_ack = my_gpio_ack_irq; - g->irq.irq_mask = my_gpio_mask_irq; - g->irq.irq_unmask = my_gpio_unmask_irq; - g->irq.irq_set_type = my_gpio_set_irq_type; - - /* Get a pointer to the gpio_irq_chip */ - girq = &g->gc.irq; - girq->chip = &g->irq; - girq->parent_handler = ftgpio_gpio_irq_handler; - girq->num_parents = 1; - girq->parents = devm_kcalloc(dev, 1, sizeof(*girq->parents), - GFP_KERNEL); - if (!girq->parents) - return -ENOMEM; - girq->default_type = IRQ_TYPE_NONE; - girq->handler = handle_bad_irq; - girq->parents[0] = irq; - - When this is done, we will delete the old APIs for instatiating - GPIOLIB_IRQCHIP and simplify the code. - - Look over and identify any remaining easily converted drivers and dry-code conversions to gpiolib irqchip for maintainers to test -- Drop gpiochip_set_chained_irqchip() when all the chained irqchips - have been converted to the above infrastructure. - -- Add more infrastructure to make it possible to also pass a threaded - irqchip in struct gpio_irq_chip. - -- Drop gpiochip_irqchip_add_nested() when all the chained irqchips - have been converted to the above infrastructure. - Increase integration with pin control diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 3b23a0ca77dd..8e29a60c3697 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -924,67 +924,6 @@ bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc, } EXPORT_SYMBOL_GPL(gpiochip_irqchip_irq_valid); -/** - * gpiochip_set_cascaded_irqchip() - connects a cascaded irqchip to a gpiochip - * @gc: the gpiochip to set the irqchip chain to - * @parent_irq: the irq number corresponding to the parent IRQ for this - * cascaded irqchip - * @parent_handler: the parent interrupt handler for the accumulated IRQ - * coming out of the gpiochip. If the interrupt is nested rather than - * cascaded, pass NULL in this handler argument - */ -static void gpiochip_set_cascaded_irqchip(struct gpio_chip *gc, - unsigned int parent_irq, - irq_flow_handler_t parent_handler) -{ - struct gpio_irq_chip *girq = &gc->irq; - struct device *dev = &gc->gpiodev->dev; - - if (!girq->domain) { - chip_err(gc, "called %s before setting up irqchip\n", - __func__); - return; - } - - if (parent_handler) { - if (gc->can_sleep) { - chip_err(gc, - "you cannot have chained interrupts on a chip that may sleep\n"); - return; - } - girq->parents = devm_kcalloc(dev, 1, - sizeof(*girq->parents), - GFP_KERNEL); - if (!girq->parents) { - chip_err(gc, "out of memory allocating parent IRQ\n"); - return; - } - girq->parents[0] = parent_irq; - girq->num_parents = 1; - /* - * The parent irqchip is already using the chip_data for this - * irqchip, so our callbacks simply use the handler_data. 
- */ - irq_set_chained_handler_and_data(parent_irq, parent_handler, - gc); - } -} - -/** - * gpiochip_set_nested_irqchip() - connects a nested irqchip to a gpiochip - * @gc: the gpiochip to set the irqchip nested handler to - * @irqchip: the irqchip to nest to the gpiochip - * @parent_irq: the irq number corresponding to the parent IRQ for this - * nested irqchip - */ -void gpiochip_set_nested_irqchip(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int parent_irq) -{ - gpiochip_set_cascaded_irqchip(gc, parent_irq, NULL); -} -EXPORT_SYMBOL_GPL(gpiochip_set_nested_irqchip); - #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** @@ -1635,98 +1574,6 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gc) gpiochip_irqchip_free_valid_mask(gc); } -/** - * gpiochip_irqchip_add_key() - adds an irqchip to a gpiochip - * @gc: the gpiochip to add the irqchip to - * @irqchip: the irqchip to add to the gpiochip - * @first_irq: if not dynamically assigned, the base (first) IRQ to - * allocate gpiochip irqs from - * @handler: the irq handler to use (often a predefined irq core function) - * @type: the default type for IRQs on this irqchip, pass IRQ_TYPE_NONE - * to have the core avoid setting up any default type in the hardware. - * @threaded: whether this irqchip uses a nested thread handler - * @lock_key: lockdep class for IRQ lock - * @request_key: lockdep class for IRQ request - * - * This function closely associates a certain irqchip with a certain - * gpiochip, providing an irq domain to translate the local IRQs to - * global irqs in the gpiolib core, and making sure that the gpiochip - * is passed as chip data to all related functions. Driver callbacks - * need to use gpiochip_get_data() to get their local state containers back - * from the gpiochip passed as chip data. An irqdomain will be stored - * in the gpiochip that shall be used by the driver to handle IRQ number - * translation. The gpiochip will need to be initialized and registered - * before calling this function. - * - * This function will handle two cell:ed simple IRQs and assumes all - * the pins on the gpiochip can generate a unique IRQ. Everything else - * need to be open coded. - */ -int gpiochip_irqchip_add_key(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type, - bool threaded, - struct lock_class_key *lock_key, - struct lock_class_key *request_key) -{ - struct device_node *of_node; - - if (!gc || !irqchip) - return -EINVAL; - - if (!gc->parent) { - chip_err(gc, "missing gpiochip .dev parent pointer\n"); - return -EINVAL; - } - gc->irq.threaded = threaded; - of_node = gc->parent->of_node; -#ifdef CONFIG_OF_GPIO - /* - * If the gpiochip has an assigned OF node this takes precedence - * FIXME: get rid of this and use gc->parent->of_node - * everywhere - */ - if (gc->of_node) - of_node = gc->of_node; -#endif - /* - * Specifying a default trigger is a terrible idea if DT or ACPI is - * used to configure the interrupts, as you may end-up with - * conflicting triggers. Tell the user, and reset to NONE. 
- */ - if (WARN(of_node && type != IRQ_TYPE_NONE, - "%pOF: Ignoring %d default trigger\n", of_node, type)) - type = IRQ_TYPE_NONE; - if (has_acpi_companion(gc->parent) && type != IRQ_TYPE_NONE) { - acpi_handle_warn(ACPI_HANDLE(gc->parent), - "Ignoring %d default trigger\n", type); - type = IRQ_TYPE_NONE; - } - - gc->irq.chip = irqchip; - gc->irq.handler = handler; - gc->irq.default_type = type; - gc->to_irq = gpiochip_to_irq; - gc->irq.lock_key = lock_key; - gc->irq.request_key = request_key; - gc->irq.domain = irq_domain_add_simple(of_node, - gc->ngpio, first_irq, - &gpiochip_domain_ops, gc); - if (!gc->irq.domain) { - gc->irq.chip = NULL; - return -EINVAL; - } - - gpiochip_set_irq_hooks(gc); - - acpi_gpiochip_request_interrupts(gc); - - return 0; -} -EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_key); - /** * gpiochip_irqchip_add_domain() - adds an irqdomain to a gpiochip * @gc: the gpiochip to add the irqchip to diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 4a7e295c3640..286de0520574 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -621,83 +621,12 @@ int gpiochip_irq_domain_activate(struct irq_domain *domain, void gpiochip_irq_domain_deactivate(struct irq_domain *domain, struct irq_data *data); -void gpiochip_set_nested_irqchip(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int parent_irq); - -int gpiochip_irqchip_add_key(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type, - bool threaded, - struct lock_class_key *lock_key, - struct lock_class_key *request_key); - bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc, unsigned int offset); int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain); -#ifdef CONFIG_LOCKDEP - -/* - * Lockdep requires that each irqchip instance be created with a - * unique key so as to avoid unnecessary warnings. This upfront - * boilerplate static inlines provides such a key for each - * unique instance. - */ -static inline int gpiochip_irqchip_add(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type) -{ - static struct lock_class_key lock_key; - static struct lock_class_key request_key; - - return gpiochip_irqchip_add_key(gc, irqchip, first_irq, - handler, type, false, - &lock_key, &request_key); -} - -static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type) -{ - - static struct lock_class_key lock_key; - static struct lock_class_key request_key; - - return gpiochip_irqchip_add_key(gc, irqchip, first_irq, - handler, type, true, - &lock_key, &request_key); -} -#else /* ! 
CONFIG_LOCKDEP */ -static inline int gpiochip_irqchip_add(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type) -{ - return gpiochip_irqchip_add_key(gc, irqchip, first_irq, - handler, type, false, NULL, NULL); -} - -static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type) -{ - return gpiochip_irqchip_add_key(gc, irqchip, first_irq, - handler, type, true, NULL, NULL); -} -#endif /* CONFIG_LOCKDEP */ - int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset); void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset); int gpiochip_generic_config(struct gpio_chip *gc, unsigned int offset, -- cgit From 6a6223ec7779dfdabb9c2567bb42079bc300cf27 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:13 +0100 Subject: blk-mq: docs: add kernel-doc description for a new struct member As reported by kernel-doc: ./include/linux/blk-mq.h:267: warning: Function parameter or member 'active_queues_shared_sbitmap' not described in 'blk_mq_tag_set' There is now a new member for struct blk_mq_tag_set. Add a description for it, based on the commit that introduced it. Fixes: f1b49fdc1c64 ("blk-mq: Record active_queues_shared_sbitmap per tag_set for when using shared sbitmap") Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Jens Axboe Reviewed-by: John Garry Link: https://lore.kernel.org/r/8e513153b83eefc05e358f51f2632b592c3f6772.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b23eeca4d677..794b2a33a2c3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -235,6 +235,8 @@ enum hctx_type { * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. + * @active_queues_shared_sbitmap: + * number of active request queues per tag set. * @__bitmap_tags: A shared tags sbitmap, used over all hctx's * @__breserved_tags: * A shared reserved tags sbitmap, used over all hctx's -- cgit From 89b422354409c275e898d26607201797cc05a932 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:17 +0100 Subject: mm: pagemap.h: fix two kernel-doc markups Changeset a8cf7f272b5a ("mm: add find_lock_head") renamed the index parameter, but forgot to update the kernel-doc markups accordingly. Fixes: a8cf7f272b5a ("mm: add find_lock_head") Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/dce89b296a4f5f9f8f798d5e76b6736c14a916ac.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/pagemap.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c77b7c31b2e4..e1e19c1f9ec9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -344,9 +344,9 @@ static inline struct page *find_get_page_flags(struct address_space *mapping, /** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search - * @offset: the page index + * @index: the page index * - * Looks up the page cache entry at @mapping & @offset. If there is a + * Looks up the page cache entry at @mapping & @index. 
If there is a * page cache page, it is returned locked and with an increased * refcount. * @@ -363,9 +363,9 @@ static inline struct page *find_lock_page(struct address_space *mapping, /** * find_lock_head - Locate, pin and lock a pagecache page. * @mapping: The address_space to search. - * @offset: The page index. + * @index: The page index. * - * Looks up the page cache entry at @mapping & @offset. If there is a + * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, its head page is returned locked and with an increased * refcount. * -- cgit From e86c6569c588a01f20e7554cc245f8fae831957b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:18 +0100 Subject: net: phy: remove kernel-doc duplication Sphinx 3 now checks for duplicated function declarations: .../Documentation/networking/kapi:143: ../include/linux/phy.h:163: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'unsigned int phy_supported_speeds (struct phy_device *phy, unsigned int *speeds, unsigned int size)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1034: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int phy_read_mmd (struct phy_device *phydev, int devad, u32 regnum)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1076: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int __phy_read_mmd (struct phy_device *phydev, int devad, u32 regnum)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1088: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int phy_write_mmd (struct phy_device *phydev, int devad, u32 regnum, u16 val)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1100: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int __phy_write_mmd (struct phy_device *phydev, int devad, u32 regnum, u16 val)'. It turns out that both the C and the H files have the same kernel-doc markup for the same functions. Let's drop the one at the header file, keeping the one closer to the code. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/75e9a357f9a716833d2094b04898754876365e68.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/phy.h | 40 +++++----------------------------------- 1 file changed, 5 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index eb3cb1a98b45..56563e5e0dc7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -147,16 +147,8 @@ typedef enum { PHY_INTERFACE_MODE_MAX, } phy_interface_t; -/** +/* * phy_supported_speeds - return all speeds currently supported by a PHY device - * @phy: The PHY device to return supported speeds of. - * @speeds: buffer to store supported speeds in. - * @size: size of speeds buffer. - * - * Description: Returns the number of supported speeds, and fills - * the speeds buffer with the supported speeds. If speeds buffer is - * too small to contain all currently supported speeds, will return as - * many speeds as can fit. */ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int *speeds, @@ -1022,14 +1014,9 @@ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, regnum, mask, set); } -/** +/* * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY.
- * @phydev: The phy_device struct - * @devad: The MMD to read from - * @regnum: The register on the MMD to read - * - * Same rules as for phy_read(); */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); @@ -1064,38 +1051,21 @@ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); __ret; \ }) -/** +/* * __phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. - * @phydev: The phy_device struct - * @devad: The MMD to read from - * @regnum: The register on the MMD to read - * - * Same rules as for __phy_read(); */ int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); -/** +/* * phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. - * @phydev: The phy_device struct - * @devad: The MMD to write to - * @regnum: The register on the MMD to read - * @val: value to write to @regnum - * - * Same rules as for phy_write(); */ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); -/** +/* * __phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. - * @phydev: The phy_device struct - * @devad: The MMD to write to - * @regnum: The register on the MMD to read - * @val: value to write to @regnum - * - * Same rules as for __phy_write(); */ int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); -- cgit From cf38cc9f1e71151f22584c40357afaab6609384b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:23 +0100 Subject: locking/refcount: move kernel-doc markups to the proper place Changeset a435b9a14356 ("locking/refcount: Provide __refcount API to obtain the old value") added a set of functions starting with __ that have a new parameter, adding a series of new warnings: $ ./scripts/kernel-doc -none include/linux/refcount.h include/linux/refcount.h:169: warning: Function parameter or member 'oldp' not described in '__refcount_add_not_zero' include/linux/refcount.h:208: warning: Function parameter or member 'oldp' not described in '__refcount_add' include/linux/refcount.h:239: warning: Function parameter or member 'oldp' not described in '__refcount_inc_not_zero' include/linux/refcount.h:261: warning: Function parameter or member 'oldp' not described in '__refcount_inc' include/linux/refcount.h:291: warning: Function parameter or member 'oldp' not described in '__refcount_sub_and_test' include/linux/refcount.h:327: warning: Function parameter or member 'oldp' not described in '__refcount_dec_and_test' include/linux/refcount.h:347: warning: Function parameter or member 'oldp' not described in '__refcount_dec' The issue is that the kernel-doc markups are now misplaced, as they should be added just before the functions. So, move the kernel-doc markups to the proper places, in order to drop the warnings. It should be noted that git show produces crappy output for this patch without the "--patience" flag.
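The placement rule being restored (shown with a hypothetical helper, assuming <linux/refcount.h>): scripts/kernel-doc attaches a /** ... */ block to whatever definition immediately follows it, so the markup must sit directly above the function it documents:

	#include <linux/refcount.h>

	/**
	 * my_inc - hypothetical helper, not part of this patch
	 * @r: the refcount
	 *
	 * Any unrelated definition placed between this block and my_inc()
	 * would make kernel-doc attribute the comment to the wrong function.
	 */
	static inline void my_inc(refcount_t *r)
	{
		refcount_inc(r);
	}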
Fixes: a435b9a14356 ("locking/refcount: Provide __refcount API to obtain the old value") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/7985c31d1ace591bc5e1faa05c367f1295b78afd.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/refcount.h | 130 +++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 7fabb1af18e0..497990c69b0b 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -147,24 +147,6 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } -/** - * refcount_add_not_zero - add a value to a refcount unless it is 0 - * @i: the value to add to the refcount - * @r: the refcount - * - * Will saturate at REFCOUNT_SATURATED and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_inc(), or one of its variants, should instead be used to - * increment a reference count. - * - * Return: false if the passed refcount is 0, true otherwise - */ static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); @@ -183,17 +165,12 @@ static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, in return old; } -static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) -{ - return __refcount_add_not_zero(i, r, NULL); -} - /** - * refcount_add - add a value to a refcount + * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * - * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. + * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency @@ -203,7 +180,14 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. + * + * Return: false if the passed refcount is 0, true otherwise */ +static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) +{ + return __refcount_add_not_zero(i, r, NULL); +} + static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); @@ -217,11 +201,32 @@ static inline void __refcount_add(int i, refcount_t *r, int *oldp) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } +/** + * refcount_add - add a value to a refcount + * @i: the value to add to the refcount + * @r: the refcount + * + * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. 
+ * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + */ static inline void refcount_add(int i, refcount_t *r) { __refcount_add(i, r, NULL); } +static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero(1, r, oldp); +} + /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment @@ -235,14 +240,14 @@ static inline void refcount_add(int i, refcount_t *r) * * Return: true if the increment was successful, false otherwise */ -static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) +static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { - return __refcount_add_not_zero(1, r, oldp); + return __refcount_inc_not_zero(r, NULL); } -static inline __must_check bool refcount_inc_not_zero(refcount_t *r) +static inline void __refcount_inc(refcount_t *r, int *oldp) { - return __refcount_inc_not_zero(r, NULL); + __refcount_add(1, r, oldp); } /** @@ -257,14 +262,27 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. */ -static inline void __refcount_inc(refcount_t *r, int *oldp) +static inline void refcount_inc(refcount_t *r) { - __refcount_add(1, r, oldp); + __refcount_inc(r, NULL); } -static inline void refcount_inc(refcount_t *r) +static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { - __refcount_inc(r, NULL); + int old = atomic_fetch_sub_release(i, &r->refs); + + if (oldp) + *oldp = old; + + if (old == i) { + smp_acquire__after_ctrl_dep(); + return true; + } + + if (unlikely(old < 0 || old - i < 0)) + refcount_warn_saturate(r, REFCOUNT_SUB_UAF); + + return false; } /** @@ -287,27 +305,14 @@ static inline void refcount_inc(refcount_t *r) * * Return: true if the resulting refcount is 0, false otherwise */ -static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) +static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { - int old = atomic_fetch_sub_release(i, &r->refs); - - if (oldp) - *oldp = old; - - if (old == i) { - smp_acquire__after_ctrl_dep(); - return true; - } - - if (unlikely(old < 0 || old - i < 0)) - refcount_warn_saturate(r, REFCOUNT_SUB_UAF); - - return false; + return __refcount_sub_and_test(i, r, NULL); } -static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) +static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) { - return __refcount_sub_and_test(i, r, NULL); + return __refcount_sub_and_test(1, r, oldp); } /** @@ -323,26 +328,11 @@ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) * * Return: true if the resulting refcount is 0, false otherwise */ -static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) -{ - return __refcount_sub_and_test(1, r, oldp); -} - static inline __must_check bool refcount_dec_and_test(refcount_t *r) { return __refcount_dec_and_test(r, NULL); } -/** - * refcount_dec - decrement a refcount - * @r: the refcount - * - * Similar to atomic_dec(), it will WARN on underflow and fail to decrement - * when saturated at REFCOUNT_SATURATED. 
- * - * Provides release memory ordering, such that prior loads and stores are done - * before. - */ static inline void __refcount_dec(refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(1, &r->refs); @@ -354,6 +344,16 @@ static inline void __refcount_dec(refcount_t *r, int *oldp) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } +/** + * refcount_dec - decrement a refcount + * @r: the refcount + * + * Similar to atomic_dec(), it will WARN on underflow and fail to decrement + * when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before. + */ static inline void refcount_dec(refcount_t *r) { __refcount_dec(r, NULL); -- cgit From 7cb415003468d41aecd6877ae088c38f6c0fc174 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Wed, 28 Oct 2020 06:56:47 -0400 Subject: Fonts: Make font size unsigned in font_desc `width` and `height` are defined as unsigned in our UAPI font descriptor `struct console_font`. Make them unsigned in our kernel font descriptor `struct font_desc`, too. Also, change the corresponding printk() format identifiers from `%d` to `%u`, in sti_select_fbfont(). Signed-off-by: Peilin Ye Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20201028105647.1210161-1-yepeilin.cs@gmail.com --- drivers/video/console/sticore.c | 2 +- include/linux/font.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/console/sticore.c b/drivers/video/console/sticore.c index 84c3ca37040a..fade32aa6737 100644 --- a/drivers/video/console/sticore.c +++ b/drivers/video/console/sticore.c @@ -506,7 +506,7 @@ sti_select_fbfont(struct sti_cooked_rom *cooked_rom, const char *fbfont_name) if (!fbfont) return NULL; - pr_info("STI selected %dx%d framebuffer font %s for sticon\n", + pr_info("STI selected %ux%u framebuffer font %s for sticon\n", fbfont->width, fbfont->height, fbfont->name); bpc = ((fbfont->width+7)/8) * fbfont->height; diff --git a/include/linux/font.h b/include/linux/font.h index 4a3f8741bb7e..b7214d7881f0 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -16,7 +16,7 @@ struct font_desc { int idx; const char *name; - int width, height; + unsigned int width, height; const void *data; int pref; }; -- cgit From 8073c1ac82c12aaf1b475a3ce5328d43b3eaa4ae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:11 +0100 Subject: genirq/msi: Allow shadow declarations of msi_msg:: $member Architectures like x86 have their MSI messages in various bits of the data, address_lo and address_hi field. Composing or decomposing these messages with bitmasks and shifts is possible, but unreadable gunk. Allow architectures to provide an architecture specific representation for each member of msi_msg. Provide empty defaults for each and stick them into an union. 
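For illustration only (this sketch is not part of the patch, and the bitfield names are hypothetical): an architecture opts in by defining a packed struct of the same width as the generic member, plus the matching macro, before the generic header is seen:

/*
 * Hypothetical arch-specific shadow of msi_msg::data. The field names
 * are made up for illustration; the only hard requirement is that the
 * struct stays exactly 32 bits wide so it can alias the generic
 * 'u32 data' member of the union.
 */
typedef struct arch_msi_msg_data {
	u32	vector		:  8,
		delivery_mode	:  3,
		reserved	: 21;
} __attribute__ ((packed)) arch_msi_msg_data_t;
#define arch_msi_msg_data	arch_msi_msg_data

Message composition code can then write msg.arch_data.vector = v instead of open-coding masks and shifts, while code that treats the message as opaque keeps reading the plain msg.data view.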
Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-12-dwmw2@infradead.org --- include/asm-generic/msi.h | 4 ++++ include/linux/msi.h | 46 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/msi.h b/include/asm-generic/msi.h index e6795f088bdd..25344de0e8f9 100644 --- a/include/asm-generic/msi.h +++ b/include/asm-generic/msi.h @@ -4,6 +4,8 @@ #include +#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN + #ifndef NUM_MSI_ALLOC_SCRATCHPAD_REGS # define NUM_MSI_ALLOC_SCRATCHPAD_REGS 2 #endif @@ -30,4 +32,6 @@ typedef struct msi_alloc_info { #define GENERIC_MSI_DOMAIN_OPS 1 +#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ + #endif diff --git a/include/linux/msi.h b/include/linux/msi.h index 6b584cc4757c..360a0a7e7341 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -4,11 +4,50 @@ #include #include +#include + +/* Dummy shadow structures if an architecture does not define them */ +#ifndef arch_msi_msg_addr_lo +typedef struct arch_msi_msg_addr_lo { + u32 address_lo; +} __attribute__ ((packed)) arch_msi_msg_addr_lo_t; +#endif + +#ifndef arch_msi_msg_addr_hi +typedef struct arch_msi_msg_addr_hi { + u32 address_hi; +} __attribute__ ((packed)) arch_msi_msg_addr_hi_t; +#endif + +#ifndef arch_msi_msg_data +typedef struct arch_msi_msg_data { + u32 data; +} __attribute__ ((packed)) arch_msi_msg_data_t; +#endif +/** + * msi_msg - Representation of a MSI message + * @address_lo: Low 32 bits of msi message address + * @arch_addrlo: Architecture specific shadow of @address_lo + * @address_hi: High 32 bits of msi message address + * (only used when device supports it) + * @arch_addrhi: Architecture specific shadow of @address_hi + * @data: MSI message data (usually 16 bits) + * @arch_data: Architecture specific shadow of @data + */ struct msi_msg { - u32 address_lo; /* low 32 bits of msi message address */ - u32 address_hi; /* high 32 bits of msi message address */ - u32 data; /* 16 bits of msi message data */ + union { + u32 address_lo; + arch_msi_msg_addr_lo_t arch_addr_lo; + }; + union { + u32 address_hi; + arch_msi_msg_addr_hi_t arch_addr_hi; + }; + union { + u32 data; + arch_msi_msg_data_t arch_data; + }; }; extern int pci_msi_ignore_mask; @@ -243,7 +282,6 @@ struct msi_controller { #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN #include -#include struct irq_domain; struct irq_domain_ops; -- cgit From cf83b2d2e2b64920bd6999b199dfa271d7e94cf8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Oct 2020 23:10:54 -0700 Subject: bpf: Permit cond_resched for some iterators Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with bpf task/task_file iterator") tries to fix rcu stalls warning which is caused by bpf task_file iterator when running "bpftool prog". rcu: INFO: rcu_sched self-detected stall on CPU rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913 \x09(t=21031 jiffies g=2534773 q=179750) NMI backtrace for cpu 7 CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019 Call Trace: dump_stack+0x57/0x70 nmi_cpu_backtrace.cold+0x14/0x53 ? lapic_can_unplug_cpu.cold+0x39/0x39 nmi_trigger_cpumask_backtrace+0xb7/0xc7 rcu_dump_cpu_stacks+0xa2/0xd0 rcu_sched_clock_irq.cold+0x1ff/0x3d9 ? 
tick_nohz_handler+0x100/0x100 update_process_times+0x5b/0x90 tick_sched_timer+0x5e/0xf0 __hrtimer_run_queues+0x12a/0x2a0 hrtimer_interrupt+0x10e/0x280 __sysvec_apic_timer_interrupt+0x51/0xe0 asm_call_on_stack+0xf/0x20 sysvec_apic_timer_interrupt+0x6f/0x80 ... task_file_seq_next+0x52/0xa0 bpf_seq_read+0xb9/0x320 vfs_read+0x9d/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x38/0x60 entry_SYSCALL_64_after_hwframe+0x44/0xa9

The fix is to limit the number of bpf program runs to one million. This fixed the problem in most cases. But we also found that under heavy load, which can increase the wallclock time of bpf_seq_read(), the warning may still show up. For example, calling the following bpf_delay() in the "while" loop of bpf_seq_read() to introduce an artificial delay makes the warning show up in my qemu run.

static unsigned q;
volatile unsigned *p = &q;
volatile unsigned long long ll;

static void bpf_delay(void)
{
	int i, j;

	for (i = 0; i < 10000; i++)
		for (j = 0; j < 10000; j++)
			ll += *p;
}

There are two ways to fix this issue. One is to reduce the above one million threshold to, say, 100,000 and hope the rcu warning no longer shows up. The other is to introduce a target feature which allows bpf_seq_read() to call cond_resched(). This patch takes the second approach, as the first one may cause more -EAGAIN failures for read() syscalls. Note that not all bpf_iter targets can permit cond_resched() in bpf_seq_read(): for some of them, e.g., the netlink seq iterator, an rcu read lock critical section spans seq_ops->next() -> seq_ops->show() -> seq_ops->next().

For a kernel with the above hack applied, "bpftool p" takes roughly 38 seconds to finish on my VM, with 184 bpf program runs. Using the following command, I am able to collect the number of context switches:

perf stat -e context-switches -- ./bpftool p >& log

Without this patch: 69 context-switches. With this patch: 75 context-switches. This patch adds 6 context switches, roughly one reschedule every 6 seconds, to avoid the long stretches without rescheduling that cause the above RCU warnings.
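To make the opt-in concrete, a hypothetical target whose seq_ops never hold an rcu read lock across ->next()/->show() could register like this (sketch only; the "example" target and example_seq_info are placeholders, not part of this patch):

/* Hypothetical iterator target opting into cond_resched() support. */
static const struct bpf_iter_seq_info example_seq_info;

static struct bpf_iter_reg example_reg_info = {
	.target		= "example",
	.feature	= BPF_ITER_RESCHED,	/* safe: no RCU held across next() */
	.seq_info	= &example_seq_info,
};

Targets that do span an rcu read lock critical section across seq_ops callbacks, like the netlink seq iterator, simply leave .feature at 0 and keep the old behavior.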
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com --- include/linux/bpf.h | 5 +++++ kernel/bpf/bpf_iter.c | 14 ++++++++++++++ kernel/bpf/task_iter.c | 2 ++ 3 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b16bf48aab6..2fffd30e13ac 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1294,6 +1294,10 @@ typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +enum bpf_iter_feature { + BPF_ITER_RESCHED = BIT(0), +}; + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; @@ -1302,6 +1306,7 @@ struct bpf_iter_reg { bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; u32 ctx_arg_info_size; + u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8f10e30ea0b0..5454161407f1 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -67,6 +67,15 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +static bool bpf_iter_support_resched(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED; +} + /* maximum visited objects before bailing out */ #define MAX_ITER_OBJECTS 1000000 @@ -83,6 +92,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; int err = 0, num_objs = 0; + bool can_resched; void *p; mutex_lock(&seq->lock); @@ -135,6 +145,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, goto done; } + can_resched = bpf_iter_support_resched(seq); while (1) { loff_t pos = seq->index; @@ -180,6 +191,9 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, } break; } + + if (can_resched) + cond_resched(); } stop: offs = seq->count; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 5b6af30bfbcd..1fdb2fc196cd 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -337,6 +337,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), @@ -354,6 +355,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__task_file, task), -- cgit From 5c251e9dc0e127bac6fc5b8e6696363d2e35f515 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Oct 2020 14:32:27 -0600 Subject: signal: Add task_sigpending() helper This is in preparation for maintaining signal_pending() as the decider of whether or not a schedule() loop should be broken, or continue sleeping. This is different than the core signal use cases, which really need to know whether an actual signal is pending or not. task_sigpending() returns non-zero if TIF_SIGPENDING is set. 
Only core kernel use cases should care about the distinction between the two, make sure those use the task_sigpending() helper. Signed-off-by: Jens Axboe Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20201026203230.386348-2-axboe@kernel.dk --- include/linux/sched/signal.h | 9 +++++++-- kernel/events/uprobes.c | 2 +- kernel/signal.c | 8 ++++---- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1bad18a1d8ba..404145dc536e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -353,11 +353,16 @@ static inline int restart_syscall(void) return -ERESTARTNOINTR; } -static inline int signal_pending(struct task_struct *p) +static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } +static inline int signal_pending(struct task_struct *p) +{ + return task_sigpending(p); +} + static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); @@ -365,7 +370,7 @@ static inline int __fatal_signal_pending(struct task_struct *p) static inline int fatal_signal_pending(struct task_struct *p) { - return signal_pending(p) && __fatal_signal_pending(p); + return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(long state, struct task_struct *p) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 649fd53dc9ad..edd0c985a939 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1973,7 +1973,7 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); - if (signal_pending(t)) { + if (task_sigpending(t)) { spin_lock_irq(&t->sighand->siglock); clear_tsk_thread_flag(t, TIF_SIGPENDING); spin_unlock_irq(&t->sighand->siglock); diff --git a/kernel/signal.c b/kernel/signal.c index 42b67d2cea37..b179eccc86d0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -983,7 +983,7 @@ static inline bool wants_signal(int sig, struct task_struct *p) if (task_is_stopped_or_traced(p)) return false; - return task_curr(p) || !signal_pending(p); + return task_curr(p) || !task_sigpending(p); } static void complete_signal(int sig, struct task_struct *p, enum pid_type type) @@ -2822,7 +2822,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) /* Remove the signals this thread can handle. */ sigandsets(&retarget, &retarget, &t->blocked); - if (!signal_pending(t)) + if (!task_sigpending(t)) signal_wake_up(t, 0); if (sigisemptyset(&retarget)) @@ -2856,7 +2856,7 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_end(tsk); - if (!signal_pending(tsk)) + if (!task_sigpending(tsk)) goto out; unblocked = tsk->blocked; @@ -2900,7 +2900,7 @@ long do_no_restart_syscall(struct restart_block *param) static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) { - if (signal_pending(tsk) && !thread_group_empty(tsk)) { + if (task_sigpending(tsk) && !thread_group_empty(tsk)) { sigset_t newblocked; /* A set of now blocked but previously unblocked signals. 
*/ sigandnsets(&newblocked, newset, ¤t->blocked); -- cgit From 12db8b690010ccfadf9d0b49a1e1798e47dbbe1a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Oct 2020 14:32:28 -0600 Subject: entry: Add support for TIF_NOTIFY_SIGNAL Add TIF_NOTIFY_SIGNAL handling in the generic entry code, which if set, will return true if signal_pending() is used in a wait loop. That causes an exit of the loop so that notify_signal tracehooks can be run. If the wait loop is currently inside a system call, the system call is restarted once task_work has been processed. In preparation for only having arch_do_signal() handle syscall restarts if _TIF_SIGPENDING isn't set, rename it to arch_do_signal_or_restart(). Pass in a boolean that tells the architecture specific signal handler if it should attempt to get a signal, or just process a potential syscall restart. For !CONFIG_GENERIC_ENTRY archs, add the TIF_NOTIFY_SIGNAL handling to get_signal(). This is done to minimize the needed architecture changes to support this feature. Signed-off-by: Jens Axboe Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20201026203230.386348-3-axboe@kernel.dk --- arch/x86/kernel/signal.c | 4 ++-- include/linux/entry-common.h | 11 ++++++++--- include/linux/entry-kvm.h | 4 ++-- include/linux/sched/signal.h | 11 ++++++++++- include/linux/tracehook.h | 27 +++++++++++++++++++++++++++ kernel/entry/common.c | 14 +++++++++++--- kernel/entry/kvm.c | 3 +++ kernel/signal.c | 14 ++++++++++++++ 8 files changed, 77 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d5fa494c2304..ec3b9c6e5a4c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -804,11 +804,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -void arch_do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { struct ksignal ksig; - if (get_signal(&ksig)) { + if (has_signal && get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); return; diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index efebbffcd5cc..c7bfac45f951 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -37,6 +37,10 @@ # define _TIF_UPROBE (0) #endif +#ifndef _TIF_NOTIFY_SIGNAL +# define _TIF_NOTIFY_SIGNAL (0) +#endif + /* * TIF flags handled in syscall_enter_from_usermode() */ @@ -69,7 +73,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \ + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ARCH_EXIT_TO_USER_MODE_WORK) /** @@ -226,12 +230,13 @@ static __always_inline void arch_exit_to_user_mode(void) { } #endif /** - * arch_do_signal - Architecture specific signal delivery function + * arch_do_signal_or_restart - Architecture specific signal delivery function * @regs: Pointer to currents pt_regs + * @has_signal: actual signal to handle * * Invoked from exit_to_user_mode_loop(). 
*/ -void arch_do_signal(struct pt_regs *regs); +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal); /** * arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit() diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 0cef17afb41a..9b93f8584ff7 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -11,8 +11,8 @@ # define ARCH_XFER_TO_GUEST_MODE_WORK (0) #endif -#define XFER_TO_GUEST_MODE_WORK \ - (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define XFER_TO_GUEST_MODE_WORK \ + (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) struct kvm_vcpu; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 404145dc536e..bd5afa076189 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -360,6 +360,15 @@ static inline int task_sigpending(struct task_struct *p) static inline int signal_pending(struct task_struct *p) { +#if defined(TIF_NOTIFY_SIGNAL) + /* + * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same + * behavior in terms of ensuring that we break out of wait loops + * so that notify signal callbacks can be processed. + */ + if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) + return 1; +#endif return task_sigpending(p); } @@ -507,7 +516,7 @@ extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) - WARN_ON(!test_thread_flag(TIF_SIGPENDING)); + WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 36fb3bbed6b2..1e8caca92e1f 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -198,4 +198,31 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) blkcg_maybe_throttle_current(); } +/* + * called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This + * is currently used by TWA_SIGNAL based task_work, which requires breaking + * wait loops to ensure that task_work is noticed and run. 
+ */ +static inline void tracehook_notify_signal(void) +{ +#if defined(TIF_NOTIFY_SIGNAL) + clear_thread_flag(TIF_NOTIFY_SIGNAL); + smp_mb__after_atomic(); + if (current->task_works) + task_work_run(); +#endif +} + +/* + * Called when we have work to process from exit_to_user_mode_loop() + */ +static inline void set_notify_signal(struct task_struct *task) +{ +#if defined(TIF_NOTIFY_SIGNAL) + if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && + !wake_up_state(task, TASK_INTERRUPTIBLE)) + kick_process(task); +#endif +} + #endif /* */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9852e0d62d95..42eff115c426 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -109,7 +109,15 @@ static __always_inline void exit_to_user_mode(void) } /* Workaround to allow gradual conversion of architecture code */ -void __weak arch_do_signal(struct pt_regs *regs) { } +void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } + +static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work) +{ + if (ti_work & _TIF_NOTIFY_SIGNAL) + tracehook_notify_signal(); + + arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING); +} static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) @@ -131,8 +139,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_PATCH_PENDING) klp_update_patch_state(current); - if (ti_work & _TIF_SIGPENDING) - arch_do_signal(regs); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + handle_signal_work(regs, ti_work); if (ti_work & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index eb1a8a4c867c..b828a3ddebf1 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -8,6 +8,9 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) do { int ret; + if (ti_work & _TIF_NOTIFY_SIGNAL) + tracehook_notify_signal(); + if (ti_work & _TIF_SIGPENDING) { kvm_handle_signal_exit(vcpu); return -EINTR; diff --git a/kernel/signal.c b/kernel/signal.c index b179eccc86d0..61b377e65c46 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2529,6 +2529,20 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; + /* + * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so + * that the arch handlers don't all have to do it. If we get here + * without TIF_SIGPENDING, just exit after running signal work. + */ +#ifdef TIF_NOTIFY_SIGNAL + if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) + tracehook_notify_signal(); + if (!task_sigpending(current)) + return false; + } +#endif + if (unlikely(uprobe_deny_signal())) return false; -- cgit From 5bc78502322a5e4eef3f1b2a2813751dc6434143 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 20 Oct 2020 09:47:13 -0400 Subject: sched: fix exit_mm vs membarrier (v4) exit_mm should issue memory barriers after user-space memory accesses, before clearing current->mm, to order user-space memory accesses performed prior to exit_mm before clearing tsk->mm, which has the effect of skipping the membarrier private expedited IPIs. exit_mm should also update the runqueue's membarrier_state so membarrier global expedited IPIs are not sent when they are not needed. The membarrier system call can be issued concurrently with do_exit if we have thread groups created with CLONE_VM but not CLONE_THREAD. 
Here is the scenario I have in mind: Two thread groups are created, A and B. Thread group B is created by issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD. Let's assume we have a single thread within each thread group (Thread A and Thread B). AFAIU we can have:

Userspace variables:

int x = 0, y = 0;

CPU 0                           CPU 1
Thread A                        Thread B
(in thread group A)             (in thread group B)

x = 1
barrier()
y = 1
exit()
exit_mm()
current->mm = NULL;
                                r1 = load y
                                membarrier()
                                  skips CPU 0 (no IPI) because its
                                  current mm is NULL
                                r2 = load x
                                BUG_ON(r1 == 1 && r2 == 0)

Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201020134715.13909-2-mathieu.desnoyers@efficios.com --- include/linux/sched/mm.h | 5 +++++ kernel/exit.c | 16 +++++++++++++++- kernel/sched/membarrier.c | 12 ++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index d5ece7a9a403..a91fb3ad9ec7 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) extern void membarrier_exec_mmap(struct mm_struct *mm); +extern void membarrier_update_current_mm(struct mm_struct *next_mm); + #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, @@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm) static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { } +static inline void membarrier_update_current_mm(struct mm_struct *next_mm) +{ +} #endif #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 87a2d515de0d..a3dd6b36f99a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -475,10 +475,24 @@ static void exit_mm(void) BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); + /* + * When a thread stops operating on an address space, the loop + * in membarrier_private_expedited() may not observe that + * tsk->mm, and the loop in membarrier_global_expedited() may + * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED + * rq->membarrier_state, so those would not issue an IPI. + * Membarrier requires a memory barrier after accessing + * user-space memory, before clearing tsk->mm or the + * rq->membarrier_state.
+ */ + smp_mb__after_spinlock(); + local_irq_disable(); current->mm = NULL; - mmap_read_unlock(mm); + membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); + local_irq_enable(); task_unlock(current); + mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index e23e74d52db5..aac329258af0 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -76,6 +76,18 @@ void membarrier_exec_mmap(struct mm_struct *mm) this_cpu_write(runqueues.membarrier_state, 0); } +void membarrier_update_current_mm(struct mm_struct *next_mm) +{ + struct rq *rq = this_rq(); + int membarrier_state = 0; + + if (next_mm) + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} + static int membarrier_global_expedited(void) { int cpu; -- cgit From 8d97e71811aaafe4abf611dc24822fd6e73df1a1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Oct 2020 06:57:46 -0700 Subject: perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE Current perf can report both virtual addresses and physical addresses, but not the MMU page size. Without the MMU page size of the utilized page, users cannot decide whether to promote/demote large pages to optimize memory usage. Add a new sample type for the data MMU page size. Current perf already has a facility to collect data virtual addresses. A page walker is required to walk the page tables and calculate the MMU page size from a given virtual address. On some platforms, e.g., X86, the page walker is invoked in an NMI handler, so it must be NMI-safe and low overhead. Besides, the page walker should work for both user and kernel virtual addresses. The existing generic page walker, e.g., walk_page_range_novma(), is a little bit complex and is not guaranteed to be NMI-safe, and follow_page() only handles user virtual addresses. Add a new function perf_get_page_size() to walk the page tables and calculate the MMU page size. In the function:

- Interrupts have to be disabled to prevent any teardown of the page tables.

- For user space threads, current->mm is used for the page walker. For kernel threads and the like, current->mm is NULL, so init_mm is used instead. The active_mm is not used here, because it can be NULL. Quote from Peter Zijlstra, "context_switch() can set prev->active_mm to NULL when it transfers it to @next. It does this before @current is updated. So an NMI that comes in between this active_mm swizzling and updating @current will see !active_mm."

- The MMU page size is calculated from the page table level. The method should work for all architectures, but it has only been verified on X86. Should there be an architecture which supports perf and where the method doesn't work, it can be fixed later separately. Reporting a wrong page size would not be fatal for the architecture.

Some features still under discussion may impact the method in the future. Quote from Dave Hansen, "There are lots of weird things folks are trying to do with the page tables, like Address Space Isolation. For instance, if you get a perf NMI when running userspace, current->mm->pgd is *different* than the PGD that was in use when userspace was running. It's close enough today, but it might not stay that way." If that case happens later, lots of consecutive page walk errors will happen.
The worst case is that lots of page-size '0' are returned, which would not be fatal. In the perf tool, a check is implemented to detect this case. Once it happens, a kernel patch could be implemented accordingly then. Suggested-by: Peter Zijlstra Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 4 +- kernel/events/core.c | 103 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0c19d279b97f..7e3785dd27d9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1034,6 +1034,7 @@ struct perf_sample_data { u64 phys_addr; u64 cgroup; + u64 data_page_size; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 077e7ee69e3d..cc6ea346e9f9 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -143,8 +143,9 @@ enum perf_event_sample_format { PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, - PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -896,6 +897,7 @@ enum perf_event_type { * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * { u64 size; * char data[size]; } && PERF_SAMPLE_AUX + * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index fb662eb4fb69..a796db2f3b57 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "internal.h" @@ -1894,6 +1895,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + size += sizeof(data->data_page_size); + event->header_size = size; } @@ -6938,6 +6942,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + perf_output_put(handle, data->data_page_size); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -6995,6 +7002,94 @@ static u64 perf_virt_to_phys(u64 virt) return phys_addr; } +#ifdef CONFIG_MMU + +/* + * Return the MMU page size of a given virtual address + */ +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + return 0; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_leaf(*p4d)) + return 1ULL << P4D_SHIFT; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return 0; + + if (pud_leaf(*pud)) + return 1ULL << PUD_SHIFT; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_leaf(*pmd)) + return 1ULL << PMD_SHIFT; + + pte = pte_offset_map(pmd, addr); + if (!pte_present(*pte)) { + pte_unmap(pte); + return 0; + } + + pte_unmap(pte); + return PAGE_SIZE; +} + +#else + +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +{ 
+ return 0; +} + +#endif + +static u64 perf_get_page_size(unsigned long addr) +{ + struct mm_struct *mm; + unsigned long flags; + u64 size; + + if (!addr) + return 0; + + /* + * Software page-table walkers must disable IRQs, + * which prevents any tear down of the page tables. + */ + local_irq_save(flags); + + mm = current->mm; + if (!mm) { + /* + * For kernel threads and the like, use init_mm so that + * we can find kernel memory. + */ + mm = &init_mm; + } + + size = __perf_get_page_size(mm, addr); + + local_irq_restore(flags); + + return size; +} + static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; struct perf_callchain_entry * @@ -7150,6 +7245,14 @@ void perf_prepare_sample(struct perf_event_header *header, } #endif + /* + * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't + * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, + * but the value will not dump to the userspace. + */ + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + data->data_page_size = perf_get_page_size(data->addr); + if (sample_type & PERF_SAMPLE_AUX) { u64 size; -- cgit From 995f088efebe1eba0282a6ffa12411b37f8990c2 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 1 Oct 2020 06:57:49 -0700 Subject: perf/core: Add support for PERF_SAMPLE_CODE_PAGE_SIZE When studying code layout, it is useful to capture the page size of the sampled code address. Add a new sample type for code page size. The new sample type requires collecting the ip. The code page size can be calculated from the NMI-safe perf_get_page_size(). For large PEBS, it's very unlikely that the mapping is gone for the earlier PEBS records. Enable the feature for the large PEBS. The worst case is that page-size '0' is returned. Signed-off-by: Kan Liang Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-5-kan.liang@linux.intel.com --- arch/x86/events/perf_event.h | 2 +- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 4 +++- kernel/events/core.c | 11 ++++++++++- 4 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ee2b9b9fc2a5..10032f023fcc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -132,7 +132,7 @@ struct amd_nb { PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ - PERF_SAMPLE_PERIOD) + PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE) #define PEBS_GP_REGS \ ((1ULL << PERF_REG_X86_AX) | \ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7e3785dd27d9..e533b03af053 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1035,6 +1035,7 @@ struct perf_sample_data { u64 phys_addr; u64 cgroup; u64 data_page_size; + u64 code_page_size; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index cc6ea346e9f9..c2f20ee3124d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -144,8 +144,9 @@ enum perf_event_sample_format { PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, - PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; 
@@ -898,6 +899,7 @@ enum perf_event_type { * { u64 size; * char data[size]; } && PERF_SAMPLE_AUX * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE + * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index a796db2f3b57..7f655d19b8c4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1898,6 +1898,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + size += sizeof(data->code_page_size); + event->header_size = size; } @@ -6945,6 +6948,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) perf_output_put(handle, data->data_page_size); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + perf_output_put(handle, data->code_page_size); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -7125,7 +7131,7 @@ void perf_prepare_sample(struct perf_event_header *header, __perf_event_header__init_id(header, data, event); - if (sample_type & PERF_SAMPLE_IP) + if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); if (sample_type & PERF_SAMPLE_CALLCHAIN) { @@ -7253,6 +7259,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) data->data_page_size = perf_get_page_size(data->addr); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + data->code_page_size = perf_get_page_size(data->ip); + if (sample_type & PERF_SAMPLE_AUX) { u64 size; -- cgit From 51b646b2d9f84d6ff6300e3c1d09f2be4329a424 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2020 11:09:27 +0200 Subject: perf,mm: Handle non-page-table-aligned hugetlbfs A limited number of architectures support hugetlbfs sizes that do not align with the page-tables (ARM64, Power, Sparc64). Add support for this to the generic perf_get_page_size() implementation, and also allow an architecture to override this implementation. The latter is only needed when it uses non-page-table aligned huge pages in its kernel map. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/perf_event.h | 4 ++++ kernel/events/core.c | 39 +++++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e533b03af053..0defb526cd0c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1590,4 +1590,8 @@ extern void __weak arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); +#ifdef CONFIG_MMU +extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr); +#endif + #endif /* _LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 7f655d19b8c4..b458ed3dc81b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7011,10 +7011,18 @@ static u64 perf_virt_to_phys(u64 virt) #ifdef CONFIG_MMU /* - * Return the MMU page size of a given virtual address + * Return the MMU page size of a given virtual address. + * + * This generic implementation handles page-table aligned huge pages, as well + * as non-page-table aligned hugetlbfs compound pages.
+ * + * If an architecture supports and uses non-page-table aligned pages in their + * kernel mapping it will need to provide it's own implementation of this + * function. */ -static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr) { + struct page *page; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -7036,15 +7044,27 @@ static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) if (!pud_present(*pud)) return 0; - if (pud_leaf(*pud)) + if (pud_leaf(*pud)) { +#ifdef pud_page + page = pud_page(*pud); + if (PageHuge(page)) + return page_size(compound_head(page)); +#endif return 1ULL << PUD_SHIFT; + } pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) return 0; - if (pmd_leaf(*pmd)) + if (pmd_leaf(*pmd)) { +#ifdef pmd_page + page = pmd_page(*pmd); + if (PageHuge(page)) + return page_size(compound_head(page)); +#endif return 1ULL << PMD_SHIFT; + } pte = pte_offset_map(pmd, addr); if (!pte_present(*pte)) { @@ -7052,13 +7072,20 @@ static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) return 0; } + page = pte_page(*pte); + if (PageHuge(page)) { + u64 size = page_size(compound_head(page)); + pte_unmap(pte); + return size; + } + pte_unmap(pte); return PAGE_SIZE; } #else -static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr) { return 0; } @@ -7089,7 +7116,7 @@ static u64 perf_get_page_size(unsigned long addr) mm = &init_mm; } - size = __perf_get_page_size(mm, addr); + size = arch_perf_get_page_size(mm, addr); local_irq_restore(flags); -- cgit From c6838eeef2fbc7e3e1f83759aa016ae6b70c643e Mon Sep 17 00:00:00 2001 From: "dmitry.torokhov@gmail.com" Date: Wed, 30 Sep 2020 15:47:13 -0700 Subject: HID: hid-input: occasionally report stylus battery even if not changed There are styluses that only report their battery status when they are touching the touchscreen; additionally we currently suppress battery reports if capacity has not changed. To help userspace recognize how long ago the device reported battery status, let's send the change event through if either capacity has changed, or at least 30 seconds have passed since last report we've let through. 
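The check below follows a common coarse-clock ratelimit pattern; pulled out into a standalone sketch (names hypothetical, assuming ordinary kernel process context), it reads:

#include <linux/ktime.h>

/* Hypothetical standalone form of the ratelimit: report when the value
 * changed or the 30 second window expired, re-arming the window on
 * every report that is let through.
 */
struct battery_report_state {
	int	capacity;
	ktime_t	next_report;	/* coarse time of next unconditional report */
};

static bool battery_should_report(struct battery_report_state *s, int capacity)
{
	if (capacity == s->capacity &&
	    !ktime_after(ktime_get_coarse(), s->next_report))
		return false;

	s->capacity = capacity;
	s->next_report = ktime_add_ms(ktime_get_coarse(), 30 * 1000);
	return true;
}

ktime_get_coarse() is used for the same reason as in the patch itself: last-tick resolution is plenty for a 30 second window and it avoids a hardware clock read in the input path.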
Signed-off-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 5 ++++- include/linux/hid.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 9770db624bfa..b0db2a2ac15a 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -534,9 +534,12 @@ static void hidinput_update_battery(struct hid_device *dev, int value) capacity = hidinput_scale_battery_capacity(dev, value); if (dev->battery_status != HID_BATTERY_REPORTED || - capacity != dev->battery_capacity) { + capacity != dev->battery_capacity || + ktime_after(ktime_get_coarse(), dev->battery_ratelimit_time)) { dev->battery_capacity = capacity; dev->battery_status = HID_BATTERY_REPORTED; + dev->battery_ratelimit_time = + ktime_add_ms(ktime_get_coarse(), 30 * 1000); power_supply_changed(dev->battery); } } diff --git a/include/linux/hid.h b/include/linux/hid.h index 58684657960b..b079146850e6 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -585,6 +585,7 @@ struct hid_device { /* device report descriptor */ __s32 battery_report_id; enum hid_battery_status battery_status; bool battery_avoid_query; + ktime_t battery_ratelimit_time; #endif unsigned long status; /* see STAT flags above */ -- cgit From f54ec58fee837ec847cb8b50593e81bfaa46107f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 27 Oct 2020 21:12:12 +0100 Subject: wimax: move out to staging There are no known users of this driver as of October 2020, and it will be removed unless someone turns out to still need it in future releases. According to https://en.wikipedia.org/wiki/List_of_WiMAX_networks, there have been many public wimax networks, but it appears that many of these have migrated to LTE or discontinued their service altogether. As most PCs and phones lack WiMAX hardware support, the remaining networks tend to use standalone routers. These almost certainly run Linux, but not a modern kernel or the mainline wimax driver stack. NetworkManager appears to have dropped userspace support in 2015 https://bugzilla.gnome.org/show_bug.cgi?id=747846, the www.linuxwimax.org site had already shut down earlier. WiMax is apparently still being deployed on airport campus networks ("AeroMACS"), but in a frequency band that was not supported by the old Intel 2400m (used in Sandy Bridge laptops and earlier), which is the only driver using the kernel's wimax stack. Move all files into drivers/staging/wimax, including the uapi header files and documentation, to make it easier to remove it when it gets to that. Only minimal changes are made to the source files, in order to make it possible to port patches across the move. Also remove the MAINTAINERS entry that refers to a broken mailing list and website. 
Acked-by: Jakub Kicinski Acked-by: Greg Kroah-Hartman Acked-By: Inaky Perez-Gonzalez Acked-by: Johannes Berg Suggested-by: Inaky Perez-Gonzalez Signed-off-by: Arnd Bergmann --- Documentation/admin-guide/index.rst | 1 - Documentation/admin-guide/wimax/i2400m.rst | 283 ---- Documentation/admin-guide/wimax/index.rst | 19 - Documentation/admin-guide/wimax/wimax.rst | 89 -- Documentation/networking/kapi.rst | 21 - .../translations/zh_CN/admin-guide/index.rst | 1 - MAINTAINERS | 22 - drivers/net/Kconfig | 2 - drivers/net/Makefile | 1 - drivers/net/wimax/Kconfig | 18 - drivers/net/wimax/Makefile | 2 - drivers/net/wimax/i2400m/Kconfig | 37 - drivers/net/wimax/i2400m/Makefile | 23 - drivers/net/wimax/i2400m/control.c | 1434 ----------------- drivers/net/wimax/i2400m/debug-levels.h | 32 - drivers/net/wimax/i2400m/debugfs.c | 253 --- drivers/net/wimax/i2400m/driver.c | 1002 ------------ drivers/net/wimax/i2400m/fw.c | 1653 -------------------- drivers/net/wimax/i2400m/i2400m-usb.h | 275 ---- drivers/net/wimax/i2400m/i2400m.h | 970 ------------ drivers/net/wimax/i2400m/netdev.c | 603 ------- drivers/net/wimax/i2400m/op-rfkill.c | 196 --- drivers/net/wimax/i2400m/rx.c | 1395 ----------------- drivers/net/wimax/i2400m/sysfs.c | 65 - drivers/net/wimax/i2400m/tx.c | 1011 ------------ drivers/net/wimax/i2400m/usb-debug-levels.h | 28 - drivers/net/wimax/i2400m/usb-fw.c | 365 ----- drivers/net/wimax/i2400m/usb-notif.c | 258 --- drivers/net/wimax/i2400m/usb-rx.c | 462 ------ drivers/net/wimax/i2400m/usb-tx.c | 273 ---- drivers/net/wimax/i2400m/usb.c | 764 --------- drivers/staging/Kconfig | 2 + drivers/staging/Makefile | 1 + drivers/staging/wimax/Documentation/i2400m.rst | 283 ++++ drivers/staging/wimax/Documentation/index.rst | 19 + drivers/staging/wimax/Documentation/wimax.rst | 89 ++ drivers/staging/wimax/Kconfig | 46 + drivers/staging/wimax/Makefile | 15 + drivers/staging/wimax/TODO | 18 + drivers/staging/wimax/debug-levels.h | 29 + drivers/staging/wimax/debugfs.c | 38 + drivers/staging/wimax/i2400m/Kconfig | 37 + drivers/staging/wimax/i2400m/Makefile | 23 + drivers/staging/wimax/i2400m/control.c | 1434 +++++++++++++++++ drivers/staging/wimax/i2400m/debug-levels.h | 32 + drivers/staging/wimax/i2400m/debugfs.c | 253 +++ drivers/staging/wimax/i2400m/driver.c | 1002 ++++++++++++ drivers/staging/wimax/i2400m/fw.c | 1653 ++++++++++++++++++++ drivers/staging/wimax/i2400m/i2400m-usb.h | 275 ++++ drivers/staging/wimax/i2400m/i2400m.h | 970 ++++++++++++ drivers/staging/wimax/i2400m/linux-wimax-i2400m.h | 572 +++++++ drivers/staging/wimax/i2400m/netdev.c | 603 +++++++ drivers/staging/wimax/i2400m/op-rfkill.c | 196 +++ drivers/staging/wimax/i2400m/rx.c | 1395 +++++++++++++++++ drivers/staging/wimax/i2400m/sysfs.c | 65 + drivers/staging/wimax/i2400m/tx.c | 1011 ++++++++++++ drivers/staging/wimax/i2400m/usb-debug-levels.h | 28 + drivers/staging/wimax/i2400m/usb-fw.c | 365 +++++ drivers/staging/wimax/i2400m/usb-notif.c | 258 +++ drivers/staging/wimax/i2400m/usb-rx.c | 462 ++++++ drivers/staging/wimax/i2400m/usb-tx.c | 273 ++++ drivers/staging/wimax/i2400m/usb.c | 764 +++++++++ drivers/staging/wimax/id-table.c | 130 ++ drivers/staging/wimax/linux-wimax-debug.h | 491 ++++++ drivers/staging/wimax/linux-wimax.h | 239 +++ drivers/staging/wimax/net-wimax.h | 503 ++++++ drivers/staging/wimax/op-msg.c | 391 +++++ drivers/staging/wimax/op-reset.c | 108 ++ drivers/staging/wimax/op-rfkill.c | 431 +++++ drivers/staging/wimax/op-state-get.c | 52 + drivers/staging/wimax/stack.c | 616 ++++++++ drivers/staging/wimax/wimax-internal.h | 
85 + include/linux/wimax/debug.h | 491 ------ include/net/wimax.h | 503 ------ include/uapi/linux/wimax.h | 239 --- include/uapi/linux/wimax/i2400m.h | 572 ------- net/Kconfig | 2 - net/Makefile | 1 - net/wimax/Kconfig | 40 - net/wimax/Makefile | 13 - net/wimax/debug-levels.h | 29 - net/wimax/debugfs.c | 38 - net/wimax/id-table.c | 130 -- net/wimax/op-msg.c | 391 ----- net/wimax/op-reset.c | 108 -- net/wimax/op-rfkill.c | 431 ----- net/wimax/op-state-get.c | 52 - net/wimax/stack.c | 616 -------- net/wimax/wimax-internal.h | 85 - 89 files changed, 15257 insertions(+), 15299 deletions(-) delete mode 100644 Documentation/admin-guide/wimax/i2400m.rst delete mode 100644 Documentation/admin-guide/wimax/index.rst delete mode 100644 Documentation/admin-guide/wimax/wimax.rst delete mode 100644 drivers/net/wimax/Kconfig delete mode 100644 drivers/net/wimax/Makefile delete mode 100644 drivers/net/wimax/i2400m/Kconfig delete mode 100644 drivers/net/wimax/i2400m/Makefile delete mode 100644 drivers/net/wimax/i2400m/control.c delete mode 100644 drivers/net/wimax/i2400m/debug-levels.h delete mode 100644 drivers/net/wimax/i2400m/debugfs.c delete mode 100644 drivers/net/wimax/i2400m/driver.c delete mode 100644 drivers/net/wimax/i2400m/fw.c delete mode 100644 drivers/net/wimax/i2400m/i2400m-usb.h delete mode 100644 drivers/net/wimax/i2400m/i2400m.h delete mode 100644 drivers/net/wimax/i2400m/netdev.c delete mode 100644 drivers/net/wimax/i2400m/op-rfkill.c delete mode 100644 drivers/net/wimax/i2400m/rx.c delete mode 100644 drivers/net/wimax/i2400m/sysfs.c delete mode 100644 drivers/net/wimax/i2400m/tx.c delete mode 100644 drivers/net/wimax/i2400m/usb-debug-levels.h delete mode 100644 drivers/net/wimax/i2400m/usb-fw.c delete mode 100644 drivers/net/wimax/i2400m/usb-notif.c delete mode 100644 drivers/net/wimax/i2400m/usb-rx.c delete mode 100644 drivers/net/wimax/i2400m/usb-tx.c delete mode 100644 drivers/net/wimax/i2400m/usb.c create mode 100644 drivers/staging/wimax/Documentation/i2400m.rst create mode 100644 drivers/staging/wimax/Documentation/index.rst create mode 100644 drivers/staging/wimax/Documentation/wimax.rst create mode 100644 drivers/staging/wimax/Kconfig create mode 100644 drivers/staging/wimax/Makefile create mode 100644 drivers/staging/wimax/TODO create mode 100644 drivers/staging/wimax/debug-levels.h create mode 100644 drivers/staging/wimax/debugfs.c create mode 100644 drivers/staging/wimax/i2400m/Kconfig create mode 100644 drivers/staging/wimax/i2400m/Makefile create mode 100644 drivers/staging/wimax/i2400m/control.c create mode 100644 drivers/staging/wimax/i2400m/debug-levels.h create mode 100644 drivers/staging/wimax/i2400m/debugfs.c create mode 100644 drivers/staging/wimax/i2400m/driver.c create mode 100644 drivers/staging/wimax/i2400m/fw.c create mode 100644 drivers/staging/wimax/i2400m/i2400m-usb.h create mode 100644 drivers/staging/wimax/i2400m/i2400m.h create mode 100644 drivers/staging/wimax/i2400m/linux-wimax-i2400m.h create mode 100644 drivers/staging/wimax/i2400m/netdev.c create mode 100644 drivers/staging/wimax/i2400m/op-rfkill.c create mode 100644 drivers/staging/wimax/i2400m/rx.c create mode 100644 drivers/staging/wimax/i2400m/sysfs.c create mode 100644 drivers/staging/wimax/i2400m/tx.c create mode 100644 drivers/staging/wimax/i2400m/usb-debug-levels.h create mode 100644 drivers/staging/wimax/i2400m/usb-fw.c create mode 100644 drivers/staging/wimax/i2400m/usb-notif.c create mode 100644 drivers/staging/wimax/i2400m/usb-rx.c create mode 100644 drivers/staging/wimax/i2400m/usb-tx.c 
create mode 100644 drivers/staging/wimax/i2400m/usb.c create mode 100644 drivers/staging/wimax/id-table.c create mode 100644 drivers/staging/wimax/linux-wimax-debug.h create mode 100644 drivers/staging/wimax/linux-wimax.h create mode 100644 drivers/staging/wimax/net-wimax.h create mode 100644 drivers/staging/wimax/op-msg.c create mode 100644 drivers/staging/wimax/op-reset.c create mode 100644 drivers/staging/wimax/op-rfkill.c create mode 100644 drivers/staging/wimax/op-state-get.c create mode 100644 drivers/staging/wimax/stack.c create mode 100644 drivers/staging/wimax/wimax-internal.h delete mode 100644 include/linux/wimax/debug.h delete mode 100644 include/net/wimax.h delete mode 100644 include/uapi/linux/wimax.h delete mode 100644 include/uapi/linux/wimax/i2400m.h delete mode 100644 net/wimax/Kconfig delete mode 100644 net/wimax/Makefile delete mode 100644 net/wimax/debug-levels.h delete mode 100644 net/wimax/debugfs.c delete mode 100644 net/wimax/id-table.c delete mode 100644 net/wimax/op-msg.c delete mode 100644 net/wimax/op-reset.c delete mode 100644 net/wimax/op-rfkill.c delete mode 100644 net/wimax/op-state-get.c delete mode 100644 net/wimax/stack.c delete mode 100644 net/wimax/wimax-internal.h (limited to 'include/linux') diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index ed1cf94ea50c..d53986a424c4 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -115,7 +115,6 @@ configure specific aspects of kernel behavior to your liking. unicode vga-softcursor video-output - wimax/index xfs .. only:: subproject and html diff --git a/Documentation/admin-guide/wimax/i2400m.rst b/Documentation/admin-guide/wimax/i2400m.rst deleted file mode 100644 index 194388c0c351..000000000000 --- a/Documentation/admin-guide/wimax/i2400m.rst +++ /dev/null @@ -1,283 +0,0 @@ -.. include:: - -==================================================== -Driver for the Intel Wireless Wimax Connection 2400m -==================================================== - -:Copyright: |copy| 2008 Intel Corporation < linux-wimax@intel.com > - - This provides a driver for the Intel Wireless WiMAX Connection 2400m - and a basic Linux kernel WiMAX stack. - -1. Requirements -=============== - - * Linux installation with Linux kernel 2.6.22 or newer (if building - from a separate tree) - * Intel i2400m Echo Peak or Baxter Peak; this includes the Intel - Wireless WiMAX/WiFi Link 5x50 series. - * build tools: - - + Linux kernel development package for the target kernel; to - build against your currently running kernel, you need to have - the kernel development package corresponding to the running - image installed (usually if your kernel is named - linux-VERSION, the development package is called - linux-dev-VERSION or linux-headers-VERSION). - + GNU C Compiler, make - -2. Compilation and installation -=============================== - -2.1. Compilation of the drivers included in the kernel ------------------------------------------------------- - - Configure the kernel; to enable the WiMAX drivers select Drivers > - Networking Drivers > WiMAX device support. Enable all of them as - modules (easier). - - If USB or SDIO are not enabled in the kernel configuration, the options - to build the i2400m USB or SDIO drivers will not show. Enable said - subsystems and go back to the WiMAX menu to enable the drivers. - - Compile and install your kernel as usual. - -2.2. 
Compilation of the drivers distributed as an standalone module -------------------------------------------------------------------- - - To compile:: - - $ cd source/directory - $ make - - Once built you can load and unload using the provided load.sh script; - load.sh will load the modules, load.sh u will unload them. - - To install in the default kernel directories (and enable auto loading - when the device is plugged):: - - $ make install - $ depmod -a - - If your kernel development files are located in a non standard - directory or if you want to build for a kernel that is not the - currently running one, set KDIR to the right location:: - - $ make KDIR=/path/to/kernel/dev/tree - - For more information, please contact linux-wimax@intel.com. - -3. Installing the firmware --------------------------- - - The firmware can be obtained from http://linuxwimax.org or might have - been supplied with your hardware. - - It has to be installed in the target system:: - - $ cp FIRMWAREFILE.sbcf /lib/firmware/i2400m-fw-BUSTYPE-1.3.sbcf - - * NOTE: if your firmware came in an .rpm or .deb file, just install - it as normal, with the rpm (rpm -i FIRMWARE.rpm) or dpkg - (dpkg -i FIRMWARE.deb) commands. No further action is needed. - * BUSTYPE will be usb or sdio, depending on the hardware you have. - Each hardware type comes with its own firmware and will not work - with other types. - -4. Design -========= - - This package contains two major parts: a WiMAX kernel stack and a - driver for the Intel i2400m. - - The WiMAX stack is designed to provide for common WiMAX control - services to current and future WiMAX devices from any vendor; please - see README.wimax for details. - - The i2400m kernel driver is broken up in two main parts: the bus - generic driver and the bus-specific drivers. The bus generic driver - forms the drivercore and contain no knowledge of the actual method we - use to connect to the device. The bus specific drivers are just the - glue to connect the bus-generic driver and the device. Currently only - USB and SDIO are supported. See drivers/net/wimax/i2400m/i2400m.h for - more information. - - The bus generic driver is logically broken up in two parts: OS-glue and - hardware-glue. The OS-glue interfaces with Linux. The hardware-glue - interfaces with the device on using an interface provided by the - bus-specific driver. The reason for this breakup is to be able to - easily reuse the hardware-glue to write drivers for other OSes; note - the hardware glue part is written as a native Linux driver; no - abstraction layers are used, so to port to another OS, the Linux kernel - API calls should be replaced with the target OS's. - -5. Usage -======== - - To load the driver, follow the instructions in the install section; - once the driver is loaded, plug in the device (unless it is permanently - plugged in). The driver will enumerate the device, upload the firmware - and output messages in the kernel log (dmesg, /var/log/messages or - /var/log/kern.log) such as:: - - ... - i2400m_usb 5-4:1.0: firmware interface version 8.0.0 - i2400m_usb 5-4:1.0: WiMAX interface wmx0 (00:1d:e1:01:94:2c) ready - - At this point the device is ready to work. - - Current versions require the Intel WiMAX Network Service in userspace - to make things work. See the network service's README for instructions - on how to scan, connect and disconnect. - -5.1. 
Module parameters ----------------------- - - Module parameters can be set at kernel or module load time or by - echoing values:: - - $ echo VALUE > /sys/module/MODULENAME/parameters/PARAMETERNAME - - To make changes permanent, for example, for the i2400m module, you can - also create a file named /etc/modprobe.d/i2400m containing:: - - options i2400m idle_mode_disabled=1 - - To find which parameters are supported by a module, run:: - - $ modinfo path/to/module.ko - - During kernel bootup (if the driver is linked in the kernel), specify - the following to the kernel command line:: - - i2400m.PARAMETER=VALUE - -5.1.1. i2400m: idle_mode_disabled -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - The i2400m module supports a parameter to disable idle mode. This - parameter, once set, will take effect only when the device is - reinitialized by the driver (eg: following a reset or a reconnect). - -5.2. Debug operations: debugfs entries --------------------------------------- - - The driver will register debugfs entries that allow the user to tweak - debug settings. There are three main container directories where - entries are placed, which correspond to the three blocks a i2400m WiMAX - driver has: - - * /sys/kernel/debug/wimax:DEVNAME/ for the generic WiMAX stack - controls - * /sys/kernel/debug/wimax:DEVNAME/i2400m for the i2400m generic - driver controls - * /sys/kernel/debug/wimax:DEVNAME/i2400m-usb (or -sdio) for the - bus-specific i2400m-usb or i2400m-sdio controls). - - Of course, if debugfs is mounted in a directory other than - /sys/kernel/debug, those paths will change. - -5.2.1. Increasing debug output -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - The files named *dl_* indicate knobs for controlling the debug output - of different submodules:: - - # find /sys/kernel/debug/wimax\:wmx0 -name \*dl_\* - /sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_tx - /sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_rx - /sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_notif - /sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_fw - /sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_usb - /sys/kernel/debug/wimax:wmx0/i2400m/dl_tx - /sys/kernel/debug/wimax:wmx0/i2400m/dl_rx - /sys/kernel/debug/wimax:wmx0/i2400m/dl_rfkill - /sys/kernel/debug/wimax:wmx0/i2400m/dl_netdev - /sys/kernel/debug/wimax:wmx0/i2400m/dl_fw - /sys/kernel/debug/wimax:wmx0/i2400m/dl_debugfs - /sys/kernel/debug/wimax:wmx0/i2400m/dl_driver - /sys/kernel/debug/wimax:wmx0/i2400m/dl_control - /sys/kernel/debug/wimax:wmx0/wimax_dl_stack - /sys/kernel/debug/wimax:wmx0/wimax_dl_op_rfkill - /sys/kernel/debug/wimax:wmx0/wimax_dl_op_reset - /sys/kernel/debug/wimax:wmx0/wimax_dl_op_msg - /sys/kernel/debug/wimax:wmx0/wimax_dl_id_table - /sys/kernel/debug/wimax:wmx0/wimax_dl_debugfs - - By reading the file you can obtain the current value of said debug - level; by writing to it, you can set it. - - To increase the debug level of, for example, the i2400m's generic TX - engine, just write:: - - $ echo 3 > /sys/kernel/debug/wimax:wmx0/i2400m/dl_tx - - Increasing numbers yield increasing debug information; for details of - what is printed and the available levels, check the source. The code - uses 0 for disabled and increasing values until 8. - -5.2.2. 
RX and TX statistics
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-  The i2400m/rx_stats and i2400m/tx_stats files provide statistics
-  about the data reception/delivery from the device::
-
-	$ cat /sys/kernel/debug/wimax:wmx0/i2400m/rx_stats
-	45 1 3 34 3104 48 480
-
-  The numbers reported are:
-
-  * packets/RX-buffer: total, min, max
-  * RX-buffers: total RX buffers received, accumulated RX buffer size
-    in bytes, min size received, max size received
-
-  Thus, to find the average buffer size received, divide the
-  accumulated RX-buffer size by the total number of RX-buffers.
-
-  To clear the statistics back to 0, write anything to the rx_stats
-  file::
-
-	$ echo 1 > /sys/kernel/debug/wimax:wmx0/i2400m/rx_stats
-
-  Likewise for TX.
-
-  Note the packets this debug file refers to are not network packets,
-  but packets in the sense of the device-specific protocol for
-  communication to the host. See drivers/net/wimax/i2400m/tx.c.
-
-5.2.3. Tracing messages received from user space
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-  To echo messages received from user space into the trace pipe that
-  the i2400m driver creates, set the debug file
-  i2400m/trace_msg_from_user to 1::
-
-	$ echo 1 > /sys/kernel/debug/wimax:wmx0/i2400m/trace_msg_from_user
-
-5.2.4. Performing a device reset
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-  By writing a 0, a 1 or a 2 to the file
-  /sys/kernel/debug/wimax:wmx0/reset, the driver performs a warm
-  (without disconnecting from the bus), cold (disconnecting from the
-  bus) or bus (bus specific) reset on the device.
-
-5.2.5. Asking the device to enter power saving mode
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-  Writing any value to the i2400m/suspend file under
-  /sys/kernel/debug/wimax:wmx0 asks the device to enter power saving
-  mode.
-
-6. Troubleshooting
-==================
-
-6.1. Driver complains about ``i2400m-fw-usb-1.3.sbcf: request failed``
-----------------------------------------------------------------------
-
-  If upon connecting the device, the following is output in the kernel
-  log::
-
-	i2400m_usb 5-4:1.0: fw i2400m-fw-usb-1.3.sbcf: request failed: -2
-
-  This means that the driver cannot locate the firmware file named
-  /lib/firmware/i2400m-fw-usb-1.3.sbcf. Check that the file is present
-  in the right location.
diff --git a/Documentation/admin-guide/wimax/index.rst b/Documentation/admin-guide/wimax/index.rst
deleted file mode 100644
index fdf7c1f99ff5..000000000000
--- a/Documentation/admin-guide/wimax/index.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============
-WiMAX subsystem
-===============
-
-.. toctree::
-   :maxdepth: 2
-
-   wimax
-
-   i2400m
-
-.. only:: subproject and html
-
-   Indices
-   =======
-
-   * :ref:`genindex`
diff --git a/Documentation/admin-guide/wimax/wimax.rst b/Documentation/admin-guide/wimax/wimax.rst
deleted file mode 100644
index 817ee8ba2732..000000000000
--- a/Documentation/admin-guide/wimax/wimax.rst
+++ /dev/null
@@ -1,89 +0,0 @@
-.. include::
-
-========================
-Linux kernel WiMAX stack
-========================
-
-:Copyright: |copy| 2008 Intel Corporation < linux-wimax@intel.com >
-
-   This provides a basic Linux kernel WiMAX stack that offers a common
-   control API for WiMAX devices, usable from kernel and user space.
-
-1. Design
-=========
-
-  The WiMAX stack is designed to provide for common WiMAX control
-  services to current and future WiMAX devices from any vendor.
-
-  Because currently there is only one device and we don't yet know what
-  the common services will be, the APIs it currently provides are very
-  minimal.
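-
-  As a rough sketch of that minimal surface (hypothetical driver code;
-  the wimax_dev_init()/wimax_dev_add() calls and the op_* callback
-  names come from the stack headers, everything else is made up)::
-
-	struct my_wimax_device {
-		struct wimax_dev wimax_dev;	/* embed the stack state */
-		/* ... device-specific state ... */
-	};
-
-	/* at probe time */
-	wimax_dev_init(&my->wimax_dev);
-	my->wimax_dev.op_msg_from_user = my_op_msg_from_user;
-	my->wimax_dev.op_rfkill_sw_toggle = my_op_rfkill_sw_toggle;
-	result = wimax_dev_add(&my->wimax_dev, net_dev);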
-  The API is nonetheless designed so that it is easily extensible to
-  accommodate future requirements.
-
-  The stack works by embedding a struct wimax_dev in your device's
-  control structures. This provides a set of callbacks that the WiMAX
-  stack will call in order to implement control operations requested by
-  the user. As well, the stack provides API functions that the driver
-  calls to notify about changes of state in the device.
-
-  The stack exports the API calls needed to control the device to user
-  space using generic netlink as a marshalling mechanism. You can
-  access them using your own code or use the wrappers provided for your
-  convenience in libwimax (in the wimax-tools package).
-
-  For detailed information on the stack, please see
-  include/linux/wimax.h.
-
-2. Usage
-========
-
-  For usage in a driver (registration, API, etc.) please refer to the
-  instructions in the header file include/linux/wimax.h.
-
-  When a device is registered with the WiMAX stack, a set of debugfs
-  files will appear in /sys/kernel/debug/wimax:wmxX that the user can
-  tweak for control.
-
-2.1. Obtaining debug information: debugfs entries
--------------------------------------------------
-
-  The WiMAX stack is compiled, by default, with debug messages that can
-  be used to diagnose issues. By default, said messages are disabled.
-
-  The drivers will register debugfs entries that allow the user to
-  tweak debug settings.
-
-  Each driver, when registering with the stack, will cause a debugfs
-  directory named wimax:DEVICENAME to be created; optionally, it might
-  create more subentries below it.
-
-2.1.1. Increasing debug output
-------------------------------
-
-  The files named *dl_* indicate knobs for controlling the debug output
-  of different submodules of the WiMAX stack::
-
-	# find /sys/kernel/debug/wimax\:wmx0 -name \*dl_\*
-	/sys/kernel/debug/wimax:wmx0/wimax_dl_stack
-	/sys/kernel/debug/wimax:wmx0/wimax_dl_op_rfkill
-	/sys/kernel/debug/wimax:wmx0/wimax_dl_op_reset
-	/sys/kernel/debug/wimax:wmx0/wimax_dl_op_msg
-	/sys/kernel/debug/wimax:wmx0/wimax_dl_id_table
-	/sys/kernel/debug/wimax:wmx0/wimax_dl_debugfs
-	/sys/kernel/debug/wimax:wmx0/....	# other driver specific files
-
-  NOTE:
-     Of course, if debugfs is mounted in a directory other than
-     /sys/kernel/debug, those paths will change.
-
-  By reading the file you can obtain the current value of said debug
-  level; by writing to it, you can set it.
-
-  To increase the debug level of, for example, the id-table submodule,
-  just write::
-
-	$ echo 3 > /sys/kernel/debug/wimax:wmx0/wimax_dl_id_table
-
-  Increasing numbers yield increasing debug information; for details of
-  what is printed and the available levels, check the source. The code
-  uses 0 for disabled and increasing values until 8.
diff --git a/Documentation/networking/kapi.rst b/Documentation/networking/kapi.rst
index d198fa5eaacd..ea55f462cefa 100644
--- a/Documentation/networking/kapi.rst
+++ b/Documentation/networking/kapi.rst
@@ -83,27 +83,6 @@ SUN RPC subsystem
 .. kernel-doc:: net/sunrpc/clnt.c
    :export:
 
-WiMAX
------
-
-.. kernel-doc:: net/wimax/op-msg.c
-   :export:
-
-.. kernel-doc:: net/wimax/op-reset.c
-   :export:
-
-.. kernel-doc:: net/wimax/op-rfkill.c
-   :export:
-
-.. kernel-doc:: net/wimax/stack.c
-   :export:
-
-.. kernel-doc:: include/net/wimax.h
-   :internal:
-
-..
kernel-doc:: include/uapi/linux/wimax.h - :internal: - Network device support ====================== diff --git a/Documentation/translations/zh_CN/admin-guide/index.rst b/Documentation/translations/zh_CN/admin-guide/index.rst index ed5ab7e37f38..48bbd3ebad48 100644 --- a/Documentation/translations/zh_CN/admin-guide/index.rst +++ b/Documentation/translations/zh_CN/admin-guide/index.rst @@ -114,7 +114,6 @@ Todolist: unicode vga-softcursor video-output - wimax/index xfs .. only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index e73636b75f29..17f5571788c9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9106,16 +9106,6 @@ W: https://wireless.wiki.kernel.org/en/users/drivers/iwlwifi T: git git://git.kernel.org/pub/scm/linux/kernel/git/iwlwifi/iwlwifi.git F: drivers/net/wireless/intel/iwlwifi/ -INTEL WIRELESS WIMAX CONNECTION 2400 -M: Inaky Perez-Gonzalez -M: linux-wimax@intel.com -L: wimax@linuxwimax.org (subscribers-only) -S: Supported -W: http://linuxwimax.org -F: Documentation/admin-guide/wimax/i2400m.rst -F: drivers/net/wimax/i2400m/ -F: include/uapi/linux/wimax/i2400m.h - INTEL WMI SLIM BOOTLOADER (SBL) FIRMWARE UPDATE DRIVER M: Jithu Joseph R: Maurice Ma @@ -18907,18 +18897,6 @@ S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/wil6210 F: drivers/net/wireless/ath/wil6210/ -WIMAX STACK -M: Inaky Perez-Gonzalez -M: linux-wimax@intel.com -L: wimax@linuxwimax.org (subscribers-only) -S: Supported -W: http://linuxwimax.org -F: Documentation/admin-guide/wimax/wimax.rst -F: include/linux/wimax/debug.h -F: include/net/wimax.h -F: include/uapi/linux/wimax.h -F: net/wimax/ - WINBOND CIR DRIVER M: David Härdeman S: Maintained diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index c3dbe64e628e..c0af2dc8b938 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -489,8 +489,6 @@ source "drivers/net/usb/Kconfig" source "drivers/net/wireless/Kconfig" -source "drivers/net/wimax/Kconfig" - source "drivers/net/wan/Kconfig" source "drivers/net/ieee802154/Kconfig" diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 72e18d505d1a..b27e8633c305 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -66,7 +66,6 @@ obj-$(CONFIG_NET_SB1000) += sb1000.o obj-$(CONFIG_SUNGEM_PHY) += sungem_phy.o obj-$(CONFIG_WAN) += wan/ obj-$(CONFIG_WLAN) += wireless/ -obj-$(CONFIG_WIMAX) += wimax/ obj-$(CONFIG_IEEE802154) += ieee802154/ obj-$(CONFIG_VMXNET3) += vmxnet3/ diff --git a/drivers/net/wimax/Kconfig b/drivers/net/wimax/Kconfig deleted file mode 100644 index 2249e3d77a76..000000000000 --- a/drivers/net/wimax/Kconfig +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# WiMAX LAN device drivers configuration -# - - -comment "Enable WiMAX (Networking options) to see the WiMAX drivers" - depends on WIMAX = n - -if WIMAX - -menu "WiMAX Wireless Broadband devices" - -source "drivers/net/wimax/i2400m/Kconfig" - -endmenu - -endif diff --git a/drivers/net/wimax/Makefile b/drivers/net/wimax/Makefile deleted file mode 100644 index b4575bacf994..000000000000 --- a/drivers/net/wimax/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_WIMAX_I2400M) += i2400m/ diff --git a/drivers/net/wimax/i2400m/Kconfig b/drivers/net/wimax/i2400m/Kconfig deleted file mode 100644 index 843b905a26a3..000000000000 --- a/drivers/net/wimax/i2400m/Kconfig +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config WIMAX_I2400M - tristate - depends on WIMAX - select FW_LOADER - -comment "Enable USB support to 
see WiMAX USB drivers" - depends on USB = n - -config WIMAX_I2400M_USB - tristate "Intel Wireless WiMAX Connection 2400 over USB (including 5x50)" - depends on WIMAX && USB - select WIMAX_I2400M - help - Select if you have a device based on the Intel WiMAX - Connection 2400 over USB (like any of the Intel Wireless - WiMAX/WiFi Link 5x50 series). - - If unsure, it is safe to select M (module). - -config WIMAX_I2400M_DEBUG_LEVEL - int "WiMAX i2400m debug level" - depends on WIMAX_I2400M - default 8 - help - - Select the maximum debug verbosity level to be compiled into - the WiMAX i2400m driver code. - - By default, this is disabled at runtime and can be - selectively enabled at runtime for different parts of the - code using the sysfs debug-levels file. - - If set at zero, this will compile out all the debug code. - - It is recommended that it is left at 8. diff --git a/drivers/net/wimax/i2400m/Makefile b/drivers/net/wimax/i2400m/Makefile deleted file mode 100644 index b1db1eff0648..000000000000 --- a/drivers/net/wimax/i2400m/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -obj-$(CONFIG_WIMAX_I2400M) += i2400m.o -obj-$(CONFIG_WIMAX_I2400M_USB) += i2400m-usb.o - -i2400m-y := \ - control.o \ - driver.o \ - fw.o \ - op-rfkill.o \ - sysfs.o \ - netdev.o \ - tx.o \ - rx.o - -i2400m-$(CONFIG_DEBUG_FS) += debugfs.o - -i2400m-usb-y := \ - usb-fw.o \ - usb-notif.o \ - usb-tx.o \ - usb-rx.o \ - usb.o diff --git a/drivers/net/wimax/i2400m/control.c b/drivers/net/wimax/i2400m/control.c deleted file mode 100644 index 8df98757d901..000000000000 --- a/drivers/net/wimax/i2400m/control.c +++ /dev/null @@ -1,1434 +0,0 @@ -/* - * Intel Wireless WiMAX Connection 2400m - * Miscellaneous control functions for managing the device - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * Intel Corporation - * Inaky Perez-Gonzalez - * - Initial implementation - * - * This is a collection of functions used to control the device (plus - * a few helpers). 
- *
- * There are utilities for handling TLV buffers, hooks on the device's
- * reports to act on device changes of state [i2400m_report_hook()],
- * on acks to commands [i2400m_msg_ack_hook()], a helper for sending
- * commands to the device and blocking until a reply arrives
- * [i2400m_msg_to_dev()], a few high level commands for manipulating
- * the device state, power saving mode and configuration plus the
- * routines to setup the device once communication is established with
- * it [i2400m_dev_initialize()].
- *
- * ROADMAP
- *
- * i2400m_dev_initialize()       Called by i2400m_dev_start()
- *   i2400m_set_init_config()
- *   i2400m_cmd_get_state()
- * i2400m_dev_shutdown()         Called by i2400m_dev_stop()
- *   i2400m_reset()
- *
- * i2400m_{cmd,get,set}_*()
- *   i2400m_msg_to_dev()
- *   i2400m_msg_check_status()
- *
- * i2400m_report_hook()          Called on reception of an event
- *   i2400m_report_state_hook()
- *     i2400m_tlv_buffer_walk()
- *       i2400m_tlv_match()
- *     i2400m_report_tlv_system_state()
- *     i2400m_report_tlv_rf_switches_status()
- *     i2400m_report_tlv_media_status()
- *   i2400m_cmd_enter_powersave()
- *
- * i2400m_msg_ack_hook()         Called on reception of a reply to a
- *                               command, get or set
- */
-
-#include
-#include "i2400m.h"
-#include
-#include
-#include
-#include
-#include
-
-
-#define D_SUBMODULE control
-#include "debug-levels.h"
-
-static int i2400m_idle_mode_disabled;	/* 0 (idle mode enabled) by default */
-module_param_named(idle_mode_disabled, i2400m_idle_mode_disabled, int, 0644);
-MODULE_PARM_DESC(idle_mode_disabled,
-		 "If true, the device will not enable idle mode negotiation "
-		 "with the base station (when connected) to save power.");
-
-/* 0 (power saving enabled) by default */
-static int i2400m_power_save_disabled;
-module_param_named(power_save_disabled, i2400m_power_save_disabled, int, 0644);
-MODULE_PARM_DESC(power_save_disabled,
-		 "If true, the driver will not tell the device to enter "
-		 "power saving mode when it reports it is ready for it. "
-		 "False by default (so the device is told to do power "
-		 "saving).");
-
-static int i2400m_passive_mode;	/* 0 (passive mode disabled) by default */
-module_param_named(passive_mode, i2400m_passive_mode, int, 0644);
-MODULE_PARM_DESC(passive_mode,
-		 "If true, the driver will not do any device setup "
-		 "and leave it up to user space, which must be properly "
-		 "set up.");
-
-
-/*
- * Check if a TLV is of a given type and size
- *
- * @tlv: pointer to the TLV
- * @tlv_type: type of the TLV we are looking for
- * @tlv_size: expected size of the TLV we are looking for (if -1,
- *            don't check the size). This includes the header
- * Returns: 0 if the TLV matches
- *          < 0 if it doesn't match at all
- *          > 0 total TLV + payload size, if the type matches, but not
- *              the size
- */
-static
-ssize_t i2400m_tlv_match(const struct i2400m_tlv_hdr *tlv,
-			 enum i2400m_tlv tlv_type, ssize_t tlv_size)
-{
-	if (le16_to_cpu(tlv->type) != tlv_type)	/* Not our type?
skip */ - return -1; - if (tlv_size != -1 - && le16_to_cpu(tlv->length) + sizeof(*tlv) != tlv_size) { - size_t size = le16_to_cpu(tlv->length) + sizeof(*tlv); - printk(KERN_WARNING "W: tlv type 0x%x mismatched because of " - "size (got %zu vs %zd expected)\n", - tlv_type, size, tlv_size); - return size; - } - return 0; -} - - -/* - * Given a buffer of TLVs, iterate over them - * - * @i2400m: device instance - * @tlv_buf: pointer to the beginning of the TLV buffer - * @buf_size: buffer size in bytes - * @tlv_pos: seek position; this is assumed to be a pointer returned - * by i2400m_tlv_buffer_walk() [and thus, validated]. The - * TLV returned will be the one following this one. - * - * Usage: - * - * tlv_itr = NULL; - * while (tlv_itr = i2400m_tlv_buffer_walk(i2400m, buf, size, tlv_itr)) { - * ... - * // Do stuff with tlv_itr, DON'T MODIFY IT - * ... - * } - */ -static -const struct i2400m_tlv_hdr *i2400m_tlv_buffer_walk( - struct i2400m *i2400m, - const void *tlv_buf, size_t buf_size, - const struct i2400m_tlv_hdr *tlv_pos) -{ - struct device *dev = i2400m_dev(i2400m); - const struct i2400m_tlv_hdr *tlv_top = tlv_buf + buf_size; - size_t offset, length, avail_size; - unsigned type; - - if (tlv_pos == NULL) /* Take the first one? */ - tlv_pos = tlv_buf; - else /* Nope, the next one */ - tlv_pos = (void *) tlv_pos - + le16_to_cpu(tlv_pos->length) + sizeof(*tlv_pos); - if (tlv_pos == tlv_top) { /* buffer done */ - tlv_pos = NULL; - goto error_beyond_end; - } - if (tlv_pos > tlv_top) { - tlv_pos = NULL; - WARN_ON(1); - goto error_beyond_end; - } - offset = (void *) tlv_pos - (void *) tlv_buf; - avail_size = buf_size - offset; - if (avail_size < sizeof(*tlv_pos)) { - dev_err(dev, "HW BUG? tlv_buf %p [%zu bytes], tlv @%zu: " - "short header\n", tlv_buf, buf_size, offset); - goto error_short_header; - } - type = le16_to_cpu(tlv_pos->type); - length = le16_to_cpu(tlv_pos->length); - if (avail_size < sizeof(*tlv_pos) + length) { - dev_err(dev, "HW BUG? tlv_buf %p [%zu bytes], " - "tlv type 0x%04x @%zu: " - "short data (%zu bytes vs %zu needed)\n", - tlv_buf, buf_size, type, offset, avail_size, - sizeof(*tlv_pos) + length); - goto error_short_header; - } -error_short_header: -error_beyond_end: - return tlv_pos; -} - - -/* - * Find a TLV in a buffer of sequential TLVs - * - * @i2400m: device descriptor - * @tlv_hdr: pointer to the first TLV in the sequence - * @size: size of the buffer in bytes; all TLVs are assumed to fit - * fully in the buffer (otherwise we'll complain). - * @tlv_type: type of the TLV we are looking for - * @tlv_size: expected size of the TLV we are looking for (if -1, - * don't check the size). This includes the header - * - * Returns: NULL if the TLV is not found, otherwise a pointer to - * it. If the sizes don't match, an error is printed and NULL - * returned. 
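- *
- * Usage sketch (hypothetical caller; it mirrors the real callers
- * further down in this file, e.g. i2400m_firmware_check()):
- *
- *	const struct i2400m_tlv_hdr *tlv;
- *
- *	tlv = i2400m_tlv_find(i2400m, ack->pl, ack_len - sizeof(*ack),
- *	                      I2400M_TLV_L4_MESSAGE_VERSIONS, sizeof(*l4mv));
- *	if (tlv == NULL)
- *		goto error_no_tlv;
- *	l4mv = container_of(tlv, typeof(*l4mv), hdr);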
- */ -static -const struct i2400m_tlv_hdr *i2400m_tlv_find( - struct i2400m *i2400m, - const struct i2400m_tlv_hdr *tlv_hdr, size_t size, - enum i2400m_tlv tlv_type, ssize_t tlv_size) -{ - ssize_t match; - struct device *dev = i2400m_dev(i2400m); - const struct i2400m_tlv_hdr *tlv = NULL; - while ((tlv = i2400m_tlv_buffer_walk(i2400m, tlv_hdr, size, tlv))) { - match = i2400m_tlv_match(tlv, tlv_type, tlv_size); - if (match == 0) /* found it :) */ - break; - if (match > 0) - dev_warn(dev, "TLV type 0x%04x found with size " - "mismatch (%zu vs %zd needed)\n", - tlv_type, match, tlv_size); - } - return tlv; -} - - -static const struct -{ - char *msg; - int errno; -} ms_to_errno[I2400M_MS_MAX] = { - [I2400M_MS_DONE_OK] = { "", 0 }, - [I2400M_MS_DONE_IN_PROGRESS] = { "", 0 }, - [I2400M_MS_INVALID_OP] = { "invalid opcode", -ENOSYS }, - [I2400M_MS_BAD_STATE] = { "invalid state", -EILSEQ }, - [I2400M_MS_ILLEGAL_VALUE] = { "illegal value", -EINVAL }, - [I2400M_MS_MISSING_PARAMS] = { "missing parameters", -ENOMSG }, - [I2400M_MS_VERSION_ERROR] = { "bad version", -EIO }, - [I2400M_MS_ACCESSIBILITY_ERROR] = { "accesibility error", -EIO }, - [I2400M_MS_BUSY] = { "busy", -EBUSY }, - [I2400M_MS_CORRUPTED_TLV] = { "corrupted TLV", -EILSEQ }, - [I2400M_MS_UNINITIALIZED] = { "uninitialized", -EILSEQ }, - [I2400M_MS_UNKNOWN_ERROR] = { "unknown error", -EIO }, - [I2400M_MS_PRODUCTION_ERROR] = { "production error", -EIO }, - [I2400M_MS_NO_RF] = { "no RF", -EIO }, - [I2400M_MS_NOT_READY_FOR_POWERSAVE] = - { "not ready for powersave", -EACCES }, - [I2400M_MS_THERMAL_CRITICAL] = { "thermal critical", -EL3HLT }, -}; - - -/* - * i2400m_msg_check_status - translate a message's status code - * - * @i2400m: device descriptor - * @l3l4_hdr: message header - * @strbuf: buffer to place a formatted error message (unless NULL). - * @strbuf_size: max amount of available space; larger messages will - * be truncated. - * - * Returns: errno code corresponding to the status code in @l3l4_hdr - * and a message in @strbuf describing the error. 
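- *
- * Typical use on a freshly received ack (sketch; this is exactly the
- * pattern the callers below follow):
- *
- *	char strerr[32];
- *
- *	result = i2400m_msg_check_status(wimax_msg_data(ack_skb),
- *	                                 strerr, sizeof(strerr));
- *	if (result < 0)
- *		dev_err(dev, "command failed: %d - %s\n", result, strerr);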
- */
-int i2400m_msg_check_status(const struct i2400m_l3l4_hdr *l3l4_hdr,
-			    char *strbuf, size_t strbuf_size)
-{
-	int result;
-	enum i2400m_ms status = le16_to_cpu(l3l4_hdr->status);
-	const char *str;
-
-	if (status == 0)
-		return 0;
-	if (status >= ARRAY_SIZE(ms_to_errno)) {
-		str = "unknown status code";
-		result = -EBADR;
-	} else {
-		str = ms_to_errno[status].msg;
-		result = ms_to_errno[status].errno;
-	}
-	if (strbuf)
-		snprintf(strbuf, strbuf_size, "%s (%d)", str, status);
-	return result;
-}
-
-
-/*
- * Act on a TLV System State reported by the device
- *
- * @i2400m: device descriptor
- * @ss: validated System State TLV
- */
-static
-void i2400m_report_tlv_system_state(struct i2400m *i2400m,
-				    const struct i2400m_tlv_system_state *ss)
-{
-	struct device *dev = i2400m_dev(i2400m);
-	struct wimax_dev *wimax_dev = &i2400m->wimax_dev;
-	enum i2400m_system_state i2400m_state = le32_to_cpu(ss->state);
-
-	d_fnstart(3, dev, "(i2400m %p ss %p [%u])\n", i2400m, ss, i2400m_state);
-
-	if (i2400m->state != i2400m_state) {
-		i2400m->state = i2400m_state;
-		wake_up_all(&i2400m->state_wq);
-	}
-	switch (i2400m_state) {
-	case I2400M_SS_UNINITIALIZED:
-	case I2400M_SS_INIT:
-	case I2400M_SS_CONFIG:
-	case I2400M_SS_PRODUCTION:
-		wimax_state_change(wimax_dev, WIMAX_ST_UNINITIALIZED);
-		break;
-
-	case I2400M_SS_RF_OFF:
-	case I2400M_SS_RF_SHUTDOWN:
-		wimax_state_change(wimax_dev, WIMAX_ST_RADIO_OFF);
-		break;
-
-	case I2400M_SS_READY:
-	case I2400M_SS_STANDBY:
-	case I2400M_SS_SLEEPACTIVE:
-		wimax_state_change(wimax_dev, WIMAX_ST_READY);
-		break;
-
-	case I2400M_SS_CONNECTING:
-	case I2400M_SS_WIMAX_CONNECTED:
-		wimax_state_change(wimax_dev, WIMAX_ST_READY);
-		break;
-
-	case I2400M_SS_SCAN:
-	case I2400M_SS_OUT_OF_ZONE:
-		wimax_state_change(wimax_dev, WIMAX_ST_SCANNING);
-		break;
-
-	case I2400M_SS_IDLE:
-		d_printf(1, dev, "entering BS-negotiated idle mode\n");
-		fallthrough;
-	case I2400M_SS_DISCONNECTING:
-	case I2400M_SS_DATA_PATH_CONNECTED:
-		wimax_state_change(wimax_dev, WIMAX_ST_CONNECTED);
-		break;
-
-	default:
-		/* Huh? just in case, shut it down */
-		dev_err(dev, "HW BUG? unknown state %u: shutting down\n",
-			i2400m_state);
-		i2400m_reset(i2400m, I2400M_RT_WARM);
-		break;
-	}
-	d_fnend(3, dev, "(i2400m %p ss %p [%u]) = void\n",
-		i2400m, ss, i2400m_state);
-}
-
-
-/*
- * Parse and act on a TLV Media Status sent by the device
- *
- * @i2400m: device descriptor
- * @ms: validated Media Status TLV
- *
- * This will set the carrier up or down based on the device's link
- * report. This is done aside from what the WiMAX stack does based on
- * the device's state as sometimes we need to do a link-renew (the BS
- * wants us to renew a DHCP lease, for example).
- *
- * In fact, doc says that every time we get a link-up, we should do a
- * DHCP negotiation...
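- *
- * In short, the switch below maps:
- *
- *	I2400M_MEDIA_STATUS_LINK_UP    -> netif_carrier_on()
- *	I2400M_MEDIA_STATUS_LINK_DOWN  -> netif_carrier_off()
- *	I2400M_MEDIA_STATUS_LINK_RENEW -> netif_carrier_on() (user space
- *	                                  renegotiates the DHCP lease)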
- */ -static -void i2400m_report_tlv_media_status(struct i2400m *i2400m, - const struct i2400m_tlv_media_status *ms) -{ - struct device *dev = i2400m_dev(i2400m); - struct wimax_dev *wimax_dev = &i2400m->wimax_dev; - struct net_device *net_dev = wimax_dev->net_dev; - enum i2400m_media_status status = le32_to_cpu(ms->media_status); - - d_fnstart(3, dev, "(i2400m %p ms %p [%u])\n", i2400m, ms, status); - - switch (status) { - case I2400M_MEDIA_STATUS_LINK_UP: - netif_carrier_on(net_dev); - break; - case I2400M_MEDIA_STATUS_LINK_DOWN: - netif_carrier_off(net_dev); - break; - /* - * This is the network telling us we need to retrain the DHCP - * lease -- so far, we are trusting the WiMAX Network Service - * in user space to pick this up and poke the DHCP client. - */ - case I2400M_MEDIA_STATUS_LINK_RENEW: - netif_carrier_on(net_dev); - break; - default: - dev_err(dev, "HW BUG? unknown media status %u\n", - status); - } - d_fnend(3, dev, "(i2400m %p ms %p [%u]) = void\n", - i2400m, ms, status); -} - - -/* - * Process a TLV from a 'state report' - * - * @i2400m: device descriptor - * @tlv: pointer to the TLV header; it has been already validated for - * consistent size. - * @tag: for error messages - * - * Act on the TLVs from a 'state report'. - */ -static -void i2400m_report_state_parse_tlv(struct i2400m *i2400m, - const struct i2400m_tlv_hdr *tlv, - const char *tag) -{ - struct device *dev = i2400m_dev(i2400m); - const struct i2400m_tlv_media_status *ms; - const struct i2400m_tlv_system_state *ss; - const struct i2400m_tlv_rf_switches_status *rfss; - - if (0 == i2400m_tlv_match(tlv, I2400M_TLV_SYSTEM_STATE, sizeof(*ss))) { - ss = container_of(tlv, typeof(*ss), hdr); - d_printf(2, dev, "%s: system state TLV " - "found (0x%04x), state 0x%08x\n", - tag, I2400M_TLV_SYSTEM_STATE, - le32_to_cpu(ss->state)); - i2400m_report_tlv_system_state(i2400m, ss); - } - if (0 == i2400m_tlv_match(tlv, I2400M_TLV_RF_STATUS, sizeof(*rfss))) { - rfss = container_of(tlv, typeof(*rfss), hdr); - d_printf(2, dev, "%s: RF status TLV " - "found (0x%04x), sw 0x%02x hw 0x%02x\n", - tag, I2400M_TLV_RF_STATUS, - le32_to_cpu(rfss->sw_rf_switch), - le32_to_cpu(rfss->hw_rf_switch)); - i2400m_report_tlv_rf_switches_status(i2400m, rfss); - } - if (0 == i2400m_tlv_match(tlv, I2400M_TLV_MEDIA_STATUS, sizeof(*ms))) { - ms = container_of(tlv, typeof(*ms), hdr); - d_printf(2, dev, "%s: Media Status TLV: %u\n", - tag, le32_to_cpu(ms->media_status)); - i2400m_report_tlv_media_status(i2400m, ms); - } -} - - -/* - * Parse a 'state report' and extract information - * - * @i2400m: device descriptor - * @l3l4_hdr: pointer to message; it has been already validated for - * consistent size. - * @size: size of the message (header + payload). The header length - * declaration is assumed to be congruent with @size (as in - * sizeof(*l3l4_hdr) + l3l4_hdr->length == size) - * - * Walk over the TLVs in a report state and act on them. 
- */ -static -void i2400m_report_state_hook(struct i2400m *i2400m, - const struct i2400m_l3l4_hdr *l3l4_hdr, - size_t size, const char *tag) -{ - struct device *dev = i2400m_dev(i2400m); - const struct i2400m_tlv_hdr *tlv; - size_t tlv_size = le16_to_cpu(l3l4_hdr->length); - - d_fnstart(4, dev, "(i2400m %p, l3l4_hdr %p, size %zu, %s)\n", - i2400m, l3l4_hdr, size, tag); - tlv = NULL; - - while ((tlv = i2400m_tlv_buffer_walk(i2400m, &l3l4_hdr->pl, - tlv_size, tlv))) - i2400m_report_state_parse_tlv(i2400m, tlv, tag); - d_fnend(4, dev, "(i2400m %p, l3l4_hdr %p, size %zu, %s) = void\n", - i2400m, l3l4_hdr, size, tag); -} - - -/* - * i2400m_report_hook - (maybe) act on a report - * - * @i2400m: device descriptor - * @l3l4_hdr: pointer to message; it has been already validated for - * consistent size. - * @size: size of the message (header + payload). The header length - * declaration is assumed to be congruent with @size (as in - * sizeof(*l3l4_hdr) + l3l4_hdr->length == size) - * - * Extract information we might need (like carrien on/off) from a - * device report. - */ -void i2400m_report_hook(struct i2400m *i2400m, - const struct i2400m_l3l4_hdr *l3l4_hdr, size_t size) -{ - struct device *dev = i2400m_dev(i2400m); - unsigned msg_type; - - d_fnstart(3, dev, "(i2400m %p l3l4_hdr %p size %zu)\n", - i2400m, l3l4_hdr, size); - /* Chew on the message, we might need some information from - * here */ - msg_type = le16_to_cpu(l3l4_hdr->type); - switch (msg_type) { - case I2400M_MT_REPORT_STATE: /* carrier detection... */ - i2400m_report_state_hook(i2400m, - l3l4_hdr, size, "REPORT STATE"); - break; - /* If the device is ready for power save, then ask it to do - * it. */ - case I2400M_MT_REPORT_POWERSAVE_READY: /* zzzzz */ - if (l3l4_hdr->status == cpu_to_le16(I2400M_MS_DONE_OK)) { - if (i2400m_power_save_disabled) - d_printf(1, dev, "ready for powersave, " - "not requesting (disabled by module " - "parameter)\n"); - else { - d_printf(1, dev, "ready for powersave, " - "requesting\n"); - i2400m_cmd_enter_powersave(i2400m); - } - } - break; - } - d_fnend(3, dev, "(i2400m %p l3l4_hdr %p size %zu) = void\n", - i2400m, l3l4_hdr, size); -} - - -/* - * i2400m_msg_ack_hook - process cmd/set/get ack for internal status - * - * @i2400m: device descriptor - * @l3l4_hdr: pointer to message; it has been already validated for - * consistent size. - * @size: size of the message - * - * Extract information we might need from acks to commands and act on - * it. This is akin to i2400m_report_hook(). Note most of this - * processing should be done in the function that calls the - * command. This is here for some cases where it can't happen... - */ -static void i2400m_msg_ack_hook(struct i2400m *i2400m, - const struct i2400m_l3l4_hdr *l3l4_hdr, - size_t size) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - unsigned int ack_type; - char strerr[32]; - - /* Chew on the message, we might need some information from - * here */ - ack_type = le16_to_cpu(l3l4_hdr->type); - switch (ack_type) { - case I2400M_MT_CMD_ENTER_POWERSAVE: - /* This is just left here for the sake of example, as - * the processing is done somewhere else. */ - if (0) { - result = i2400m_msg_check_status( - l3l4_hdr, strerr, sizeof(strerr)); - if (result >= 0) - d_printf(1, dev, "ready for power save: %zd\n", - size); - } - break; - } -} - - -/* - * i2400m_msg_size_check() - verify message size and header are congruent - * - * It is ok if the total message size is larger than the expected - * size, as there can be padding. 
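- *
- * In other words, the check performed below is (sketch):
- *
- *	msg_size >= sizeof(*l3l4_hdr) + le16_to_cpu(l3l4_hdr->length)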
- */ -int i2400m_msg_size_check(struct i2400m *i2400m, - const struct i2400m_l3l4_hdr *l3l4_hdr, - size_t msg_size) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - size_t expected_size; - d_fnstart(4, dev, "(i2400m %p l3l4_hdr %p msg_size %zu)\n", - i2400m, l3l4_hdr, msg_size); - if (msg_size < sizeof(*l3l4_hdr)) { - dev_err(dev, "bad size for message header " - "(expected at least %zu, got %zu)\n", - (size_t) sizeof(*l3l4_hdr), msg_size); - result = -EIO; - goto error_hdr_size; - } - expected_size = le16_to_cpu(l3l4_hdr->length) + sizeof(*l3l4_hdr); - if (msg_size < expected_size) { - dev_err(dev, "bad size for message code 0x%04x (expected %zu, " - "got %zu)\n", le16_to_cpu(l3l4_hdr->type), - expected_size, msg_size); - result = -EIO; - } else - result = 0; -error_hdr_size: - d_fnend(4, dev, - "(i2400m %p l3l4_hdr %p msg_size %zu) = %d\n", - i2400m, l3l4_hdr, msg_size, result); - return result; -} - - - -/* - * Cancel a wait for a command ACK - * - * @i2400m: device descriptor - * @code: [negative] errno code to cancel with (don't use - * -EINPROGRESS) - * - * If there is an ack already filled out, free it. - */ -void i2400m_msg_to_dev_cancel_wait(struct i2400m *i2400m, int code) -{ - struct sk_buff *ack_skb; - unsigned long flags; - - spin_lock_irqsave(&i2400m->rx_lock, flags); - ack_skb = i2400m->ack_skb; - if (ack_skb && !IS_ERR(ack_skb)) - kfree_skb(ack_skb); - i2400m->ack_skb = ERR_PTR(code); - spin_unlock_irqrestore(&i2400m->rx_lock, flags); -} - - -/** - * i2400m_msg_to_dev - Send a control message to the device and get a response - * - * @i2400m: device descriptor - * - * @buf: pointer to the buffer containing the message to be sent; it - * has to start with a &struct i2400M_l3l4_hdr and then - * followed by the payload. Once this function returns, the - * buffer can be reused. - * - * @buf_len: buffer size - * - * Returns: - * - * Pointer to skb containing the ack message. You need to check the - * pointer with IS_ERR(), as it might be an error code. Error codes - * could happen because: - * - * - the message wasn't formatted correctly - * - couldn't send the message - * - failed waiting for a response - * - the ack message wasn't formatted correctly - * - * The returned skb has been allocated with wimax_msg_to_user_alloc(), - * it contains the response in a netlink attribute and is ready to be - * passed up to user space with wimax_msg_to_user_send(). To access - * the payload and its length, use wimax_msg_{data,len}() on the skb. - * - * The skb has to be freed with kfree_skb() once done. - * - * Description: - * - * This function delivers a message/command to the device and waits - * for an ack to be received. The format is described in - * linux/wimax/i2400m.h. In summary, a command/get/set is followed by an - * ack. - * - * This function will not check the ack status, that's left up to the - * caller. Once done with the ack skb, it has to be kfree_skb()ed. - * - * The i2400m handles only one message at the same time, thus we need - * the mutex to exclude other players. - * - * We write the message and then wait for an answer to come back. The - * RX path intercepts control messages and handles them in - * i2400m_rx_ctl(). Reports (notifications) are (maybe) processed - * locally and then forwarded (as needed) to user space on the WiMAX - * stack message pipe. Acks are saved and passed back to us through an - * skb in i2400m->ack_skb which is ready to be given to generic - * netlink if need be. 
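- *
- * Usage sketch (hypothetical caller; mirrors the command helpers
- * below):
- *
- *	ack_skb = i2400m_msg_to_dev(i2400m, cmd, cmd_len);
- *	if (IS_ERR(ack_skb))
- *		return PTR_ERR(ack_skb);
- *	ack = wimax_msg_data_len(ack_skb, &ack_len);
- *	result = i2400m_msg_check_status(ack, strerr, sizeof(strerr));
- *	...
- *	kfree_skb(ack_skb);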
- */ -struct sk_buff *i2400m_msg_to_dev(struct i2400m *i2400m, - const void *buf, size_t buf_len) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - const struct i2400m_l3l4_hdr *msg_l3l4_hdr; - struct sk_buff *ack_skb; - const struct i2400m_l3l4_hdr *ack_l3l4_hdr; - size_t ack_len; - int ack_timeout; - unsigned msg_type; - unsigned long flags; - - d_fnstart(3, dev, "(i2400m %p buf %p len %zu)\n", - i2400m, buf, buf_len); - - rmb(); /* Make sure we see what i2400m_dev_reset_handle() */ - if (i2400m->boot_mode) - return ERR_PTR(-EL3RST); - - msg_l3l4_hdr = buf; - /* Check msg & payload consistency */ - result = i2400m_msg_size_check(i2400m, msg_l3l4_hdr, buf_len); - if (result < 0) - goto error_bad_msg; - msg_type = le16_to_cpu(msg_l3l4_hdr->type); - d_printf(1, dev, "CMD/GET/SET 0x%04x %zu bytes\n", - msg_type, buf_len); - d_dump(2, dev, buf, buf_len); - - /* Setup the completion, ack_skb ("we are waiting") and send - * the message to the device */ - mutex_lock(&i2400m->msg_mutex); - spin_lock_irqsave(&i2400m->rx_lock, flags); - i2400m->ack_skb = ERR_PTR(-EINPROGRESS); - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - init_completion(&i2400m->msg_completion); - result = i2400m_tx(i2400m, buf, buf_len, I2400M_PT_CTRL); - if (result < 0) { - dev_err(dev, "can't send message 0x%04x: %d\n", - le16_to_cpu(msg_l3l4_hdr->type), result); - goto error_tx; - } - - /* Some commands take longer to execute because of crypto ops, - * so we give them some more leeway on timeout */ - switch (msg_type) { - case I2400M_MT_GET_TLS_OPERATION_RESULT: - case I2400M_MT_CMD_SEND_EAP_RESPONSE: - ack_timeout = 5 * HZ; - break; - default: - ack_timeout = HZ; - } - - if (unlikely(i2400m->trace_msg_from_user)) - wimax_msg(&i2400m->wimax_dev, "echo", buf, buf_len, GFP_KERNEL); - /* The RX path in rx.c will put any response for this message - * in i2400m->ack_skb and wake us up. If we cancel the wait, - * we need to change the value of i2400m->ack_skb to something - * not -EINPROGRESS so RX knows there is no one waiting. */ - result = wait_for_completion_interruptible_timeout( - &i2400m->msg_completion, ack_timeout); - if (result == 0) { - dev_err(dev, "timeout waiting for reply to message 0x%04x\n", - msg_type); - result = -ETIMEDOUT; - i2400m_msg_to_dev_cancel_wait(i2400m, result); - goto error_wait_for_completion; - } else if (result < 0) { - dev_err(dev, "error waiting for reply to message 0x%04x: %d\n", - msg_type, result); - i2400m_msg_to_dev_cancel_wait(i2400m, result); - goto error_wait_for_completion; - } - - /* Pull out the ack data from i2400m->ack_skb -- see if it is - * an error and act accordingly */ - spin_lock_irqsave(&i2400m->rx_lock, flags); - ack_skb = i2400m->ack_skb; - if (IS_ERR(ack_skb)) - result = PTR_ERR(ack_skb); - else - result = 0; - i2400m->ack_skb = NULL; - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - if (result < 0) - goto error_ack_status; - ack_l3l4_hdr = wimax_msg_data_len(ack_skb, &ack_len); - - /* Check the ack and deliver it if it is ok */ - if (unlikely(i2400m->trace_msg_from_user)) - wimax_msg(&i2400m->wimax_dev, "echo", - ack_l3l4_hdr, ack_len, GFP_KERNEL); - result = i2400m_msg_size_check(i2400m, ack_l3l4_hdr, ack_len); - if (result < 0) { - dev_err(dev, "HW BUG? reply to message 0x%04x: %d\n", - msg_type, result); - goto error_bad_ack_len; - } - if (msg_type != le16_to_cpu(ack_l3l4_hdr->type)) { - dev_err(dev, "HW BUG? 
bad reply 0x%04x to message 0x%04x\n",
-			le16_to_cpu(ack_l3l4_hdr->type), msg_type);
-		result = -EIO;
-		goto error_bad_ack_type;
-	}
-	i2400m_msg_ack_hook(i2400m, ack_l3l4_hdr, ack_len);
-	mutex_unlock(&i2400m->msg_mutex);
-	d_fnend(3, dev, "(i2400m %p buf %p len %zu) = %p\n",
-		i2400m, buf, buf_len, ack_skb);
-	return ack_skb;
-
-error_bad_ack_type:
-error_bad_ack_len:
-	kfree_skb(ack_skb);
-error_ack_status:
-error_wait_for_completion:
-error_tx:
-	mutex_unlock(&i2400m->msg_mutex);
-error_bad_msg:
-	d_fnend(3, dev, "(i2400m %p buf %p len %zu) = %d\n",
-		i2400m, buf, buf_len, result);
-	return ERR_PTR(result);
-}
-
-
-/*
- * Definitions for the Enter Power Save command
- *
- * The Enter Power Save command requests the device to go into power
- * saving mode. The device will ack or nak the command depending on it
- * being ready for it. If it acks, the bus-specific code is then free
- * to put the link into a low power state.
- *
- * As well, the device might request to go into power saving mode by
- * sending a report (REPORT_POWERSAVE_READY), in which case, we issue
- * this command. The hookups in the RX code allow this to happen.
- */
-enum {
-	I2400M_WAKEUP_ENABLED  = 0x01,
-	I2400M_WAKEUP_DISABLED = 0x02,
-	I2400M_TLV_TYPE_WAKEUP_MODE = 144,
-};
-
-struct i2400m_cmd_enter_power_save {
-	struct i2400m_l3l4_hdr hdr;
-	struct i2400m_tlv_hdr tlv;
-	__le32 val;
-} __packed;
-
-
-/*
- * Request entering power save
- *
- * This command is (mainly) executed when the device indicates that it
- * is ready to go into powersave mode via a REPORT_POWERSAVE_READY.
- */
-int i2400m_cmd_enter_powersave(struct i2400m *i2400m)
-{
-	int result;
-	struct device *dev = i2400m_dev(i2400m);
-	struct sk_buff *ack_skb;
-	struct i2400m_cmd_enter_power_save *cmd;
-	char strerr[32];
-
-	result = -ENOMEM;
-	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
-	if (cmd == NULL)
-		goto error_alloc;
-	cmd->hdr.type = cpu_to_le16(I2400M_MT_CMD_ENTER_POWERSAVE);
-	cmd->hdr.length = cpu_to_le16(sizeof(*cmd) - sizeof(cmd->hdr));
-	cmd->hdr.version = cpu_to_le16(I2400M_L3L4_VERSION);
-	cmd->tlv.type = cpu_to_le16(I2400M_TLV_TYPE_WAKEUP_MODE);
-	cmd->tlv.length = cpu_to_le16(sizeof(cmd->val));
-	cmd->val = cpu_to_le32(I2400M_WAKEUP_ENABLED);
-
-	ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd));
-	result = PTR_ERR(ack_skb);
-	if (IS_ERR(ack_skb)) {
-		dev_err(dev, "Failed to issue 'Enter power save' command: %d\n",
-			result);
-		goto error_msg_to_dev;
-	}
-	result = i2400m_msg_check_status(wimax_msg_data(ack_skb),
-					 strerr, sizeof(strerr));
-	if (result == -EACCES)
-		d_printf(1, dev, "Cannot enter power save mode\n");
-	else if (result < 0)
-		dev_err(dev, "'Enter power save' (0x%04x) command failed: "
-			"%d - %s\n", I2400M_MT_CMD_ENTER_POWERSAVE,
-			result, strerr);
-	else
-		d_printf(1, dev, "device ready to power save\n");
-	kfree_skb(ack_skb);
-error_msg_to_dev:
-	kfree(cmd);
-error_alloc:
-	return result;
-}
-EXPORT_SYMBOL_GPL(i2400m_cmd_enter_powersave);
-
-
-/*
- * Definitions for getting device information
- */
-enum {
-	I2400M_TLV_DETAILED_DEVICE_INFO = 140
-};
-
-/**
- * i2400m_get_device_info - Query the device for detailed device information
- *
- * @i2400m: device descriptor
- *
- * Returns: an skb whose skb->data points to a 'struct
- *     i2400m_tlv_detailed_device_info'. When done, kfree_skb() it. The
- *     skb is *guaranteed* to contain the whole TLV data structure.
- *
- * On error, IS_ERR(skb) is true and PTR_ERR(skb) is the error
- * code.
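- *
- * Usage sketch (hypothetical caller; the mac_address access assumes
- * the layout of struct i2400m_tlv_detailed_device_info):
- *
- *	skb = i2400m_get_device_info(i2400m);
- *	if (IS_ERR(skb))
- *		return PTR_ERR(skb);
- *	ddi = (void *) skb->data;
- *	... use ddi->mac_address ...
- *	kfree_skb(skb);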
- */ -struct sk_buff *i2400m_get_device_info(struct i2400m *i2400m) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *ack_skb; - struct i2400m_l3l4_hdr *cmd; - const struct i2400m_l3l4_hdr *ack; - size_t ack_len; - const struct i2400m_tlv_hdr *tlv; - const struct i2400m_tlv_detailed_device_info *ddi; - char strerr[32]; - - ack_skb = ERR_PTR(-ENOMEM); - cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); - if (cmd == NULL) - goto error_alloc; - cmd->type = cpu_to_le16(I2400M_MT_GET_DEVICE_INFO); - cmd->length = 0; - cmd->version = cpu_to_le16(I2400M_L3L4_VERSION); - - ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd)); - if (IS_ERR(ack_skb)) { - dev_err(dev, "Failed to issue 'get device info' command: %ld\n", - PTR_ERR(ack_skb)); - goto error_msg_to_dev; - } - ack = wimax_msg_data_len(ack_skb, &ack_len); - result = i2400m_msg_check_status(ack, strerr, sizeof(strerr)); - if (result < 0) { - dev_err(dev, "'get device info' (0x%04x) command failed: " - "%d - %s\n", I2400M_MT_GET_DEVICE_INFO, result, - strerr); - goto error_cmd_failed; - } - tlv = i2400m_tlv_find(i2400m, ack->pl, ack_len - sizeof(*ack), - I2400M_TLV_DETAILED_DEVICE_INFO, sizeof(*ddi)); - if (tlv == NULL) { - dev_err(dev, "GET DEVICE INFO: " - "detailed device info TLV not found (0x%04x)\n", - I2400M_TLV_DETAILED_DEVICE_INFO); - result = -EIO; - goto error_no_tlv; - } - skb_pull(ack_skb, (void *) tlv - (void *) ack_skb->data); -error_msg_to_dev: - kfree(cmd); -error_alloc: - return ack_skb; - -error_no_tlv: -error_cmd_failed: - kfree_skb(ack_skb); - kfree(cmd); - return ERR_PTR(result); -} - - -/* Firmware interface versions we support */ -enum { - I2400M_HDIv_MAJOR = 9, - I2400M_HDIv_MINOR = 1, - I2400M_HDIv_MINOR_2 = 2, -}; - - -/** - * i2400m_firmware_check - check firmware versions are compatible with - * the driver - * - * @i2400m: device descriptor - * - * Returns: 0 if ok, < 0 errno code an error and a message in the - * kernel log. - * - * Long function, but quite simple; first chunk launches the command - * and double checks the reply for the right TLV. Then we process the - * TLV (where the meat is). - * - * Once we process the TLV that gives us the firmware's interface - * version, we encode it and save it in i2400m->fw_version for future - * reference. 
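- *
- * The encoding used below is simply:
- *
- *	i2400m->fw_version = major << 16 | minor;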
- */ -int i2400m_firmware_check(struct i2400m *i2400m) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *ack_skb; - struct i2400m_l3l4_hdr *cmd; - const struct i2400m_l3l4_hdr *ack; - size_t ack_len; - const struct i2400m_tlv_hdr *tlv; - const struct i2400m_tlv_l4_message_versions *l4mv; - char strerr[32]; - unsigned major, minor, branch; - - result = -ENOMEM; - cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); - if (cmd == NULL) - goto error_alloc; - cmd->type = cpu_to_le16(I2400M_MT_GET_LM_VERSION); - cmd->length = 0; - cmd->version = cpu_to_le16(I2400M_L3L4_VERSION); - - ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd)); - if (IS_ERR(ack_skb)) { - result = PTR_ERR(ack_skb); - dev_err(dev, "Failed to issue 'get lm version' command: %-d\n", - result); - goto error_msg_to_dev; - } - ack = wimax_msg_data_len(ack_skb, &ack_len); - result = i2400m_msg_check_status(ack, strerr, sizeof(strerr)); - if (result < 0) { - dev_err(dev, "'get lm version' (0x%04x) command failed: " - "%d - %s\n", I2400M_MT_GET_LM_VERSION, result, - strerr); - goto error_cmd_failed; - } - tlv = i2400m_tlv_find(i2400m, ack->pl, ack_len - sizeof(*ack), - I2400M_TLV_L4_MESSAGE_VERSIONS, sizeof(*l4mv)); - if (tlv == NULL) { - dev_err(dev, "get lm version: TLV not found (0x%04x)\n", - I2400M_TLV_L4_MESSAGE_VERSIONS); - result = -EIO; - goto error_no_tlv; - } - l4mv = container_of(tlv, typeof(*l4mv), hdr); - major = le16_to_cpu(l4mv->major); - minor = le16_to_cpu(l4mv->minor); - branch = le16_to_cpu(l4mv->branch); - result = -EINVAL; - if (major != I2400M_HDIv_MAJOR) { - dev_err(dev, "unsupported major fw version " - "%u.%u.%u\n", major, minor, branch); - goto error_bad_major; - } - result = 0; - if (minor > I2400M_HDIv_MINOR_2 || minor < I2400M_HDIv_MINOR) - dev_warn(dev, "untested minor fw version %u.%u.%u\n", - major, minor, branch); - /* Yes, we ignore the branch -- we don't have to track it */ - i2400m->fw_version = major << 16 | minor; - dev_info(dev, "firmware interface version %u.%u.%u\n", - major, minor, branch); -error_bad_major: -error_no_tlv: -error_cmd_failed: - kfree_skb(ack_skb); -error_msg_to_dev: - kfree(cmd); -error_alloc: - return result; -} - - -/* - * Send an DoExitIdle command to the device to ask it to go out of - * basestation-idle mode. - * - * @i2400m: device descriptor - * - * This starts a renegotiation with the basestation that might involve - * another crypto handshake with user space. - * - * Returns: 0 if ok, < 0 errno code on error. - */ -int i2400m_cmd_exit_idle(struct i2400m *i2400m) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *ack_skb; - struct i2400m_l3l4_hdr *cmd; - char strerr[32]; - - result = -ENOMEM; - cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); - if (cmd == NULL) - goto error_alloc; - cmd->type = cpu_to_le16(I2400M_MT_CMD_EXIT_IDLE); - cmd->length = 0; - cmd->version = cpu_to_le16(I2400M_L3L4_VERSION); - - ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd)); - result = PTR_ERR(ack_skb); - if (IS_ERR(ack_skb)) { - dev_err(dev, "Failed to issue 'exit idle' command: %d\n", - result); - goto error_msg_to_dev; - } - result = i2400m_msg_check_status(wimax_msg_data(ack_skb), - strerr, sizeof(strerr)); - kfree_skb(ack_skb); -error_msg_to_dev: - kfree(cmd); -error_alloc: - return result; - -} - - -/* - * Query the device for its state, update the WiMAX stack's idea of it - * - * @i2400m: device descriptor - * - * Returns: 0 if ok, < 0 errno code on error. - * - * Executes a 'Get State' command and parses the returned - * TLVs. 
- * - * Because this is almost identical to a 'Report State', we use - * i2400m_report_state_hook() to parse the answer. This will set the - * carrier state, as well as the RF Kill switches state. - */ -static int i2400m_cmd_get_state(struct i2400m *i2400m) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *ack_skb; - struct i2400m_l3l4_hdr *cmd; - const struct i2400m_l3l4_hdr *ack; - size_t ack_len; - char strerr[32]; - - result = -ENOMEM; - cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); - if (cmd == NULL) - goto error_alloc; - cmd->type = cpu_to_le16(I2400M_MT_GET_STATE); - cmd->length = 0; - cmd->version = cpu_to_le16(I2400M_L3L4_VERSION); - - ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd)); - if (IS_ERR(ack_skb)) { - dev_err(dev, "Failed to issue 'get state' command: %ld\n", - PTR_ERR(ack_skb)); - result = PTR_ERR(ack_skb); - goto error_msg_to_dev; - } - ack = wimax_msg_data_len(ack_skb, &ack_len); - result = i2400m_msg_check_status(ack, strerr, sizeof(strerr)); - if (result < 0) { - dev_err(dev, "'get state' (0x%04x) command failed: " - "%d - %s\n", I2400M_MT_GET_STATE, result, strerr); - goto error_cmd_failed; - } - i2400m_report_state_hook(i2400m, ack, ack_len - sizeof(*ack), - "GET STATE"); - result = 0; - kfree_skb(ack_skb); -error_cmd_failed: -error_msg_to_dev: - kfree(cmd); -error_alloc: - return result; -} - -/** - * Set basic configuration settings - * - * @i2400m: device descriptor - * @args: array of pointers to the TLV headers to send for - * configuration (each followed by its payload). - * TLV headers and payloads must be properly initialized, with the - * right endianess (LE). - * @arg_size: number of pointers in the @args array - */ -static int i2400m_set_init_config(struct i2400m *i2400m, - const struct i2400m_tlv_hdr **arg, - size_t args) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *ack_skb; - struct i2400m_l3l4_hdr *cmd; - char strerr[32]; - unsigned argc, argsize, tlv_size; - const struct i2400m_tlv_hdr *tlv_hdr; - void *buf, *itr; - - d_fnstart(3, dev, "(i2400m %p arg %p args %zu)\n", i2400m, arg, args); - result = 0; - if (args == 0) - goto none; - /* Compute the size of all the TLVs, so we can alloc a - * contiguous command block to copy them. */ - argsize = 0; - for (argc = 0; argc < args; argc++) { - tlv_hdr = arg[argc]; - argsize += sizeof(*tlv_hdr) + le16_to_cpu(tlv_hdr->length); - } - WARN_ON(argc >= 9); /* As per hw spec */ - - /* Alloc the space for the command and TLVs*/ - result = -ENOMEM; - buf = kzalloc(sizeof(*cmd) + argsize, GFP_KERNEL); - if (buf == NULL) - goto error_alloc; - cmd = buf; - cmd->type = cpu_to_le16(I2400M_MT_SET_INIT_CONFIG); - cmd->length = cpu_to_le16(argsize); - cmd->version = cpu_to_le16(I2400M_L3L4_VERSION); - - /* Copy the TLVs */ - itr = buf + sizeof(*cmd); - for (argc = 0; argc < args; argc++) { - tlv_hdr = arg[argc]; - tlv_size = sizeof(*tlv_hdr) + le16_to_cpu(tlv_hdr->length); - memcpy(itr, tlv_hdr, tlv_size); - itr += tlv_size; - } - - /* Send the message! 
*/ - ack_skb = i2400m_msg_to_dev(i2400m, buf, sizeof(*cmd) + argsize); - result = PTR_ERR(ack_skb); - if (IS_ERR(ack_skb)) { - dev_err(dev, "Failed to issue 'init config' command: %d\n", - result); - - goto error_msg_to_dev; - } - result = i2400m_msg_check_status(wimax_msg_data(ack_skb), - strerr, sizeof(strerr)); - if (result < 0) - dev_err(dev, "'init config' (0x%04x) command failed: %d - %s\n", - I2400M_MT_SET_INIT_CONFIG, result, strerr); - kfree_skb(ack_skb); -error_msg_to_dev: - kfree(buf); -error_alloc: -none: - d_fnend(3, dev, "(i2400m %p arg %p args %zu) = %d\n", - i2400m, arg, args, result); - return result; - -} - -/** - * i2400m_set_idle_timeout - Set the device's idle mode timeout - * - * @i2400m: i2400m device descriptor - * - * @msecs: milliseconds for the timeout to enter idle mode. Between - * 100 to 300000 (5m); 0 to disable. In increments of 100. - * - * After this @msecs of the link being idle (no data being sent or - * received), the device will negotiate with the basestation entering - * idle mode for saving power. The connection is maintained, but - * getting out of it (done in tx.c) will require some negotiation, - * possible crypto re-handshake and a possible DHCP re-lease. - * - * Only available if fw_version >= 0x00090002. - * - * Returns: 0 if ok, < 0 errno code on error. - */ -int i2400m_set_idle_timeout(struct i2400m *i2400m, unsigned msecs) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *ack_skb; - struct { - struct i2400m_l3l4_hdr hdr; - struct i2400m_tlv_config_idle_timeout cit; - } *cmd; - const struct i2400m_l3l4_hdr *ack; - size_t ack_len; - char strerr[32]; - - result = -ENOSYS; - if (i2400m_le_v1_3(i2400m)) - goto error_alloc; - result = -ENOMEM; - cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); - if (cmd == NULL) - goto error_alloc; - cmd->hdr.type = cpu_to_le16(I2400M_MT_GET_STATE); - cmd->hdr.length = cpu_to_le16(sizeof(*cmd) - sizeof(cmd->hdr)); - cmd->hdr.version = cpu_to_le16(I2400M_L3L4_VERSION); - - cmd->cit.hdr.type = - cpu_to_le16(I2400M_TLV_CONFIG_IDLE_TIMEOUT); - cmd->cit.hdr.length = cpu_to_le16(sizeof(cmd->cit.timeout)); - cmd->cit.timeout = cpu_to_le32(msecs); - - ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd)); - if (IS_ERR(ack_skb)) { - dev_err(dev, "Failed to issue 'set idle timeout' command: " - "%ld\n", PTR_ERR(ack_skb)); - result = PTR_ERR(ack_skb); - goto error_msg_to_dev; - } - ack = wimax_msg_data_len(ack_skb, &ack_len); - result = i2400m_msg_check_status(ack, strerr, sizeof(strerr)); - if (result < 0) { - dev_err(dev, "'set idle timeout' (0x%04x) command failed: " - "%d - %s\n", I2400M_MT_GET_STATE, result, strerr); - goto error_cmd_failed; - } - result = 0; - kfree_skb(ack_skb); -error_cmd_failed: -error_msg_to_dev: - kfree(cmd); -error_alloc: - return result; -} - - -/** - * i2400m_dev_initialize - Initialize the device once communications are ready - * - * @i2400m: device descriptor - * - * Returns: 0 if ok, < 0 errno code on error. - * - * Configures the device to work the way we like it. - * - * At the point of this call, the device is registered with the WiMAX - * and netdev stacks, firmware is uploaded and we can talk to the - * device normally. 
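- *
- * In short, the body below collects optional configuration TLVs into
- * one command, ships it, then queries the state (sketch):
- *
- *	args[argc++] = &idle_params.hdr;
- *	...				(more optional TLVs)
- *	i2400m_set_init_config(i2400m, args, argc);
- *	i2400m_cmd_get_state(i2400m);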
- */ -int i2400m_dev_initialize(struct i2400m *i2400m) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct i2400m_tlv_config_idle_parameters idle_params; - struct i2400m_tlv_config_idle_timeout idle_timeout; - struct i2400m_tlv_config_d2h_data_format df; - struct i2400m_tlv_config_dl_host_reorder dlhr; - const struct i2400m_tlv_hdr *args[9]; - unsigned argc = 0; - - d_fnstart(3, dev, "(i2400m %p)\n", i2400m); - if (i2400m_passive_mode) - goto out_passive; - /* Disable idle mode? (enabled by default) */ - if (i2400m_idle_mode_disabled) { - if (i2400m_le_v1_3(i2400m)) { - idle_params.hdr.type = - cpu_to_le16(I2400M_TLV_CONFIG_IDLE_PARAMETERS); - idle_params.hdr.length = cpu_to_le16( - sizeof(idle_params) - sizeof(idle_params.hdr)); - idle_params.idle_timeout = 0; - idle_params.idle_paging_interval = 0; - args[argc++] = &idle_params.hdr; - } else { - idle_timeout.hdr.type = - cpu_to_le16(I2400M_TLV_CONFIG_IDLE_TIMEOUT); - idle_timeout.hdr.length = cpu_to_le16( - sizeof(idle_timeout) - sizeof(idle_timeout.hdr)); - idle_timeout.timeout = 0; - args[argc++] = &idle_timeout.hdr; - } - } - if (i2400m_ge_v1_4(i2400m)) { - /* Enable extended RX data format? */ - df.hdr.type = - cpu_to_le16(I2400M_TLV_CONFIG_D2H_DATA_FORMAT); - df.hdr.length = cpu_to_le16( - sizeof(df) - sizeof(df.hdr)); - df.format = 1; - args[argc++] = &df.hdr; - - /* Enable RX data reordering? - * (switch flipped in rx.c:i2400m_rx_setup() after fw upload) */ - if (i2400m->rx_reorder) { - dlhr.hdr.type = - cpu_to_le16(I2400M_TLV_CONFIG_DL_HOST_REORDER); - dlhr.hdr.length = cpu_to_le16( - sizeof(dlhr) - sizeof(dlhr.hdr)); - dlhr.reorder = 1; - args[argc++] = &dlhr.hdr; - } - } - result = i2400m_set_init_config(i2400m, args, argc); - if (result < 0) - goto error; -out_passive: - /* - * Update state: Here it just calls a get state; parsing the - * result (System State TLV and RF Status TLV [done in the rx - * path hooks]) will set the hardware and software RF-Kill - * status. - */ - result = i2400m_cmd_get_state(i2400m); -error: - if (result < 0) - dev_err(dev, "failed to initialize the device: %d\n", result); - d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); - return result; -} - - -/** - * i2400m_dev_shutdown - Shutdown a running device - * - * @i2400m: device descriptor - * - * Release resources acquired during the running of the device; in - * theory, should also tell the device to go to sleep, switch off the - * radio, all that, but at this point, in most cases (driver - * disconnection, reset handling) we can't even talk to the device. 
- */
-void i2400m_dev_shutdown(struct i2400m *i2400m)
-{
- struct device *dev = i2400m_dev(i2400m);
-
- d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
- d_fnend(3, dev, "(i2400m %p) = void\n", i2400m);
-}
diff --git a/drivers/net/wimax/i2400m/debug-levels.h b/drivers/net/wimax/i2400m/debug-levels.h
deleted file mode 100644
index 00942bb1489b..000000000000
--- a/drivers/net/wimax/i2400m/debug-levels.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Intel Wireless WiMAX Connection 2400m
- * Debug levels control file for the i2400m module
- *
- * Copyright (C) 2007-2008 Intel Corporation
- * Inaky Perez-Gonzalez
- */
-#ifndef __debug_levels__h__
-#define __debug_levels__h__
-
-/* Maximum compile and run time debug level for all submodules */
-#define D_MODULENAME i2400m
-#define D_MASTER CONFIG_WIMAX_I2400M_DEBUG_LEVEL
-
-#include
-
-/* List of all the enabled modules */
-enum d_module {
- D_SUBMODULE_DECLARE(control),
- D_SUBMODULE_DECLARE(driver),
- D_SUBMODULE_DECLARE(debugfs),
- D_SUBMODULE_DECLARE(fw),
- D_SUBMODULE_DECLARE(netdev),
- D_SUBMODULE_DECLARE(rfkill),
- D_SUBMODULE_DECLARE(rx),
- D_SUBMODULE_DECLARE(sysfs),
- D_SUBMODULE_DECLARE(tx),
-};
-
-
-#endif /* #ifndef __debug_levels__h__ */
diff --git a/drivers/net/wimax/i2400m/debugfs.c b/drivers/net/wimax/i2400m/debugfs.c
deleted file mode 100644
index 1c640b41ea4c..000000000000
--- a/drivers/net/wimax/i2400m/debugfs.c
+++ /dev/null
@@ -1,253 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Wireless WiMAX Connection 2400m
- * Debugfs interfaces to manipulate driver and device information
- *
- * Copyright (C) 2007 Intel Corporation
- * Inaky Perez-Gonzalez
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include "i2400m.h"
-
-
-#define D_SUBMODULE debugfs
-#include "debug-levels.h"
-
-static
-int debugfs_netdev_queue_stopped_get(void *data, u64 *val)
-{
- struct i2400m *i2400m = data;
- *val = netif_queue_stopped(i2400m->wimax_dev.net_dev);
- return 0;
-}
-DEFINE_DEBUGFS_ATTRIBUTE(fops_netdev_queue_stopped,
- debugfs_netdev_queue_stopped_get,
- NULL, "%llu\n");
-
-/*
- * We don't allow partial reads of this file, as then the reader would
- * get weirdly confused data as it is updated.
- *
- * So either you read it all or nothing; if you try to read with an offset
- * != 0, we consider you are done reading.
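Distilled, the pattern is: treat any non-zero offset as EOF, so a reader can never observe a half-updated snapshot. A minimal sketch of the same idea with a single hypothetical counter (not the driver's code):

#include <linux/fs.h>
#include <linux/kernel.h>

static ssize_t foo_stat_read(struct file *filp, char __user *buffer,
			     size_t count, loff_t *ppos)
{
	char buf[32];
	int len;

	if (*ppos != 0)	/* non-zero offset: report EOF, never a partial view */
		return 0;
	len = scnprintf(buf, sizeof(buf), "%u\n", 42);
	return simple_read_from_buffer(buffer, count, ppos, buf, len);
}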
- */ -static -ssize_t i2400m_rx_stats_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct i2400m *i2400m = filp->private_data; - char buf[128]; - unsigned long flags; - - if (*ppos != 0) - return 0; - if (count < sizeof(buf)) - return -ENOSPC; - spin_lock_irqsave(&i2400m->rx_lock, flags); - snprintf(buf, sizeof(buf), "%u %u %u %u %u %u %u\n", - i2400m->rx_pl_num, i2400m->rx_pl_min, - i2400m->rx_pl_max, i2400m->rx_num, - i2400m->rx_size_acc, - i2400m->rx_size_min, i2400m->rx_size_max); - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); -} - - -/* Any write clears the stats */ -static -ssize_t i2400m_rx_stats_write(struct file *filp, const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct i2400m *i2400m = filp->private_data; - unsigned long flags; - - spin_lock_irqsave(&i2400m->rx_lock, flags); - i2400m->rx_pl_num = 0; - i2400m->rx_pl_max = 0; - i2400m->rx_pl_min = UINT_MAX; - i2400m->rx_num = 0; - i2400m->rx_size_acc = 0; - i2400m->rx_size_min = UINT_MAX; - i2400m->rx_size_max = 0; - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - return count; -} - -static -const struct file_operations i2400m_rx_stats_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = i2400m_rx_stats_read, - .write = i2400m_rx_stats_write, - .llseek = default_llseek, -}; - - -/* See i2400m_rx_stats_read() */ -static -ssize_t i2400m_tx_stats_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct i2400m *i2400m = filp->private_data; - char buf[128]; - unsigned long flags; - - if (*ppos != 0) - return 0; - if (count < sizeof(buf)) - return -ENOSPC; - spin_lock_irqsave(&i2400m->tx_lock, flags); - snprintf(buf, sizeof(buf), "%u %u %u %u %u %u %u\n", - i2400m->tx_pl_num, i2400m->tx_pl_min, - i2400m->tx_pl_max, i2400m->tx_num, - i2400m->tx_size_acc, - i2400m->tx_size_min, i2400m->tx_size_max); - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); -} - -/* Any write clears the stats */ -static -ssize_t i2400m_tx_stats_write(struct file *filp, const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct i2400m *i2400m = filp->private_data; - unsigned long flags; - - spin_lock_irqsave(&i2400m->tx_lock, flags); - i2400m->tx_pl_num = 0; - i2400m->tx_pl_max = 0; - i2400m->tx_pl_min = UINT_MAX; - i2400m->tx_num = 0; - i2400m->tx_size_acc = 0; - i2400m->tx_size_min = UINT_MAX; - i2400m->tx_size_max = 0; - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - return count; -} - -static -const struct file_operations i2400m_tx_stats_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = i2400m_tx_stats_read, - .write = i2400m_tx_stats_write, - .llseek = default_llseek, -}; - - -/* Write 1 to ask the device to go into suspend */ -static -int debugfs_i2400m_suspend_set(void *data, u64 val) -{ - int result; - struct i2400m *i2400m = data; - result = i2400m_cmd_enter_powersave(i2400m); - if (result >= 0) - result = 0; - return result; -} -DEFINE_DEBUGFS_ATTRIBUTE(fops_i2400m_suspend, - NULL, debugfs_i2400m_suspend_set, - "%llu\n"); - -/* - * Reset the device - * - * Write 0 to ask the device to soft reset, 1 to cold reset, 2 to bus - * reset (as defined by enum i2400m_reset_type). 
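The attribute just described is an ordinary debugfs file, so user space drives it with plain write(2). A sketch (hypothetical userspace helper; the actual path depends on the debugfs mount point and the device's directory name):

#include <fcntl.h>
#include <unistd.h>

/* e.g. path = ".../debug/wimax:wmx0/i2400m/reset" (assumed); "0" = warm/soft reset */
int i2400m_request_warm_reset(const char *path)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, "0\n", 2);
	close(fd);
	return n == 2 ? 0 : -1;
}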
- */ -static -int debugfs_i2400m_reset_set(void *data, u64 val) -{ - int result; - struct i2400m *i2400m = data; - enum i2400m_reset_type rt = val; - switch(rt) { - case I2400M_RT_WARM: - case I2400M_RT_COLD: - case I2400M_RT_BUS: - result = i2400m_reset(i2400m, rt); - if (result >= 0) - result = 0; - break; - default: - result = -EINVAL; - } - return result; -} -DEFINE_DEBUGFS_ATTRIBUTE(fops_i2400m_reset, - NULL, debugfs_i2400m_reset_set, - "%llu\n"); - -void i2400m_debugfs_add(struct i2400m *i2400m) -{ - struct dentry *dentry = i2400m->wimax_dev.debugfs_dentry; - - dentry = debugfs_create_dir("i2400m", dentry); - i2400m->debugfs_dentry = dentry; - - d_level_register_debugfs("dl_", control, dentry); - d_level_register_debugfs("dl_", driver, dentry); - d_level_register_debugfs("dl_", debugfs, dentry); - d_level_register_debugfs("dl_", fw, dentry); - d_level_register_debugfs("dl_", netdev, dentry); - d_level_register_debugfs("dl_", rfkill, dentry); - d_level_register_debugfs("dl_", rx, dentry); - d_level_register_debugfs("dl_", tx, dentry); - - debugfs_create_size_t("tx_in", 0400, dentry, &i2400m->tx_in); - debugfs_create_size_t("tx_out", 0400, dentry, &i2400m->tx_out); - debugfs_create_u32("state", 0600, dentry, &i2400m->state); - - /* - * Trace received messages from user space - * - * In order to tap the bidirectional message stream in the - * 'msg' pipe, user space can read from the 'msg' pipe; - * however, due to limitations in libnl, we can't know what - * the different applications are sending down to the kernel. - * - * So we have this hack where the driver will echo any message - * received on the msg pipe from user space [through a call to - * wimax_dev->op_msg_from_user() into - * i2400m_op_msg_from_user()] into the 'trace' pipe that this - * driver creates. - * - * So then, reading from both the 'trace' and 'msg' pipes in - * user space will provide a full dump of the traffic. - * - * Write 1 to activate, 0 to clear. - * - * It is not really very atomic, but it is also not too - * critical. - */ - debugfs_create_u8("trace_msg_from_user", 0600, dentry, - &i2400m->trace_msg_from_user); - - debugfs_create_file("netdev_queue_stopped", 0400, dentry, i2400m, - &fops_netdev_queue_stopped); - - debugfs_create_file("rx_stats", 0600, dentry, i2400m, - &i2400m_rx_stats_fops); - - debugfs_create_file("tx_stats", 0600, dentry, i2400m, - &i2400m_tx_stats_fops); - - debugfs_create_file("suspend", 0200, dentry, i2400m, - &fops_i2400m_suspend); - - debugfs_create_file("reset", 0200, dentry, i2400m, &fops_i2400m_reset); -} - -void i2400m_debugfs_rm(struct i2400m *i2400m) -{ - debugfs_remove_recursive(i2400m->debugfs_dentry); -} diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c deleted file mode 100644 index ecb3fccca603..000000000000 --- a/drivers/net/wimax/i2400m/driver.c +++ /dev/null @@ -1,1002 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel Wireless WiMAX Connection 2400m - * Generic probe/disconnect, reset and message passing - * - * Copyright (C) 2007-2008 Intel Corporation - * Inaky Perez-Gonzalez - * - * See i2400m.h for driver documentation. This contains helpers for - * the driver model glue [_setup()/_release()], handling device resets - * [_dev_reset_handle()], and the backends for the WiMAX stack ops - * reset [_op_reset()] and message from user [_op_msg_from_user()]. 
- *
- * ROADMAP:
- *
- * i2400m_op_msg_from_user()
- * i2400m_msg_to_dev()
- * wimax_msg_to_user_send()
- *
- * i2400m_op_reset()
- * i2400m->bus_reset()
- *
- * i2400m_dev_reset_handle()
- * __i2400m_dev_reset_handle()
- * __i2400m_dev_stop()
- * __i2400m_dev_start()
- *
- * i2400m_setup()
- * i2400m->bus_setup()
- * i2400m_bootrom_init()
- * register_netdev()
- * wimax_dev_add()
- * i2400m_dev_start()
- * __i2400m_dev_start()
- * i2400m_dev_bootstrap()
- * i2400m_tx_setup()
- * i2400m->bus_dev_start()
- * i2400m_firmware_check()
- * i2400m_check_mac_addr()
- *
- * i2400m_release()
- * i2400m_dev_stop()
- * __i2400m_dev_stop()
- * i2400m_dev_shutdown()
- * i2400m->bus_dev_stop()
- * i2400m_tx_release()
- * i2400m->bus_release()
- * wimax_dev_rm()
- * unregister_netdev()
- */
-#include "i2400m.h"
-#include
-#include
-#include
-#include
-#include
-#include
-
-#define D_SUBMODULE driver
-#include "debug-levels.h"
-
-
-static char i2400m_debug_params[128];
-module_param_string(debug, i2400m_debug_params, sizeof(i2400m_debug_params),
- 0644);
-MODULE_PARM_DESC(debug,
- "String of space-separated NAME:VALUE pairs, where NAMEs "
- "are the different debug submodules and VALUE is the "
- "initial debug value to set.");
-
-static char i2400m_barkers_params[128];
-module_param_string(barkers, i2400m_barkers_params,
- sizeof(i2400m_barkers_params), 0644);
-MODULE_PARM_DESC(barkers,
- "String of comma-separated 32-bit values; each is "
- "recognized as the value the device sends as a reboot "
- "signal; values are appended to a list--setting one value "
- "as zero cleans the existing list and starts a new one.");
-
-/*
- * WiMAX stack operation: relay a message from user space
- *
- * @wimax_dev: device descriptor
- * @pipe_name: named pipe the message is for
- * @msg_buf: pointer to the message bytes
- * @msg_len: length of the buffer
- * @genl_info: passed by the generic netlink layer
- *
- * The WiMAX stack will call this function when a message was received
- * from user space.
- *
- * For the i2400m, this is an L3L4 message, as specified in
- * include/linux/wimax/i2400m.h, and thus prefixed with a 'struct
- * i2400m_l3l4_hdr'. Driver (and device) expect the messages to be
- * coded in Little Endian.
- *
- * This function just verifies that the header declaration and the
- * payload are consistent and then deals with it, either forwarding it
- * to the device or processing it locally.
- *
- * In the i2400m, messages are basically commands that will carry an
- * ack, so we use i2400m_msg_to_dev() and then deliver the ack back to
- * user space. The rx.c code might intercept the response and use it
- * to update the driver's state, but then it will pass it on so it can
- * be relayed back to user space.
- *
- * Note that asynchronous events from the device are processed and
- * sent to user space in rx.c.
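What "header declaration and payload are consistent" amounts to can be sketched in a few lines (an illustration, not the driver's actual check; struct i2400m_l3l4_hdr came from the old include/linux/wimax/i2400m.h):

#include <linux/types.h>
#include <linux/wimax/i2400m.h>	/* struct i2400m_l3l4_hdr, as it was */

/* Sketch: the buffer must hold the header plus the payload the header
 * declares; all header fields are little endian on the wire. */
static bool i2400m_l3l4_msg_sane(const void *msg_buf, size_t msg_len)
{
	const struct i2400m_l3l4_hdr *hdr = msg_buf;

	if (msg_len < sizeof(*hdr))
		return false;
	return sizeof(*hdr) + le16_to_cpu(hdr->length) <= msg_len;
}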
- */ -static -int i2400m_op_msg_from_user(struct wimax_dev *wimax_dev, - const char *pipe_name, - const void *msg_buf, size_t msg_len, - const struct genl_info *genl_info) -{ - int result; - struct i2400m *i2400m = wimax_dev_to_i2400m(wimax_dev); - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *ack_skb; - - d_fnstart(4, dev, "(wimax_dev %p [i2400m %p] msg_buf %p " - "msg_len %zu genl_info %p)\n", wimax_dev, i2400m, - msg_buf, msg_len, genl_info); - ack_skb = i2400m_msg_to_dev(i2400m, msg_buf, msg_len); - result = PTR_ERR(ack_skb); - if (IS_ERR(ack_skb)) - goto error_msg_to_dev; - result = wimax_msg_send(&i2400m->wimax_dev, ack_skb); -error_msg_to_dev: - d_fnend(4, dev, "(wimax_dev %p [i2400m %p] msg_buf %p msg_len %zu " - "genl_info %p) = %d\n", wimax_dev, i2400m, msg_buf, msg_len, - genl_info, result); - return result; -} - - -/* - * Context to wait for a reset to finalize - */ -struct i2400m_reset_ctx { - struct completion completion; - int result; -}; - - -/* - * WiMAX stack operation: reset a device - * - * @wimax_dev: device descriptor - * - * See the documentation for wimax_reset() and wimax_dev->op_reset for - * the requirements of this function. The WiMAX stack guarantees - * serialization on calls to this function. - * - * Do a warm reset on the device; if it fails, resort to a cold reset - * and return -ENODEV. On successful warm reset, we need to block - * until it is complete. - * - * The bus-driver implementation of reset takes care of falling back - * to cold reset if warm fails. - */ -static -int i2400m_op_reset(struct wimax_dev *wimax_dev) -{ - int result; - struct i2400m *i2400m = wimax_dev_to_i2400m(wimax_dev); - struct device *dev = i2400m_dev(i2400m); - struct i2400m_reset_ctx ctx = { - .completion = COMPLETION_INITIALIZER_ONSTACK(ctx.completion), - .result = 0, - }; - - d_fnstart(4, dev, "(wimax_dev %p)\n", wimax_dev); - mutex_lock(&i2400m->init_mutex); - i2400m->reset_ctx = &ctx; - mutex_unlock(&i2400m->init_mutex); - result = i2400m_reset(i2400m, I2400M_RT_WARM); - if (result < 0) - goto out; - result = wait_for_completion_timeout(&ctx.completion, 4*HZ); - if (result == 0) - result = -ETIMEDOUT; - else if (result > 0) - result = ctx.result; - /* if result < 0, pass it on */ - mutex_lock(&i2400m->init_mutex); - i2400m->reset_ctx = NULL; - mutex_unlock(&i2400m->init_mutex); -out: - d_fnend(4, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); - return result; -} - - -/* - * Check the MAC address we got from boot mode is ok - * - * @i2400m: device descriptor - * - * Returns: 0 if ok, < 0 errno code on error. 
- */
-static
-int i2400m_check_mac_addr(struct i2400m *i2400m)
-{
- int result;
- struct device *dev = i2400m_dev(i2400m);
- struct sk_buff *skb;
- const struct i2400m_tlv_detailed_device_info *ddi;
- struct net_device *net_dev = i2400m->wimax_dev.net_dev;
-
- d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
- skb = i2400m_get_device_info(i2400m);
- if (IS_ERR(skb)) {
- result = PTR_ERR(skb);
- dev_err(dev, "Cannot verify MAC address, error reading: %d\n",
- result);
- goto error;
- }
- /* Extract MAC address */
- ddi = (void *) skb->data;
- BUILD_BUG_ON(ETH_ALEN != sizeof(ddi->mac_address));
- d_printf(2, dev, "GET DEVICE INFO: mac addr %pM\n",
- ddi->mac_address);
- if (!memcmp(net_dev->perm_addr, ddi->mac_address,
- sizeof(ddi->mac_address)))
- goto ok;
- dev_warn(dev, "warning: device reports a different MAC address "
- "to that of boot mode's\n");
- dev_warn(dev, "device reports %pM\n", ddi->mac_address);
- dev_warn(dev, "boot mode reported %pM\n", net_dev->perm_addr);
- if (is_zero_ether_addr(ddi->mac_address))
- dev_err(dev, "device reports an invalid MAC address, "
- "not updating\n");
- else {
- dev_warn(dev, "updating MAC address\n");
- net_dev->addr_len = ETH_ALEN;
- memcpy(net_dev->perm_addr, ddi->mac_address, ETH_ALEN);
- memcpy(net_dev->dev_addr, ddi->mac_address, ETH_ALEN);
- }
-ok:
- result = 0;
- kfree_skb(skb);
-error:
- d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result);
- return result;
-}
-
-
-/**
- * __i2400m_dev_start - Bring up driver communication with the device
- *
- * @i2400m: device descriptor
- * @flags: boot mode flags
- *
- * Returns: 0 if ok, < 0 errno code on error.
- *
- * Uploads firmware and brings up all the resources needed to be able
- * to communicate with the device.
- *
- * The workqueue has to be set up early, at least before RX handling
- * (its only real user for now) so it can process reports as they
- * arrive. We also want to destroy it if we retry, to make sure it is
- * flushed...easier like this.
- *
- * TX needs to be set up before the bus-specific code (otherwise on
- * shutdown, the bus-tx code could try to access it).
- */
-static
-int __i2400m_dev_start(struct i2400m *i2400m, enum i2400m_bri flags)
-{
- int result;
- struct wimax_dev *wimax_dev = &i2400m->wimax_dev;
- struct net_device *net_dev = wimax_dev->net_dev;
- struct device *dev = i2400m_dev(i2400m);
- int times = i2400m->bus_bm_retries;
-
- d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
-retry:
- result = i2400m_dev_bootstrap(i2400m, flags);
- if (result < 0) {
- dev_err(dev, "cannot bootstrap device: %d\n", result);
- goto error_bootstrap;
- }
- result = i2400m_tx_setup(i2400m);
- if (result < 0)
- goto error_tx_setup;
- result = i2400m_rx_setup(i2400m);
- if (result < 0)
- goto error_rx_setup;
- i2400m->work_queue = create_singlethread_workqueue(wimax_dev->name);
- if (i2400m->work_queue == NULL) {
- result = -ENOMEM;
- dev_err(dev, "cannot create workqueue\n");
- goto error_create_workqueue;
- }
- if (i2400m->bus_dev_start) {
- result = i2400m->bus_dev_start(i2400m);
- if (result < 0)
- goto error_bus_dev_start;
- }
- i2400m->ready = 1;
- wmb(); /* see i2400m->ready's documentation */
- /* process pending reports from the device */
- queue_work(i2400m->work_queue, &i2400m->rx_report_ws);
- result = i2400m_firmware_check(i2400m); /* fw versions ok?
*/
- if (result < 0)
- goto error_fw_check;
- /* At this point it is ok to send commands to the device */
- result = i2400m_check_mac_addr(i2400m);
- if (result < 0)
- goto error_check_mac_addr;
- result = i2400m_dev_initialize(i2400m);
- if (result < 0)
- goto error_dev_initialize;
-
- /* We don't want any additional unwanted error recovery triggered
- * from any other context so if anything went wrong before we come
- * here, let's keep i2400m->error_recovery untouched and leave it to
- * dev_reset_handle(). See dev_reset_handle(). */
-
- atomic_dec(&i2400m->error_recovery);
- /* Everything works so far, ok, now we are ready to
- * take error recovery if it's required. */
-
- /* At this point, reports will come for the device and set it
- * to the right state if it is different than UNINITIALIZED */
- d_fnend(3, dev, "(net_dev %p [i2400m %p]) = %d\n",
- net_dev, i2400m, result);
- return result;
-
-error_dev_initialize:
-error_check_mac_addr:
-error_fw_check:
- i2400m->ready = 0;
- wmb(); /* see i2400m->ready's documentation */
- flush_workqueue(i2400m->work_queue);
- if (i2400m->bus_dev_stop)
- i2400m->bus_dev_stop(i2400m);
-error_bus_dev_start:
- destroy_workqueue(i2400m->work_queue);
-error_create_workqueue:
- i2400m_rx_release(i2400m);
-error_rx_setup:
- i2400m_tx_release(i2400m);
-error_tx_setup:
-error_bootstrap:
- if (result == -EL3RST && times-- > 0) {
- flags = I2400M_BRI_SOFT|I2400M_BRI_MAC_REINIT;
- goto retry;
- }
- d_fnend(3, dev, "(net_dev %p [i2400m %p]) = %d\n",
- net_dev, i2400m, result);
- return result;
-}
-
-
-static
-int i2400m_dev_start(struct i2400m *i2400m, enum i2400m_bri bm_flags)
-{
- int result = 0;
- mutex_lock(&i2400m->init_mutex); /* Well, start the device */
- if (i2400m->updown == 0) {
- result = __i2400m_dev_start(i2400m, bm_flags);
- if (result >= 0) {
- i2400m->updown = 1;
- i2400m->alive = 1;
- wmb();/* see i2400m->updown and i2400m->alive's doc */
- }
- }
- mutex_unlock(&i2400m->init_mutex);
- return result;
-}
-
-
-/**
- * i2400m_dev_stop - Tear down driver communication with the device
- *
- * @i2400m: device descriptor
- *
- * Releases all the resources allocated to communicate with the
- * device. Note we cannot destroy the workqueue earlier as until RX is
- * fully destroyed, it could still try to schedule jobs.
- */
-static
-void __i2400m_dev_stop(struct i2400m *i2400m)
-{
- struct wimax_dev *wimax_dev = &i2400m->wimax_dev;
- struct device *dev = i2400m_dev(i2400m);
-
- d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
- wimax_state_change(wimax_dev, __WIMAX_ST_QUIESCING);
- i2400m_msg_to_dev_cancel_wait(i2400m, -EL3RST);
- complete(&i2400m->msg_completion);
- i2400m_net_wake_stop(i2400m);
- i2400m_dev_shutdown(i2400m);
- /*
- * Make sure no report hooks are running *before* we stop the
- * communication infrastructure with the device.
- */
- i2400m->ready = 0; /* nobody can queue work anymore */
- wmb(); /* see i2400m->ready's documentation */
- flush_workqueue(i2400m->work_queue);
-
- if (i2400m->bus_dev_stop)
- i2400m->bus_dev_stop(i2400m);
- destroy_workqueue(i2400m->work_queue);
- i2400m_rx_release(i2400m);
- i2400m_tx_release(i2400m);
- wimax_state_change(wimax_dev, WIMAX_ST_DOWN);
- d_fnend(3, dev, "(i2400m %p) = 0\n", i2400m);
-}
-
-
-/*
- * Watch out -- we only need to stop if there is a need for it. The
- * device could have reset itself and failed to come up again (see
- * _i2400m_dev_reset_handle()).
- */
-static
-void i2400m_dev_stop(struct i2400m *i2400m)
-{
- mutex_lock(&i2400m->init_mutex);
- if (i2400m->updown) {
- __i2400m_dev_stop(i2400m);
- i2400m->updown = 0;
- i2400m->alive = 0;
- wmb(); /* see i2400m->updown and i2400m->alive's doc */
- }
- mutex_unlock(&i2400m->init_mutex);
-}
-
-
-/*
- * Listen to PM events to cache the firmware before suspend/hibernation
- *
- * When the device comes out of suspend, it might go into reset and
- * firmware has to be uploaded again. At resume, most of the time, we
- * can't load firmware images from disk, so we need to cache it.
- *
- * i2400m_fw_cache() will allocate a kobject and attach the firmware
- * to it; that way we don't have to worry too much about the fw loader
- * hitting a race condition.
- *
- * Note: modus operandi stolen from the Orinoco driver; thx.
- */
-static
-int i2400m_pm_notifier(struct notifier_block *notifier,
- unsigned long pm_event,
- void *unused)
-{
- struct i2400m *i2400m =
- container_of(notifier, struct i2400m, pm_notifier);
- struct device *dev = i2400m_dev(i2400m);
-
- d_fnstart(3, dev, "(i2400m %p pm_event %lx)\n", i2400m, pm_event);
- switch (pm_event) {
- case PM_HIBERNATION_PREPARE:
- case PM_SUSPEND_PREPARE:
- i2400m_fw_cache(i2400m);
- break;
- case PM_POST_RESTORE:
- /* Restore from hibernation failed. We need to clean
- * up in exactly the same way, so fall through. */
- case PM_POST_HIBERNATION:
- case PM_POST_SUSPEND:
- i2400m_fw_uncache(i2400m);
- break;
-
- case PM_RESTORE_PREPARE:
- default:
- break;
- }
- d_fnend(3, dev, "(i2400m %p pm_event %lx) = void\n", i2400m, pm_event);
- return NOTIFY_DONE;
-}
-
-
-/*
- * pre-reset is called before a device goes into reset
- *
- * This has to be followed by a call to i2400m_post_reset(), otherwise
- * bad things might happen.
- */
-int i2400m_pre_reset(struct i2400m *i2400m)
-{
- struct device *dev = i2400m_dev(i2400m);
-
- d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
- d_printf(1, dev, "pre-reset shut down\n");
-
- mutex_lock(&i2400m->init_mutex);
- if (i2400m->updown) {
- netif_tx_disable(i2400m->wimax_dev.net_dev);
- __i2400m_dev_stop(i2400m);
- /* don't set updown to zero -- this way
- * post_reset can restore properly */
- }
- mutex_unlock(&i2400m->init_mutex);
- if (i2400m->bus_release)
- i2400m->bus_release(i2400m);
- d_fnend(3, dev, "(i2400m %p) = 0\n", i2400m);
- return 0;
-}
-EXPORT_SYMBOL_GPL(i2400m_pre_reset);
-
-
-/*
- * Restore device state after a reset
- *
- * Do the work needed after a device reset to bring it up to the same
- * state as it was before the reset.
- *
- * NOTE: this requires i2400m->init_mutex taken
- */
-int i2400m_post_reset(struct i2400m *i2400m)
-{
- int result = 0;
- struct device *dev = i2400m_dev(i2400m);
-
- d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
- d_printf(1, dev, "post-reset start\n");
- if (i2400m->bus_setup) {
- result = i2400m->bus_setup(i2400m);
- if (result < 0) {
- dev_err(dev, "bus-specific setup failed: %d\n",
- result);
- goto error_bus_setup;
- }
- }
- mutex_lock(&i2400m->init_mutex);
- if (i2400m->updown) {
- result = __i2400m_dev_start(
- i2400m, I2400M_BRI_SOFT | I2400M_BRI_MAC_REINIT);
- if (result < 0)
- goto error_dev_start;
- }
- mutex_unlock(&i2400m->init_mutex);
- d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result);
- return result;
-
-error_dev_start:
- if (i2400m->bus_release)
- i2400m->bus_release(i2400m);
- /* even if the device was up, it could not be recovered, so we
- * mark it as down.
*/
- i2400m->updown = 0;
- wmb(); /* see i2400m->updown's documentation */
- mutex_unlock(&i2400m->init_mutex);
-error_bus_setup:
- d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result);
- return result;
-}
-EXPORT_SYMBOL_GPL(i2400m_post_reset);
-
-
-/*
- * The device has rebooted; fix up the device and the driver
- *
- * Tear down the driver communication with the device, reload the
- * firmware and reinitialize the communication with the device.
- *
- * If someone calls a reset when the device's firmware is down, in
- * theory we won't see it because we are not listening. However, just
- * in case, leave the code to handle it.
- *
- * If there is a reset context, use it; this means someone is waiting
- * for us to tell them when the reset operation is complete and the
- * device is ready to rock again.
- *
- * NOTE: if we are in the process of bringing up or down the
- * communication with the device [running i2400m_dev_start() or
- * _stop()], don't do anything, let it fail and handle it.
- *
- * This function is always run in a thread context
- *
- * This function gets passed, as payload to i2400m_work(), a 'const
- * char *' ptr with a "reason" why the reset happened (for messages).
- */
-static
-void __i2400m_dev_reset_handle(struct work_struct *ws)
-{
- struct i2400m *i2400m = container_of(ws, struct i2400m, reset_ws);
- const char *reason = i2400m->reset_reason;
- struct device *dev = i2400m_dev(i2400m);
- struct i2400m_reset_ctx *ctx = i2400m->reset_ctx;
- int result;
-
- d_fnstart(3, dev, "(ws %p i2400m %p reason %s)\n", ws, i2400m, reason);
-
- i2400m->boot_mode = 1;
- wmb(); /* Make sure i2400m_msg_to_dev() sees boot_mode */
-
- result = 0;
- if (mutex_trylock(&i2400m->init_mutex) == 0) {
- /* We are still in i2400m_dev_start() [let it fail] or
- * i2400m_dev_stop() [we are shutting down anyway, so
- * ignore it] or we are resetting somewhere else. */
- dev_err(dev, "device rebooted somewhere else?\n");
- i2400m_msg_to_dev_cancel_wait(i2400m, -EL3RST);
- complete(&i2400m->msg_completion);
- goto out;
- }
-
- dev_err(dev, "%s: reinitializing driver\n", reason);
- rmb();
- if (i2400m->updown) {
- __i2400m_dev_stop(i2400m);
- i2400m->updown = 0;
- wmb(); /* see i2400m->updown's documentation */
- }
-
- if (i2400m->alive) {
- result = __i2400m_dev_start(i2400m,
- I2400M_BRI_SOFT | I2400M_BRI_MAC_REINIT);
- if (result < 0) {
- dev_err(dev, "%s: cannot start the device: %d\n",
- reason, result);
- result = -EUCLEAN;
- if (atomic_read(&i2400m->bus_reset_retries)
- >= I2400M_BUS_RESET_RETRIES) {
- result = -ENODEV;
- dev_err(dev, "tried too many times to "
- "reset the device, giving up\n");
- }
- }
- }
-
- if (i2400m->reset_ctx) {
- ctx->result = result;
- complete(&ctx->completion);
- }
- mutex_unlock(&i2400m->init_mutex);
- if (result == -EUCLEAN) {
- /*
- * We come here because the reset during operational mode
- * wasn't successfully done and need to proceed to a bus
- * reset. For the dev_reset_handle() to be able to handle
- * the reset event later properly, we restore boot_mode back
- * to the state before previous reset.
i.e., just like we are
- * issuing the bus reset for the first time
- */
- i2400m->boot_mode = 0;
- wmb();
-
- atomic_inc(&i2400m->bus_reset_retries);
- /* oops, need to clean up [w/ init_mutex not held] */
- result = i2400m_reset(i2400m, I2400M_RT_BUS);
- if (result >= 0)
- result = -ENODEV;
- } else {
- rmb();
- if (i2400m->alive) {
- /* great, we expect the device state up and
- * dev_start() actually brings the device state up */
- i2400m->updown = 1;
- wmb();
- atomic_set(&i2400m->bus_reset_retries, 0);
- }
- }
-out:
- d_fnend(3, dev, "(ws %p i2400m %p reason %s) = void\n",
- ws, i2400m, reason);
-}
-
-
-/**
- * i2400m_dev_reset_handle - Handle a device's reset in a thread context
- *
- * Schedule a device reset handling out on a thread context, so it
- * is safe to call from atomic context. We can't use the i2400m's
- * queue as we are going to destroy it and reinitialize it as part of
- * the driver bringup/bringdown process.
- *
- * See __i2400m_dev_reset_handle() for details; that takes care of
- * reinitializing the driver to handle the reset, calling into the
- * bus-specific function ops as needed.
- */
-int i2400m_dev_reset_handle(struct i2400m *i2400m, const char *reason)
-{
- i2400m->reset_reason = reason;
- return schedule_work(&i2400m->reset_ws);
-}
-EXPORT_SYMBOL_GPL(i2400m_dev_reset_handle);
-
-
- /*
- * The actual work of error recovery.
- *
- * The current implementation of error recovery is to trigger a bus reset.
- */
-static
-void __i2400m_error_recovery(struct work_struct *ws)
-{
- struct i2400m *i2400m = container_of(ws, struct i2400m, recovery_ws);
-
- i2400m_reset(i2400m, I2400M_RT_BUS);
-}
-
-/*
- * Schedule a work struct for error recovery.
- *
- * The intention of error recovery is to bring back the device to some
- * known state whenever TX sees -110 (-ETIMEDOUT) on copying the data to
- * the device. The TX failure could mean the device bus is stuck, so the
- * current error recovery implementation is to trigger a bus reset to the
- * device and hopefully it can bring back the device.
- *
- * The actual work of error recovery has to be in a thread context because
- * it is kicked off in the TX thread (i2400ms->tx_workqueue) which is to be
- * destroyed by the error recovery mechanism (currently a bus reset).
- *
- * Also, there may be already a queue of TX works that all hit
- * the -ETIMEDOUT error condition because the device is stuck already.
- * Since bus reset is used as the error recovery mechanism and we don't
- * want consecutive bus resets simply because the multiple TX works
- * in the queue all hit the same device erratum, the flag "error_recovery"
- * is introduced for preventing unwanted consecutive bus resets.
- *
- * Error recovery shall only be invoked again if the previous one has
- * completed. The flag error_recovery is set when the error recovery
- * mechanism is scheduled, and is checked when we need to schedule another
- * error recovery. If it is in place already, then we shouldn't schedule
- * another one.
- */
-void i2400m_error_recovery(struct i2400m *i2400m)
-{
- if (atomic_add_return(1, &i2400m->error_recovery) == 1)
- schedule_work(&i2400m->recovery_ws);
- else
- atomic_dec(&i2400m->error_recovery);
-}
-EXPORT_SYMBOL_GPL(i2400m_error_recovery);
-
-/*
- * Alloc the command and ack buffers for boot mode
- *
- * Get the buffers needed to deal with boot mode messages.
- */
-static
-int i2400m_bm_buf_alloc(struct i2400m *i2400m)
-{
- i2400m->bm_cmd_buf = kzalloc(I2400M_BM_CMD_BUF_SIZE, GFP_KERNEL);
- if (i2400m->bm_cmd_buf == NULL)
- goto error_bm_cmd_kzalloc;
- i2400m->bm_ack_buf = kzalloc(I2400M_BM_ACK_BUF_SIZE, GFP_KERNEL);
- if (i2400m->bm_ack_buf == NULL)
- goto error_bm_ack_buf_kzalloc;
- return 0;
-
-error_bm_ack_buf_kzalloc:
- kfree(i2400m->bm_cmd_buf);
-error_bm_cmd_kzalloc:
- return -ENOMEM;
-}
-
-
-/*
- * Free boot mode command and ack buffers.
- */
-static
-void i2400m_bm_buf_free(struct i2400m *i2400m)
-{
- kfree(i2400m->bm_ack_buf);
- kfree(i2400m->bm_cmd_buf);
-}
-
-
-/**
- * i2400m_init - Initialize a 'struct i2400m' from all zeroes
- *
- * This is a bus-generic API call.
- */
-void i2400m_init(struct i2400m *i2400m)
-{
- wimax_dev_init(&i2400m->wimax_dev);
-
- i2400m->boot_mode = 1;
- i2400m->rx_reorder = 1;
- init_waitqueue_head(&i2400m->state_wq);
-
- spin_lock_init(&i2400m->tx_lock);
- i2400m->tx_pl_min = UINT_MAX;
- i2400m->tx_size_min = UINT_MAX;
-
- spin_lock_init(&i2400m->rx_lock);
- i2400m->rx_pl_min = UINT_MAX;
- i2400m->rx_size_min = UINT_MAX;
- INIT_LIST_HEAD(&i2400m->rx_reports);
- INIT_WORK(&i2400m->rx_report_ws, i2400m_report_hook_work);
-
- mutex_init(&i2400m->msg_mutex);
- init_completion(&i2400m->msg_completion);
-
- mutex_init(&i2400m->init_mutex);
- /* wake_tx_ws is initialized in i2400m_tx_setup() */
-
- INIT_WORK(&i2400m->reset_ws, __i2400m_dev_reset_handle);
- INIT_WORK(&i2400m->recovery_ws, __i2400m_error_recovery);
-
- atomic_set(&i2400m->bus_reset_retries, 0);
-
- i2400m->alive = 0;
-
- /* initialize error_recovery to 1 to denote we
- * are not yet ready to take any error recovery */
- atomic_set(&i2400m->error_recovery, 1);
-}
-EXPORT_SYMBOL_GPL(i2400m_init);
-
-
-int i2400m_reset(struct i2400m *i2400m, enum i2400m_reset_type rt)
-{
- struct net_device *net_dev = i2400m->wimax_dev.net_dev;
-
- /*
- * Make sure we stop TXs and down the carrier before
- * resetting; this is needed to avoid things like
- * i2400m_wake_tx() scheduling stuff in parallel.
- */
- if (net_dev->reg_state == NETREG_REGISTERED) {
- netif_tx_disable(net_dev);
- netif_carrier_off(net_dev);
- }
- return i2400m->bus_reset(i2400m, rt);
-}
-EXPORT_SYMBOL_GPL(i2400m_reset);
-
-
-/**
- * i2400m_setup - bus-generic setup function for the i2400m device
- *
- * @i2400m: device descriptor (bus-specific parts have been initialized)
- *
- * Returns: 0 if ok, < 0 errno code on error.
- *
- * Sets up basic device communication infrastructure, boots the ROM to
- * read the MAC address, registers with the WiMAX and network stacks
- * and then brings up the device.
- */ -int i2400m_setup(struct i2400m *i2400m, enum i2400m_bri bm_flags) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct wimax_dev *wimax_dev = &i2400m->wimax_dev; - struct net_device *net_dev = i2400m->wimax_dev.net_dev; - - d_fnstart(3, dev, "(i2400m %p)\n", i2400m); - - snprintf(wimax_dev->name, sizeof(wimax_dev->name), - "i2400m-%s:%s", dev->bus->name, dev_name(dev)); - - result = i2400m_bm_buf_alloc(i2400m); - if (result < 0) { - dev_err(dev, "cannot allocate bootmode scratch buffers\n"); - goto error_bm_buf_alloc; - } - - if (i2400m->bus_setup) { - result = i2400m->bus_setup(i2400m); - if (result < 0) { - dev_err(dev, "bus-specific setup failed: %d\n", - result); - goto error_bus_setup; - } - } - - result = i2400m_bootrom_init(i2400m, bm_flags); - if (result < 0) { - dev_err(dev, "read mac addr: bootrom init " - "failed: %d\n", result); - goto error_bootrom_init; - } - result = i2400m_read_mac_addr(i2400m); - if (result < 0) - goto error_read_mac_addr; - eth_random_addr(i2400m->src_mac_addr); - - i2400m->pm_notifier.notifier_call = i2400m_pm_notifier; - register_pm_notifier(&i2400m->pm_notifier); - - result = register_netdev(net_dev); /* Okey dokey, bring it up */ - if (result < 0) { - dev_err(dev, "cannot register i2400m network device: %d\n", - result); - goto error_register_netdev; - } - netif_carrier_off(net_dev); - - i2400m->wimax_dev.op_msg_from_user = i2400m_op_msg_from_user; - i2400m->wimax_dev.op_rfkill_sw_toggle = i2400m_op_rfkill_sw_toggle; - i2400m->wimax_dev.op_reset = i2400m_op_reset; - - result = wimax_dev_add(&i2400m->wimax_dev, net_dev); - if (result < 0) - goto error_wimax_dev_add; - - /* Now setup all that requires a registered net and wimax device. */ - result = sysfs_create_group(&net_dev->dev.kobj, &i2400m_dev_attr_group); - if (result < 0) { - dev_err(dev, "cannot setup i2400m's sysfs: %d\n", result); - goto error_sysfs_setup; - } - - i2400m_debugfs_add(i2400m); - - result = i2400m_dev_start(i2400m, bm_flags); - if (result < 0) - goto error_dev_start; - d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); - return result; - -error_dev_start: - i2400m_debugfs_rm(i2400m); - sysfs_remove_group(&i2400m->wimax_dev.net_dev->dev.kobj, - &i2400m_dev_attr_group); -error_sysfs_setup: - wimax_dev_rm(&i2400m->wimax_dev); -error_wimax_dev_add: - unregister_netdev(net_dev); -error_register_netdev: - unregister_pm_notifier(&i2400m->pm_notifier); -error_read_mac_addr: -error_bootrom_init: - if (i2400m->bus_release) - i2400m->bus_release(i2400m); -error_bus_setup: - i2400m_bm_buf_free(i2400m); -error_bm_buf_alloc: - d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); - return result; -} -EXPORT_SYMBOL_GPL(i2400m_setup); - - -/** - * i2400m_release - release the bus-generic driver resources - * - * Sends a disconnect message and undoes any setup done by i2400m_setup() - */ -void i2400m_release(struct i2400m *i2400m) -{ - struct device *dev = i2400m_dev(i2400m); - - d_fnstart(3, dev, "(i2400m %p)\n", i2400m); - netif_stop_queue(i2400m->wimax_dev.net_dev); - - i2400m_dev_stop(i2400m); - - cancel_work_sync(&i2400m->reset_ws); - cancel_work_sync(&i2400m->recovery_ws); - - i2400m_debugfs_rm(i2400m); - sysfs_remove_group(&i2400m->wimax_dev.net_dev->dev.kobj, - &i2400m_dev_attr_group); - wimax_dev_rm(&i2400m->wimax_dev); - unregister_netdev(i2400m->wimax_dev.net_dev); - unregister_pm_notifier(&i2400m->pm_notifier); - if (i2400m->bus_release) - i2400m->bus_release(i2400m); - i2400m_bm_buf_free(i2400m); - d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); -} 
-EXPORT_SYMBOL_GPL(i2400m_release); - - -/* - * Debug levels control; see debug.h - */ -struct d_level D_LEVEL[] = { - D_SUBMODULE_DEFINE(control), - D_SUBMODULE_DEFINE(driver), - D_SUBMODULE_DEFINE(debugfs), - D_SUBMODULE_DEFINE(fw), - D_SUBMODULE_DEFINE(netdev), - D_SUBMODULE_DEFINE(rfkill), - D_SUBMODULE_DEFINE(rx), - D_SUBMODULE_DEFINE(sysfs), - D_SUBMODULE_DEFINE(tx), -}; -size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); - - -static -int __init i2400m_driver_init(void) -{ - d_parse_params(D_LEVEL, D_LEVEL_SIZE, i2400m_debug_params, - "i2400m.debug"); - return i2400m_barker_db_init(i2400m_barkers_params); -} -module_init(i2400m_driver_init); - -static -void __exit i2400m_driver_exit(void) -{ - i2400m_barker_db_exit(); -} -module_exit(i2400m_driver_exit); - -MODULE_AUTHOR("Intel Corporation "); -MODULE_DESCRIPTION("Intel 2400M WiMAX networking bus-generic driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/net/wimax/i2400m/fw.c b/drivers/net/wimax/i2400m/fw.c deleted file mode 100644 index 6c9a41bff2e0..000000000000 --- a/drivers/net/wimax/i2400m/fw.c +++ /dev/null @@ -1,1653 +0,0 @@ -/* - * Intel Wireless WiMAX Connection 2400m - * Firmware uploader - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * Intel Corporation - * Yanir Lubetkin - * Inaky Perez-Gonzalez - * - Initial implementation - * - * - * THE PROCEDURE - * - * The 2400m and derived devices work in two modes: boot-mode or - * normal mode. In boot mode we can execute only a handful of commands - * targeted at uploading the firmware and launching it. - * - * The 2400m enters boot mode when it is first connected to the - * system, when it crashes and when you ask it to reboot. There are - * two submodes of the boot mode: signed and non-signed. Signed takes - * firmwares signed with a certain private key, non-signed takes any - * firmware. Normal hardware takes only signed firmware. 
- *
- * In boot mode, in USB, we write to the device using the bulk out
- * endpoint and read from it in the notification endpoint.
- *
- * Upon entrance to boot mode, the device sends (preceded with a few
- * zero length packets (ZLPs) on the notification endpoint in USB) a
- * reboot barker (4 le32 words with the same value). We ack it by
- * sending the same barker to the device. The device acks with a
- * reboot ack barker (4 le32 words with value I2400M_ACK_BARKER) and
- * then is fully booted. At this point we can upload the firmware.
- *
- * Note that different iterations of the device and EEPROM
- * configurations will send different [re]boot barkers; these are
- * collected in i2400m_barker_db along with the firmware
- * characteristics they require.
- *
- * This process is accomplished by the i2400m_bootrom_init()
- * function. All the device interaction happens through the
- * i2400m_bm_cmd() [boot mode command]. Special return values will
- * indicate if the device did reset during the process.
- *
- * After this, we read the MAC address and then (if needed)
- * reinitialize the device. We need to read it ahead of time because
- * in the future, we might not upload the firmware until userspace
- * 'ifconfig up's the device.
- *
- * We can then upload the firmware file. The file is composed of a BCF
- * header (basic data, keys and signatures) and a list of write
- * commands and payloads. Optionally more BCF headers might follow the
- * main payload. We first upload the header [i2400m_dnload_init()] and
- * then pass the commands and payloads verbatim to the i2400m_bm_cmd()
- * function [i2400m_dnload_bcf()]. Then we tell the device to jump to
- * the new firmware [i2400m_dnload_finalize()].
- *
- * Once firmware is uploaded, we are good to go :)
- *
- * When we don't know in which mode we are, we first try by sending a
- * warm reset request that will take us to boot-mode. If we time out
- * waiting for a reboot barker, that means maybe we are already in
- * boot mode, so we send a reboot barker.
- *
- * COMMAND EXECUTION
- *
- * This code (and process) is single threaded; for executing commands,
- * we post a URB to the notification endpoint, post the command, wait
- * for data on the notification buffer. We don't need to worry about
- * others as we know we are the only ones in there.
- *
- * BACKEND IMPLEMENTATION
- *
- * This code is bus-generic; the bus-specific driver provides back end
- * implementations to send a boot mode command to the device and to
- * read an acknowledgement (or an asynchronous notification)
- * from it.
- *
- * FIRMWARE LOADING
- *
- * Note that in some cases, we can't just load a firmware file (for
- * example, when resuming). For that, we might cache the firmware
- * file. Thus, when doing the bootstrap, if there is a cached firmware
- * file, it is used; if not, loading from disk is attempted.
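The caching idea reduces to pinning the image in memory while the filesystem may be unavailable. A sketch with assumed names (the driver's real i2400m_fw_cache()/i2400m_fw_uncache() also park the firmware on a private kobject to dodge loader races, per the comment in driver.c):

#include <linux/device.h>
#include <linux/firmware.h>

static const struct firmware *cached_fw;	/* assumed storage */

static int fw_cache(struct device *dev, const char *name)
{
	/* request_firmware() keeps a copy in RAM until release_firmware() */
	return request_firmware(&cached_fw, name, dev);
}

static void fw_uncache(void)
{
	release_firmware(cached_fw);	/* NULL-safe */
	cached_fw = NULL;
}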
- *
- * ROADMAP
- *
- * i2400m_barker_db_init Called by i2400m_driver_init()
- * i2400m_barker_db_add
- *
- * i2400m_barker_db_exit Called by i2400m_driver_exit()
- *
- * i2400m_dev_bootstrap Called by __i2400m_dev_start()
- * request_firmware
- * i2400m_fw_bootstrap
- * i2400m_fw_check
- * i2400m_fw_hdr_check
- * i2400m_fw_dnload
- * release_firmware
- *
- * i2400m_fw_dnload
- * i2400m_bootrom_init
- * i2400m_bm_cmd
- * i2400m_reset
- * i2400m_dnload_init
- * i2400m_dnload_init_signed
- * i2400m_dnload_init_nonsigned
- * i2400m_download_chunk
- * i2400m_bm_cmd
- * i2400m_dnload_bcf
- * i2400m_bm_cmd
- * i2400m_dnload_finalize
- * i2400m_bm_cmd
- *
- * i2400m_bm_cmd
- * i2400m->bus_bm_cmd_send()
- * i2400m->bus_bm_wait_for_ack
- * __i2400m_bm_ack_verify
- * i2400m_is_boot_barker
- *
- * i2400m_bm_cmd_prepare Used by bus-drivers to prep
- * commands before sending
- *
- * i2400m_pm_notifier Called on Power Management events
- * i2400m_fw_cache
- * i2400m_fw_uncache
- */
-#include
-#include
-#include
-#include
-#include
-#include "i2400m.h"
-
-
-#define D_SUBMODULE fw
-#include "debug-levels.h"
-
-
-static const __le32 i2400m_ACK_BARKER[4] = {
- cpu_to_le32(I2400M_ACK_BARKER),
- cpu_to_le32(I2400M_ACK_BARKER),
- cpu_to_le32(I2400M_ACK_BARKER),
- cpu_to_le32(I2400M_ACK_BARKER)
-};
-
-
-/**
- * Prepare a boot-mode command for delivery
- *
- * @cmd: pointer to bootrom header to prepare
- *
- * Computes the checksum if needed. After calling this function, DO NOT
- * modify the command or header as the checksum won't work anymore.
- *
- * We do it from here because sometimes we cannot do it in the
- * original context in which the command was sent (it is a const), so when we
- * copy it to our staging buffer, we add the checksum there.
- */
-void i2400m_bm_cmd_prepare(struct i2400m_bootrom_header *cmd)
-{
- if (i2400m_brh_get_use_checksum(cmd)) {
- int i;
- u32 checksum = 0;
- const u32 *checksum_ptr = (void *) cmd->payload;
- for (i = 0; i < cmd->data_size / 4; i++)
- checksum += cpu_to_le32(*checksum_ptr++);
- checksum += cmd->command + cmd->target_addr + cmd->data_size;
- cmd->block_checksum = cpu_to_le32(checksum);
- }
-}
-EXPORT_SYMBOL_GPL(i2400m_bm_cmd_prepare);
-
-
-/*
- * Database of known barkers.
- *
- * A barker is what the device sends indicating it is ready to be
- * bootloaded. Different versions of the device will send different
- * barkers. Depending on the barker, it might mean the device wants
- * one kind of firmware or another.
- */
-static struct i2400m_barker_db {
- __le32 data[4];
-} *i2400m_barker_db;
-static size_t i2400m_barker_db_used, i2400m_barker_db_size;
-
-
-static
-int i2400m_zrealloc_2x(void **ptr, size_t *_count, size_t el_size,
- gfp_t gfp_flags)
-{
- size_t old_count = *_count,
- new_count = old_count ? 2 * old_count : 2,
- old_size = el_size * old_count,
- new_size = el_size * new_count;
- void *nptr = krealloc(*ptr, new_size, gfp_flags);
- if (nptr) {
- /* zero the other half or the whole thing if old_count
- * was zero */
- if (old_size == 0)
- memset(nptr, 0, new_size);
- else
- memset(nptr + old_size, 0, old_size);
- *_count = new_count;
- *ptr = nptr;
- return 0;
- } else
- return -ENOMEM;
-}
-
-
-/*
- * Add a barker to the database
- *
- * This cannot be used outside of this module, and only at module_init
- * time. This is to avoid the need to do locking.
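As the header comment above describes, a [re]boot barker is four identical little-endian words. A minimal sketch (hypothetical helper) of what matching a single value amounts to, before i2400m_is_boot_barker() below consults the whole database:

#include <linux/types.h>
#include <asm/byteorder.h>

/* Sketch: does @buf carry @value repeated in all four LE words? */
static bool is_barker_of(const __le32 buf[4], u32 value)
{
	__le32 w = cpu_to_le32(value);

	return buf[0] == w && buf[1] == w && buf[2] == w && buf[3] == w;
}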
- */ -static -int i2400m_barker_db_add(u32 barker_id) -{ - int result; - - struct i2400m_barker_db *barker; - if (i2400m_barker_db_used >= i2400m_barker_db_size) { - result = i2400m_zrealloc_2x( - (void **) &i2400m_barker_db, &i2400m_barker_db_size, - sizeof(i2400m_barker_db[0]), GFP_KERNEL); - if (result < 0) - return result; - } - barker = i2400m_barker_db + i2400m_barker_db_used++; - barker->data[0] = le32_to_cpu(barker_id); - barker->data[1] = le32_to_cpu(barker_id); - barker->data[2] = le32_to_cpu(barker_id); - barker->data[3] = le32_to_cpu(barker_id); - return 0; -} - - -void i2400m_barker_db_exit(void) -{ - kfree(i2400m_barker_db); - i2400m_barker_db = NULL; - i2400m_barker_db_size = 0; - i2400m_barker_db_used = 0; -} - - -/* - * Helper function to add all the known stable barkers to the barker - * database. - */ -static -int i2400m_barker_db_known_barkers(void) -{ - int result; - - result = i2400m_barker_db_add(I2400M_NBOOT_BARKER); - if (result < 0) - goto error_add; - result = i2400m_barker_db_add(I2400M_SBOOT_BARKER); - if (result < 0) - goto error_add; - result = i2400m_barker_db_add(I2400M_SBOOT_BARKER_6050); - if (result < 0) - goto error_add; -error_add: - return result; -} - - -/* - * Initialize the barker database - * - * This can only be used from the module_init function for this - * module; this is to avoid the need to do locking. - * - * @options: command line argument with extra barkers to - * recognize. This is a comma-separated list of 32-bit hex - * numbers. They are appended to the existing list. Setting 0 - * cleans the existing list and starts a new one. - */ -int i2400m_barker_db_init(const char *_options) -{ - int result; - char *options = NULL, *options_orig, *token; - - i2400m_barker_db = NULL; - i2400m_barker_db_size = 0; - i2400m_barker_db_used = 0; - - result = i2400m_barker_db_known_barkers(); - if (result < 0) - goto error_add; - /* parse command line options from i2400m.barkers */ - if (_options != NULL) { - unsigned barker; - - options_orig = kstrdup(_options, GFP_KERNEL); - if (options_orig == NULL) { - result = -ENOMEM; - goto error_parse; - } - options = options_orig; - - while ((token = strsep(&options, ",")) != NULL) { - if (*token == '\0') /* eat joint commas */ - continue; - if (sscanf(token, "%x", &barker) != 1 - || barker > 0xffffffff) { - printk(KERN_ERR "%s: can't recognize " - "i2400m.barkers value '%s' as " - "a 32-bit number\n", - __func__, token); - result = -EINVAL; - goto error_parse; - } - if (barker == 0) { - /* clean list and start new */ - i2400m_barker_db_exit(); - continue; - } - result = i2400m_barker_db_add(barker); - if (result < 0) - goto error_parse_add; - } - kfree(options_orig); - } - return 0; - -error_parse_add: -error_parse: - kfree(options_orig); -error_add: - kfree(i2400m_barker_db); - return result; -} - - -/* - * Recognize a boot barker - * - * @buf: buffer where the boot barker. - * @buf_size: size of the buffer (has to be 16 bytes). It is passed - * here so the function can check it for the caller. - * - * Note that as a side effect, upon identifying the obtained boot - * barker, this function will set i2400m->barker to point to the right - * barker database entry. Subsequent calls to the function will result - * in verifying that the same type of boot barker is returned when the - * device [re]boots (as long as the same device instance is used). - * - * Return: 0 if @buf matches a known boot barker. 
-ENOENT if the - * buffer in @buf doesn't match any boot barker in the database or - * -EILSEQ if the buffer doesn't have the right size. - */ -int i2400m_is_boot_barker(struct i2400m *i2400m, - const void *buf, size_t buf_size) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct i2400m_barker_db *barker; - int i; - - result = -ENOENT; - if (buf_size != sizeof(i2400m_barker_db[i].data)) - return result; - - /* Short circuit if we have already discovered the barker - * associated with the device. */ - if (i2400m->barker && - !memcmp(buf, i2400m->barker, sizeof(i2400m->barker->data))) - return 0; - - for (i = 0; i < i2400m_barker_db_used; i++) { - barker = &i2400m_barker_db[i]; - BUILD_BUG_ON(sizeof(barker->data) != 16); - if (memcmp(buf, barker->data, sizeof(barker->data))) - continue; - - if (i2400m->barker == NULL) { - i2400m->barker = barker; - d_printf(1, dev, "boot barker set to #%u/%08x\n", - i, le32_to_cpu(barker->data[0])); - if (barker->data[0] == le32_to_cpu(I2400M_NBOOT_BARKER)) - i2400m->sboot = 0; - else - i2400m->sboot = 1; - } else if (i2400m->barker != barker) { - dev_err(dev, "HW inconsistency: device " - "reports a different boot barker " - "than set (from %08x to %08x)\n", - le32_to_cpu(i2400m->barker->data[0]), - le32_to_cpu(barker->data[0])); - result = -EIO; - } else - d_printf(2, dev, "boot barker confirmed #%u/%08x\n", - i, le32_to_cpu(barker->data[0])); - result = 0; - break; - } - return result; -} -EXPORT_SYMBOL_GPL(i2400m_is_boot_barker); - - -/* - * Verify the ack data received - * - * Given a reply to a boot mode command, chew it and verify everything - * is ok. - * - * @opcode: opcode which generated this ack. For error messages. - * @ack: pointer to ack data we received - * @ack_size: size of that data buffer - * @flags: I2400M_BM_CMD_* flags we called the command with. - * - * Way too long function -- maybe it should be further split - */ -static -ssize_t __i2400m_bm_ack_verify(struct i2400m *i2400m, int opcode, - struct i2400m_bootrom_header *ack, - size_t ack_size, int flags) -{ - ssize_t result = -ENOMEM; - struct device *dev = i2400m_dev(i2400m); - - d_fnstart(8, dev, "(i2400m %p opcode %d ack %p size %zu)\n", - i2400m, opcode, ack, ack_size); - if (ack_size < sizeof(*ack)) { - result = -EIO; - dev_err(dev, "boot-mode cmd %d: HW BUG? notification didn't " - "return enough data (%zu bytes vs %zu expected)\n", - opcode, ack_size, sizeof(*ack)); - goto error_ack_short; - } - result = i2400m_is_boot_barker(i2400m, ack, ack_size); - if (result >= 0) { - result = -ERESTARTSYS; - d_printf(6, dev, "boot-mode cmd %d: HW boot barker\n", opcode); - goto error_reboot; - } - if (ack_size == sizeof(i2400m_ACK_BARKER) - && memcmp(ack, i2400m_ACK_BARKER, sizeof(*ack)) == 0) { - result = -EISCONN; - d_printf(3, dev, "boot-mode cmd %d: HW reboot ack barker\n", - opcode); - goto error_reboot_ack; - } - result = 0; - if (flags & I2400M_BM_CMD_RAW) - goto out_raw; - ack->data_size = le32_to_cpu(ack->data_size); - ack->target_addr = le32_to_cpu(ack->target_addr); - ack->block_checksum = le32_to_cpu(ack->block_checksum); - d_printf(5, dev, "boot-mode cmd %d: notification for opcode %u " - "response %u csum %u rr %u da %u\n", - opcode, i2400m_brh_get_opcode(ack), - i2400m_brh_get_response(ack), - i2400m_brh_get_use_checksum(ack), - i2400m_brh_get_response_required(ack), - i2400m_brh_get_direct_access(ack)); - result = -EIO; - if (i2400m_brh_get_signature(ack) != 0xcbbc) { - dev_err(dev, "boot-mode cmd %d: HW BUG? 
wrong signature "
- "0x%04x\n", opcode, i2400m_brh_get_signature(ack));
- goto error_ack_signature;
- }
- if (opcode != -1 && opcode != i2400m_brh_get_opcode(ack)) {
- dev_err(dev, "boot-mode cmd %d: HW BUG? "
- "received response for opcode %u, expected %u\n",
- opcode, i2400m_brh_get_opcode(ack), opcode);
- goto error_ack_opcode;
- }
- if (i2400m_brh_get_response(ack) != 0) { /* failed? */
- dev_err(dev, "boot-mode cmd %d: error; hw response %u\n",
- opcode, i2400m_brh_get_response(ack));
- goto error_ack_failed;
- }
- if (ack_size < ack->data_size + sizeof(*ack)) {
- dev_err(dev, "boot-mode cmd %d: SW BUG "
- "driver provided only %zu bytes for %zu bytes "
- "of data\n", opcode, ack_size,
- (size_t) le32_to_cpu(ack->data_size) + sizeof(*ack));
- goto error_ack_short_buffer;
- }
- result = ack_size;
- /* Don't you love this stack of empty targets? Well, I don't
- * either, but it helps track exactly who comes in here and
- * why :) */
-error_ack_short_buffer:
-error_ack_failed:
-error_ack_opcode:
-error_ack_signature:
-out_raw:
-error_reboot_ack:
-error_reboot:
-error_ack_short:
- d_fnend(8, dev, "(i2400m %p opcode %d ack %p size %zu) = %d\n",
- i2400m, opcode, ack, ack_size, (int) result);
- return result;
-}
-
-
-/**
- * i2400m_bm_cmd - Execute a boot mode command
- *
- * @cmd: buffer containing the command data (pointing at the header).
- * This data can be ANYWHERE (for USB, we will copy it to a
- * specific buffer). Make sure everything is in proper little
- * endian.
- *
- * A raw buffer can also be sent, just cast it and set flags to
- * I2400M_BM_CMD_RAW.
- *
- * This function will generate a checksum for you if the
- * checksum bit in the command is set (unless I2400M_BM_CMD_RAW
- * is set).
- *
- * You can use the i2400m->bm_cmd_buf to stage your commands and
- * send them.
- *
- * If NULL, no command is sent (we just wait for an ack).
- *
- * @cmd_size: size of the command. Will be auto padded to the
- * bus-specific drivers padding requirements.
- *
- * @ack: buffer where to place the acknowledgement. If it is a regular
- * command response, all fields will be returned with the right,
- * native endianness.
- *
- * You *cannot* use i2400m->bm_ack_buf for this buffer.
- *
- * @ack_size: size of @ack, 16 aligned; you need to provide at least
- * sizeof(*ack) bytes and then enough to contain the return data
- * from the command
- *
- * @flags: see I2400M_BM_CMD_* above.
- *
- * @returns: bytes received by the notification; if < 0, an errno code
- * denoting an error or:
- *
- * -ERESTARTSYS The device has rebooted
- *
- * Executes a boot-mode command and waits for a response, doing basic
- * validation on it; if a zero length response is received, it retries
- * waiting for a response until a non-zero one is received (timing out
- * after %I2400M_BOOT_RETRIES retries).
- */
-static
-ssize_t i2400m_bm_cmd(struct i2400m *i2400m,
- const struct i2400m_bootrom_header *cmd, size_t cmd_size,
- struct i2400m_bootrom_header *ack, size_t ack_size,
- int flags)
-{
- ssize_t result = -ENOMEM, rx_bytes;
- struct device *dev = i2400m_dev(i2400m);
- int opcode = cmd == NULL ?
-1 : i2400m_brh_get_opcode(cmd); - - d_fnstart(6, dev, "(i2400m %p cmd %p size %zu ack %p size %zu)\n", - i2400m, cmd, cmd_size, ack, ack_size); - BUG_ON(ack_size < sizeof(*ack)); - BUG_ON(i2400m->boot_mode == 0); - - if (cmd != NULL) { /* send the command */ - result = i2400m->bus_bm_cmd_send(i2400m, cmd, cmd_size, flags); - if (result < 0) - goto error_cmd_send; - if ((flags & I2400M_BM_CMD_RAW) == 0) - d_printf(5, dev, - "boot-mode cmd %d csum %u rr %u da %u: " - "addr 0x%04x size %u block csum 0x%04x\n", - opcode, i2400m_brh_get_use_checksum(cmd), - i2400m_brh_get_response_required(cmd), - i2400m_brh_get_direct_access(cmd), - cmd->target_addr, cmd->data_size, - cmd->block_checksum); - } - result = i2400m->bus_bm_wait_for_ack(i2400m, ack, ack_size); - if (result < 0) { - dev_err(dev, "boot-mode cmd %d: error waiting for an ack: %d\n", - opcode, (int) result); /* bah, %zd doesn't work */ - goto error_wait_for_ack; - } - rx_bytes = result; - /* verify the ack and read more if necessary [result is the - * final amount of bytes we get in the ack] */ - result = __i2400m_bm_ack_verify(i2400m, opcode, ack, ack_size, flags); - if (result < 0) - goto error_bad_ack; - /* Don't you love this stack of empty targets? Well, I don't - * either, but it helps track exactly who comes in here and - * why :) */ - result = rx_bytes; -error_bad_ack: -error_wait_for_ack: -error_cmd_send: - d_fnend(6, dev, "(i2400m %p cmd %p size %zu ack %p size %zu) = %d\n", - i2400m, cmd, cmd_size, ack, ack_size, (int) result); - return result; -} - - -/** - * i2400m_download_chunk - write a single chunk of data to the device's memory - * - * @i2400m: device descriptor - * @buf: the buffer to write - * @buf_len: length of the buffer to write - * @addr: address in the device memory space - * @direct: bootrom write mode - * @do_csum: should a checksum validation be performed - */ -static int i2400m_download_chunk(struct i2400m *i2400m, const void *chunk, - size_t __chunk_len, unsigned long addr, - unsigned int direct, unsigned int do_csum) -{ - int ret; - size_t chunk_len = ALIGN(__chunk_len, I2400M_PL_ALIGN); - struct device *dev = i2400m_dev(i2400m); - struct { - struct i2400m_bootrom_header cmd; - u8 cmd_payload[]; - } __packed *buf; - struct i2400m_bootrom_header ack; - - d_fnstart(5, dev, "(i2400m %p chunk %p __chunk_len %zu addr 0x%08lx " - "direct %u do_csum %u)\n", i2400m, chunk, __chunk_len, - addr, direct, do_csum); - buf = i2400m->bm_cmd_buf; - memcpy(buf->cmd_payload, chunk, __chunk_len); - memset(buf->cmd_payload + __chunk_len, 0xad, chunk_len - __chunk_len); - - buf->cmd.command = i2400m_brh_command(I2400M_BRH_WRITE, - __chunk_len & 0x3 ? 0 : do_csum, - __chunk_len & 0xf ? 0 : direct); - buf->cmd.target_addr = cpu_to_le32(addr); - buf->cmd.data_size = cpu_to_le32(__chunk_len); - ret = i2400m_bm_cmd(i2400m, &buf->cmd, sizeof(buf->cmd) + chunk_len, - &ack, sizeof(ack), 0); - if (ret >= 0) - ret = 0; - d_fnend(5, dev, "(i2400m %p chunk %p __chunk_len %zu addr 0x%08lx " - "direct %u do_csum %u) = %d\n", i2400m, chunk, __chunk_len, - addr, direct, do_csum, ret); - return ret; -} - - -/* - * Download a BCF file's sections to the device - * - * @i2400m: device descriptor - * @bcf: pointer to firmware data (first header followed by the - * payloads). Assumed verified and consistent. - * @bcf_len: length (in bytes) of the @bcf buffer. - * - * Returns: < 0 errno code on error or the offset to the jump instruction. - * - * Given a BCF file, downloads each section (a command and a payload) - * to the device's address space. 
Actually, it just executes each - * command i the BCF file. - * - * The section size has to be aligned to 4 bytes AND the padding has - * to be taken from the firmware file, as the signature takes it into - * account. - */ -static -ssize_t i2400m_dnload_bcf(struct i2400m *i2400m, - const struct i2400m_bcf_hdr *bcf, size_t bcf_len) -{ - ssize_t ret; - struct device *dev = i2400m_dev(i2400m); - size_t offset, /* iterator offset */ - data_size, /* Size of the data payload */ - section_size, /* Size of the whole section (cmd + payload) */ - section = 1; - const struct i2400m_bootrom_header *bh; - struct i2400m_bootrom_header ack; - - d_fnstart(3, dev, "(i2400m %p bcf %p bcf_len %zu)\n", - i2400m, bcf, bcf_len); - /* Iterate over the command blocks in the BCF file that start - * after the header */ - offset = le32_to_cpu(bcf->header_len) * sizeof(u32); - while (1) { /* start sending the file */ - bh = (void *) bcf + offset; - data_size = le32_to_cpu(bh->data_size); - section_size = ALIGN(sizeof(*bh) + data_size, 4); - d_printf(7, dev, - "downloading section #%zu (@%zu %zu B) to 0x%08x\n", - section, offset, sizeof(*bh) + data_size, - le32_to_cpu(bh->target_addr)); - /* - * We look for JUMP cmd from the bootmode header, - * either I2400M_BRH_SIGNED_JUMP for secure boot - * or I2400M_BRH_JUMP for unsecure boot, the last chunk - * should be the bootmode header with JUMP cmd. - */ - if (i2400m_brh_get_opcode(bh) == I2400M_BRH_SIGNED_JUMP || - i2400m_brh_get_opcode(bh) == I2400M_BRH_JUMP) { - d_printf(5, dev, "jump found @%zu\n", offset); - break; - } - if (offset + section_size > bcf_len) { - dev_err(dev, "fw %s: bad section #%zu, " - "end (@%zu) beyond EOF (@%zu)\n", - i2400m->fw_name, section, - offset + section_size, bcf_len); - ret = -EINVAL; - goto error_section_beyond_eof; - } - __i2400m_msleep(20); - ret = i2400m_bm_cmd(i2400m, bh, section_size, - &ack, sizeof(ack), I2400M_BM_CMD_RAW); - if (ret < 0) { - dev_err(dev, "fw %s: section #%zu (@%zu %zu B) " - "failed %d\n", i2400m->fw_name, section, - offset, sizeof(*bh) + data_size, (int) ret); - goto error_send; - } - offset += section_size; - section++; - } - ret = offset; -error_section_beyond_eof: -error_send: - d_fnend(3, dev, "(i2400m %p bcf %p bcf_len %zu) = %d\n", - i2400m, bcf, bcf_len, (int) ret); - return ret; -} - - -/* - * Indicate if the device emitted a reboot barker that indicates - * "signed boot" - */ -static -unsigned i2400m_boot_is_signed(struct i2400m *i2400m) -{ - return likely(i2400m->sboot); -} - - -/* - * Do the final steps of uploading firmware - * - * @bcf_hdr: BCF header we are actually using - * @bcf: pointer to the firmware image (which matches the first header - * that is followed by the actual payloads). - * @offset: [byte] offset into @bcf for the command we need to send. - * - * Depending on the boot mode (signed vs non-signed), different - * actions need to be taken. 
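- *
- * For the signed path, the block sent to the device is the RSA
- * signature that sits after the key and exponent inside the BCF
- * header. A condensed sketch of the offset math used below (the
- * header expresses sizes in 32-bit words, hence the sizeof(u32)
- * scaling):
- *
- *   offset = sizeof(*bcf_hdr)
- *            + le32_to_cpu(bcf_hdr->key_size) * sizeof(u32)
- *            + le32_to_cpu(bcf_hdr->exponent_size) * sizeof(u32);
- *   size = le32_to_cpu(bcf_hdr->modulus_size) * sizeof(u32);
- *   memcpy(cmd_buf->cmd_pl, (void *) bcf_hdr + offset, size);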
- */ -static -int i2400m_dnload_finalize(struct i2400m *i2400m, - const struct i2400m_bcf_hdr *bcf_hdr, - const struct i2400m_bcf_hdr *bcf, size_t offset) -{ - int ret = 0; - struct device *dev = i2400m_dev(i2400m); - struct i2400m_bootrom_header *cmd, ack; - struct { - struct i2400m_bootrom_header cmd; - u8 cmd_pl[0]; - } __packed *cmd_buf; - size_t signature_block_offset, signature_block_size; - - d_fnstart(3, dev, "offset %zu\n", offset); - cmd = (void *) bcf + offset; - if (i2400m_boot_is_signed(i2400m) == 0) { - struct i2400m_bootrom_header jump_ack; - d_printf(1, dev, "unsecure boot, jumping to 0x%08x\n", - le32_to_cpu(cmd->target_addr)); - cmd_buf = i2400m->bm_cmd_buf; - memcpy(&cmd_buf->cmd, cmd, sizeof(*cmd)); - cmd = &cmd_buf->cmd; - /* now cmd points to the actual bootrom_header in cmd_buf */ - i2400m_brh_set_opcode(cmd, I2400M_BRH_JUMP); - cmd->data_size = 0; - ret = i2400m_bm_cmd(i2400m, cmd, sizeof(*cmd), - &jump_ack, sizeof(jump_ack), 0); - } else { - d_printf(1, dev, "secure boot, jumping to 0x%08x\n", - le32_to_cpu(cmd->target_addr)); - cmd_buf = i2400m->bm_cmd_buf; - memcpy(&cmd_buf->cmd, cmd, sizeof(*cmd)); - signature_block_offset = - sizeof(*bcf_hdr) - + le32_to_cpu(bcf_hdr->key_size) * sizeof(u32) - + le32_to_cpu(bcf_hdr->exponent_size) * sizeof(u32); - signature_block_size = - le32_to_cpu(bcf_hdr->modulus_size) * sizeof(u32); - memcpy(cmd_buf->cmd_pl, - (void *) bcf_hdr + signature_block_offset, - signature_block_size); - ret = i2400m_bm_cmd(i2400m, &cmd_buf->cmd, - sizeof(cmd_buf->cmd) + signature_block_size, - &ack, sizeof(ack), I2400M_BM_CMD_RAW); - } - d_fnend(3, dev, "returning %d\n", ret); - return ret; -} - - -/** - * i2400m_bootrom_init - Reboots a powered device into boot mode - * - * @i2400m: device descriptor - * @flags: - * I2400M_BRI_SOFT: a reboot barker has been seen - * already, so don't wait for it. - * - * I2400M_BRI_NO_REBOOT: Don't send a reboot command, but wait - * for a reboot barker notification. This is a one shot; if - * the state machine needs to send a reboot command it will. - * - * Returns: - * - * < 0 errno code on error, 0 if ok. - * - * Description: - * - * Tries hard enough to put the device in boot-mode. There are two - * main phases to this: - * - * a. (1) send a reboot command and (2) get a reboot barker - * - * b. (1) echo/ack the reboot sending the reboot barker back and (2) - * getting an ack barker in return - * - * We want to skip (a) in some cases [soft]. The state machine is - * horrible, but it is basically: on each phase, send what has to be - * sent (if any), wait for the answer and act on the answer. We might - * have to backtrack and retry, so we keep a max tries counter for - * that. - * - * It sucks because we don't know ahead of time which is going to be - * the reboot barker (the device might send different ones depending - * on its EEPROM config) and once the device reboots and waits for the - * echo/ack reboot barker being sent back, it doesn't understand - * anything else. So we can be left at the point where we don't know - * what to send to it -- cold reset and bus reset seem to have little - * effect. So the function iterates (in this case) through all the - * known barkers and tries them all until an ACK is - * received. Otherwise, it gives up. - * - * If we get a timeout after sending a warm reset, we do it again. 
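- *
- * In terms of the in-band codes i2400m_bm_cmd() uses (-ERESTARTSYS
- * for a reboot barker, -EISCONN for an ack barker), the happy path
- * of the state machine below reduces to this sketch; all the
- * retry/backtrack handling is omitted:
- *
- *   i2400m_reset(i2400m, I2400M_RT_WARM);
- *   result = i2400m_bm_cmd(i2400m, NULL, 0, &ack, sizeof(ack),
- *                          I2400M_BM_CMD_RAW);
- *   (phase a done when result == -ERESTARTSYS: reboot barker seen)
- *   memcpy(cmd, i2400m->barker->data, sizeof(i2400m->barker->data));
- *   result = i2400m_bm_cmd(i2400m, cmd, sizeof(*cmd), &ack,
- *                          sizeof(ack), I2400M_BM_CMD_RAW);
- *   (phase b done when result == -EISCONN: ack barker seen)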
- */ -int i2400m_bootrom_init(struct i2400m *i2400m, enum i2400m_bri flags) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct i2400m_bootrom_header *cmd; - struct i2400m_bootrom_header ack; - int count = i2400m->bus_bm_retries; - int ack_timeout_cnt = 1; - unsigned i; - - BUILD_BUG_ON(sizeof(*cmd) != sizeof(i2400m_barker_db[0].data)); - BUILD_BUG_ON(sizeof(ack) != sizeof(i2400m_ACK_BARKER)); - - d_fnstart(4, dev, "(i2400m %p flags 0x%08x)\n", i2400m, flags); - result = -ENOMEM; - cmd = i2400m->bm_cmd_buf; - if (flags & I2400M_BRI_SOFT) - goto do_reboot_ack; -do_reboot: - ack_timeout_cnt = 1; - if (--count < 0) - goto error_timeout; - d_printf(4, dev, "device reboot: reboot command [%d # left]\n", - count); - if ((flags & I2400M_BRI_NO_REBOOT) == 0) - i2400m_reset(i2400m, I2400M_RT_WARM); - result = i2400m_bm_cmd(i2400m, NULL, 0, &ack, sizeof(ack), - I2400M_BM_CMD_RAW); - flags &= ~I2400M_BRI_NO_REBOOT; - switch (result) { - case -ERESTARTSYS: - /* - * at this point, i2400m_bm_cmd(), through - * __i2400m_bm_ack_process(), has updated - * i2400m->barker and we are good to go. - */ - d_printf(4, dev, "device reboot: got reboot barker\n"); - break; - case -EISCONN: /* we don't know how it got here...but we follow it */ - d_printf(4, dev, "device reboot: got ack barker - whatever\n"); - goto do_reboot; - case -ETIMEDOUT: - /* - * Device has timed out, we might be in boot mode - * already and expecting an ack; if we don't know what - * the barker is, we just send them all. Cold reset - * and bus reset don't work. Beats me. - */ - if (i2400m->barker != NULL) { - dev_err(dev, "device boot: reboot barker timed out, " - "trying (set) %08x echo/ack\n", - le32_to_cpu(i2400m->barker->data[0])); - goto do_reboot_ack; - } - for (i = 0; i < i2400m_barker_db_used; i++) { - struct i2400m_barker_db *barker = &i2400m_barker_db[i]; - memcpy(cmd, barker->data, sizeof(barker->data)); - result = i2400m_bm_cmd(i2400m, cmd, sizeof(*cmd), - &ack, sizeof(ack), - I2400M_BM_CMD_RAW); - if (result == -EISCONN) { - dev_warn(dev, "device boot: got ack barker " - "after sending echo/ack barker " - "#%d/%08x; rebooting j.i.c.\n", - i, le32_to_cpu(barker->data[0])); - flags &= ~I2400M_BRI_NO_REBOOT; - goto do_reboot; - } - } - dev_err(dev, "device boot: tried all the echo/acks, could " - "not get device to respond; giving up"); - result = -ESHUTDOWN; - case -EPROTO: - case -ESHUTDOWN: /* dev is gone */ - case -EINTR: /* user cancelled */ - goto error_dev_gone; - default: - dev_err(dev, "device reboot: error %d while waiting " - "for reboot barker - rebooting\n", result); - d_dump(1, dev, &ack, result); - goto do_reboot; - } - /* At this point we ack back with 4 REBOOT barkers and expect - * 4 ACK barkers. This is ugly, as we send a raw command -- - * hence the cast. _bm_cmd() will catch the reboot ack - * notification and report it as -EISCONN. */ -do_reboot_ack: - d_printf(4, dev, "device reboot ack: sending ack [%d # left]\n", count); - memcpy(cmd, i2400m->barker->data, sizeof(i2400m->barker->data)); - result = i2400m_bm_cmd(i2400m, cmd, sizeof(*cmd), - &ack, sizeof(ack), I2400M_BM_CMD_RAW); - switch (result) { - case -ERESTARTSYS: - d_printf(4, dev, "reboot ack: got reboot barker - retrying\n"); - if (--count < 0) - goto error_timeout; - goto do_reboot_ack; - case -EISCONN: - d_printf(4, dev, "reboot ack: got ack barker - good\n"); - break; - case -ETIMEDOUT: /* no response, maybe it is the other type? 
*/ - if (ack_timeout_cnt-- < 0) { - d_printf(4, dev, "reboot ack timedout: retrying\n"); - goto do_reboot_ack; - } else { - dev_err(dev, "reboot ack timedout too long: " - "trying reboot\n"); - goto do_reboot; - } - break; - case -EPROTO: - case -ESHUTDOWN: /* dev is gone */ - goto error_dev_gone; - default: - dev_err(dev, "device reboot ack: error %d while waiting for " - "reboot ack barker - rebooting\n", result); - goto do_reboot; - } - d_printf(2, dev, "device reboot ack: got ack barker - boot done\n"); - result = 0; -exit_timeout: -error_dev_gone: - d_fnend(4, dev, "(i2400m %p flags 0x%08x) = %d\n", - i2400m, flags, result); - return result; - -error_timeout: - dev_err(dev, "Timed out waiting for reboot ack\n"); - result = -ETIMEDOUT; - goto exit_timeout; -} - - -/* - * Read the MAC addr - * - * The position this function reads is fixed in device memory and - * always available, even without firmware. - * - * Note we specify we want to read only six bytes, but provide space - * for 16, as we always get it rounded up. - */ -int i2400m_read_mac_addr(struct i2400m *i2400m) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct net_device *net_dev = i2400m->wimax_dev.net_dev; - struct i2400m_bootrom_header *cmd; - struct { - struct i2400m_bootrom_header ack; - u8 ack_pl[16]; - } __packed ack_buf; - - d_fnstart(5, dev, "(i2400m %p)\n", i2400m); - cmd = i2400m->bm_cmd_buf; - cmd->command = i2400m_brh_command(I2400M_BRH_READ, 0, 1); - cmd->target_addr = cpu_to_le32(0x00203fe8); - cmd->data_size = cpu_to_le32(6); - result = i2400m_bm_cmd(i2400m, cmd, sizeof(*cmd), - &ack_buf.ack, sizeof(ack_buf), 0); - if (result < 0) { - dev_err(dev, "BM: read mac addr failed: %d\n", result); - goto error_read_mac; - } - d_printf(2, dev, "mac addr is %pM\n", ack_buf.ack_pl); - if (i2400m->bus_bm_mac_addr_impaired == 1) { - ack_buf.ack_pl[0] = 0x00; - ack_buf.ack_pl[1] = 0x16; - ack_buf.ack_pl[2] = 0xd3; - get_random_bytes(&ack_buf.ack_pl[3], 3); - dev_err(dev, "BM is MAC addr impaired, faking MAC addr to " - "mac addr is %pM\n", ack_buf.ack_pl); - result = 0; - } - net_dev->addr_len = ETH_ALEN; - memcpy(net_dev->dev_addr, ack_buf.ack_pl, ETH_ALEN); -error_read_mac: - d_fnend(5, dev, "(i2400m %p) = %d\n", i2400m, result); - return result; -} - - -/* - * Initialize a non signed boot - * - * This implies sending some magic values to the device's memory. Note - * we convert the values to little endian in the same array - * declaration. - */ -static -int i2400m_dnload_init_nonsigned(struct i2400m *i2400m) -{ - unsigned i = 0; - int ret = 0; - struct device *dev = i2400m_dev(i2400m); - d_fnstart(5, dev, "(i2400m %p)\n", i2400m); - if (i2400m->bus_bm_pokes_table) { - while (i2400m->bus_bm_pokes_table[i].address) { - ret = i2400m_download_chunk( - i2400m, - &i2400m->bus_bm_pokes_table[i].data, - sizeof(i2400m->bus_bm_pokes_table[i].data), - i2400m->bus_bm_pokes_table[i].address, 1, 1); - if (ret < 0) - break; - i++; - } - } - d_fnend(5, dev, "(i2400m %p) = %d\n", i2400m, ret); - return ret; -} - - -/* - * Initialize the signed boot process - * - * @i2400m: device descriptor - * - * @bcf_hdr: pointer to the firmware header; assumes it is fully in - * memory (it has gone through basic validation). - * - * Returns: 0 if ok, < 0 errno code on error, -ERESTARTSYS if the hw - * rebooted. - * - * This writes the firmware BCF header to the device using the - * HASH_PAYLOAD_ONLY command. 
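- *
- * On the wire this is just a bootrom header whose opcode is
- * I2400M_BRH_HASH_PAYLOAD_ONLY and whose payload is the BCF header
- * itself; a rough sketch of the frame layout built below:
- *
- *   struct {
- *           struct i2400m_bootrom_header cmd;
- *           struct i2400m_bcf_hdr cmd_pl;
- *   } __packed frame;
- *
- * with cmd.data_size set to cpu_to_le32(sizeof(cmd_pl)) and
- * cmd.target_addr set to 0.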
- */ -static -int i2400m_dnload_init_signed(struct i2400m *i2400m, - const struct i2400m_bcf_hdr *bcf_hdr) -{ - int ret; - struct device *dev = i2400m_dev(i2400m); - struct { - struct i2400m_bootrom_header cmd; - struct i2400m_bcf_hdr cmd_pl; - } __packed *cmd_buf; - struct i2400m_bootrom_header ack; - - d_fnstart(5, dev, "(i2400m %p bcf_hdr %p)\n", i2400m, bcf_hdr); - cmd_buf = i2400m->bm_cmd_buf; - cmd_buf->cmd.command = - i2400m_brh_command(I2400M_BRH_HASH_PAYLOAD_ONLY, 0, 0); - cmd_buf->cmd.target_addr = 0; - cmd_buf->cmd.data_size = cpu_to_le32(sizeof(cmd_buf->cmd_pl)); - memcpy(&cmd_buf->cmd_pl, bcf_hdr, sizeof(*bcf_hdr)); - ret = i2400m_bm_cmd(i2400m, &cmd_buf->cmd, sizeof(*cmd_buf), - &ack, sizeof(ack), 0); - if (ret >= 0) - ret = 0; - d_fnend(5, dev, "(i2400m %p bcf_hdr %p) = %d\n", i2400m, bcf_hdr, ret); - return ret; -} - - -/* - * Initialize the firmware download at the device size - * - * Multiplex to the one that matters based on the device's mode - * (signed or non-signed). - */ -static -int i2400m_dnload_init(struct i2400m *i2400m, - const struct i2400m_bcf_hdr *bcf_hdr) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - - if (i2400m_boot_is_signed(i2400m)) { - d_printf(1, dev, "signed boot\n"); - result = i2400m_dnload_init_signed(i2400m, bcf_hdr); - if (result == -ERESTARTSYS) - return result; - if (result < 0) - dev_err(dev, "firmware %s: signed boot download " - "initialization failed: %d\n", - i2400m->fw_name, result); - } else { - /* non-signed boot process without pokes */ - d_printf(1, dev, "non-signed boot\n"); - result = i2400m_dnload_init_nonsigned(i2400m); - if (result == -ERESTARTSYS) - return result; - if (result < 0) - dev_err(dev, "firmware %s: non-signed download " - "initialization failed: %d\n", - i2400m->fw_name, result); - } - return result; -} - - -/* - * Run consistency tests on the firmware file and load up headers - * - * Check for the firmware being made for the i2400m device, - * etc...These checks are mostly informative, as the device will make - * them too; but the driver's response is more informative on what - * went wrong. - * - * This will also look at all the headers present on the firmware - * file, and update i2400m->fw_bcf_hdr to point to them. - */ -static -int i2400m_fw_hdr_check(struct i2400m *i2400m, - const struct i2400m_bcf_hdr *bcf_hdr, - size_t index, size_t offset) -{ - struct device *dev = i2400m_dev(i2400m); - - unsigned module_type, header_len, major_version, minor_version, - module_id, module_vendor, date, size; - - module_type = le32_to_cpu(bcf_hdr->module_type); - header_len = sizeof(u32) * le32_to_cpu(bcf_hdr->header_len); - major_version = (le32_to_cpu(bcf_hdr->header_version) & 0xffff0000) - >> 16; - minor_version = le32_to_cpu(bcf_hdr->header_version) & 0x0000ffff; - module_id = le32_to_cpu(bcf_hdr->module_id); - module_vendor = le32_to_cpu(bcf_hdr->module_vendor); - date = le32_to_cpu(bcf_hdr->date); - size = sizeof(u32) * le32_to_cpu(bcf_hdr->size); - - d_printf(1, dev, "firmware %s #%zd@%08zx: BCF header " - "type:vendor:id 0x%x:%x:%x v%u.%u (%u/%u B) built %08x\n", - i2400m->fw_name, index, offset, - module_type, module_vendor, module_id, - major_version, minor_version, header_len, size, date); - - /* Hard errors */ - if (major_version != 1) { - dev_err(dev, "firmware %s #%zd@%08zx: major header version " - "v%u.%u not supported\n", - i2400m->fw_name, index, offset, - major_version, minor_version); - return -EBADF; - } - - if (module_type != 6) { /* built for the right hardware? 
*/ - dev_err(dev, "firmware %s #%zd@%08zx: unexpected module " - "type 0x%x; aborting\n", - i2400m->fw_name, index, offset, - module_type); - return -EBADF; - } - - if (module_vendor != 0x8086) { - dev_err(dev, "firmware %s #%zd@%08zx: unexpected module " - "vendor 0x%x; aborting\n", - i2400m->fw_name, index, offset, module_vendor); - return -EBADF; - } - - if (date < 0x20080300) - dev_warn(dev, "firmware %s #%zd@%08zx: build date %08x " - "too old; unsupported\n", - i2400m->fw_name, index, offset, date); - return 0; -} - - -/* - * Run consistency tests on the firmware file and load up headers - * - * Check for the firmware being made for the i2400m device, - * etc...These checks are mostly informative, as the device will make - * them too; but the driver's response is more informative on what - * went wrong. - * - * This will also look at all the headers present on the firmware - * file, and update i2400m->fw_hdrs to point to them. - */ -static -int i2400m_fw_check(struct i2400m *i2400m, const void *bcf, size_t bcf_size) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - size_t headers = 0; - const struct i2400m_bcf_hdr *bcf_hdr; - const void *itr, *next, *top; - size_t slots = 0, used_slots = 0; - - for (itr = bcf, top = itr + bcf_size; - itr < top; - headers++, itr = next) { - size_t leftover, offset, header_len, size; - - leftover = top - itr; - offset = itr - bcf; - if (leftover <= sizeof(*bcf_hdr)) { - dev_err(dev, "firmware %s: %zu B left at @%zx, " - "not enough for BCF header\n", - i2400m->fw_name, leftover, offset); - break; - } - bcf_hdr = itr; - /* Only the first header is supposed to be followed by - * payload */ - header_len = sizeof(u32) * le32_to_cpu(bcf_hdr->header_len); - size = sizeof(u32) * le32_to_cpu(bcf_hdr->size); - if (headers == 0) - next = itr + size; - else - next = itr + header_len; - - result = i2400m_fw_hdr_check(i2400m, bcf_hdr, headers, offset); - if (result < 0) - continue; - if (used_slots + 1 >= slots) { - /* +1 -> we need to account for the one we'll - * occupy and at least an extra one for - * always being NULL */ - result = i2400m_zrealloc_2x( - (void **) &i2400m->fw_hdrs, &slots, - sizeof(i2400m->fw_hdrs[0]), - GFP_KERNEL); - if (result < 0) - goto error_zrealloc; - } - i2400m->fw_hdrs[used_slots] = bcf_hdr; - used_slots++; - } - if (headers == 0) { - dev_err(dev, "firmware %s: no usable headers found\n", - i2400m->fw_name); - result = -EBADF; - } else - result = 0; -error_zrealloc: - return result; -} - - -/* - * Match a barker to a BCF header module ID - * - * The device sends a barker which tells the firmware loader which - * header in the BCF file has to be used. This does the matching. 
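- *
- * Both values are compared with the top bit masked off (it is used
- * for something else); a condensed sketch of the test implemented
- * below:
- *
- *   barker = le32_to_cpu(i2400m->barker->data[0]) & 0x7fffffff;
- *   module_id = le32_to_cpu(bcf_hdr->module_id) & 0x7fffffff;
- *   matched = module_id == barker
- *             || (barker == I2400M_SBOOT_BARKER && module_id == 0);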
- */ -static -unsigned i2400m_bcf_hdr_match(struct i2400m *i2400m, - const struct i2400m_bcf_hdr *bcf_hdr) -{ - u32 barker = le32_to_cpu(i2400m->barker->data[0]) - & 0x7fffffff; - u32 module_id = le32_to_cpu(bcf_hdr->module_id) - & 0x7fffffff; /* high bit used for something else */ - - /* special case for 5x50 */ - if (barker == I2400M_SBOOT_BARKER && module_id == 0) - return 1; - if (module_id == barker) - return 1; - return 0; -} - -static -const struct i2400m_bcf_hdr *i2400m_bcf_hdr_find(struct i2400m *i2400m) -{ - struct device *dev = i2400m_dev(i2400m); - const struct i2400m_bcf_hdr **bcf_itr, *bcf_hdr; - unsigned i = 0; - u32 barker = le32_to_cpu(i2400m->barker->data[0]); - - d_printf(2, dev, "finding BCF header for barker %08x\n", barker); - if (barker == I2400M_NBOOT_BARKER) { - bcf_hdr = i2400m->fw_hdrs[0]; - d_printf(1, dev, "using BCF header #%u/%08x for non-signed " - "barker\n", 0, le32_to_cpu(bcf_hdr->module_id)); - return bcf_hdr; - } - for (bcf_itr = i2400m->fw_hdrs; *bcf_itr != NULL; bcf_itr++, i++) { - bcf_hdr = *bcf_itr; - if (i2400m_bcf_hdr_match(i2400m, bcf_hdr)) { - d_printf(1, dev, "hit on BCF hdr #%u/%08x\n", - i, le32_to_cpu(bcf_hdr->module_id)); - return bcf_hdr; - } else - d_printf(1, dev, "miss on BCF hdr #%u/%08x\n", - i, le32_to_cpu(bcf_hdr->module_id)); - } - dev_err(dev, "cannot find a matching BCF header for barker %08x\n", - barker); - return NULL; -} - - -/* - * Download the firmware to the device - * - * @i2400m: device descriptor - * @bcf: pointer to loaded (and minimally verified for consistency) - * firmware - * @bcf_size: size of the @bcf buffer (header plus payloads) - * - * The process for doing this is described in this file's header. - * - * Note we only reinitialize boot-mode if the flags say so. Some hw - * iterations need it, some don't. In any case, if we loop, we always - * need to reinitialize the boot room, hence the flags modification. - */ -static -int i2400m_fw_dnload(struct i2400m *i2400m, const struct i2400m_bcf_hdr *bcf, - size_t fw_size, enum i2400m_bri flags) -{ - int ret = 0; - struct device *dev = i2400m_dev(i2400m); - int count = i2400m->bus_bm_retries; - const struct i2400m_bcf_hdr *bcf_hdr; - size_t bcf_size; - - d_fnstart(5, dev, "(i2400m %p bcf %p fw size %zu)\n", - i2400m, bcf, fw_size); - i2400m->boot_mode = 1; - wmb(); /* Make sure other readers see it */ -hw_reboot: - if (count-- == 0) { - ret = -ERESTARTSYS; - dev_err(dev, "device rebooted too many times, aborting\n"); - goto error_too_many_reboots; - } - if (flags & I2400M_BRI_MAC_REINIT) { - ret = i2400m_bootrom_init(i2400m, flags); - if (ret < 0) { - dev_err(dev, "bootrom init failed: %d\n", ret); - goto error_bootrom_init; - } - } - flags |= I2400M_BRI_MAC_REINIT; - - /* - * Initialize the download, push the bytes to the device and - * then jump to the new firmware. Note @ret is passed with the - * offset of the jump instruction to _dnload_finalize() - * - * Note we need to use the BCF header in the firmware image - * that matches the barker that the device sent when it - * rebooted, so it has to be passed along. - */ - ret = -EBADF; - bcf_hdr = i2400m_bcf_hdr_find(i2400m); - if (bcf_hdr == NULL) - goto error_bcf_hdr_find; - - ret = i2400m_dnload_init(i2400m, bcf_hdr); - if (ret == -ERESTARTSYS) - goto error_dev_rebooted; - if (ret < 0) - goto error_dnload_init; - - /* - * bcf_size refers to one header size plus the fw sections size - * indicated by the header,ie. 
if there are other extended headers - * at the tail, they are not counted - */ - bcf_size = sizeof(u32) * le32_to_cpu(bcf_hdr->size); - ret = i2400m_dnload_bcf(i2400m, bcf, bcf_size); - if (ret == -ERESTARTSYS) - goto error_dev_rebooted; - if (ret < 0) { - dev_err(dev, "fw %s: download failed: %d\n", - i2400m->fw_name, ret); - goto error_dnload_bcf; - } - - ret = i2400m_dnload_finalize(i2400m, bcf_hdr, bcf, ret); - if (ret == -ERESTARTSYS) - goto error_dev_rebooted; - if (ret < 0) { - dev_err(dev, "fw %s: " - "download finalization failed: %d\n", - i2400m->fw_name, ret); - goto error_dnload_finalize; - } - - d_printf(2, dev, "fw %s successfully uploaded\n", - i2400m->fw_name); - i2400m->boot_mode = 0; - wmb(); /* Make sure i2400m_msg_to_dev() sees boot_mode */ -error_dnload_finalize: -error_dnload_bcf: -error_dnload_init: -error_bcf_hdr_find: -error_bootrom_init: -error_too_many_reboots: - d_fnend(5, dev, "(i2400m %p bcf %p size %zu) = %d\n", - i2400m, bcf, fw_size, ret); - return ret; - -error_dev_rebooted: - dev_err(dev, "device rebooted, %d tries left\n", count); - /* we got the notification already, no need to wait for it again */ - flags |= I2400M_BRI_SOFT; - goto hw_reboot; -} - -static -int i2400m_fw_bootstrap(struct i2400m *i2400m, const struct firmware *fw, - enum i2400m_bri flags) -{ - int ret; - struct device *dev = i2400m_dev(i2400m); - const struct i2400m_bcf_hdr *bcf; /* Firmware data */ - - d_fnstart(5, dev, "(i2400m %p)\n", i2400m); - bcf = (void *) fw->data; - ret = i2400m_fw_check(i2400m, bcf, fw->size); - if (ret >= 0) - ret = i2400m_fw_dnload(i2400m, bcf, fw->size, flags); - if (ret < 0) - dev_err(dev, "%s: cannot use: %d, skipping\n", - i2400m->fw_name, ret); - kfree(i2400m->fw_hdrs); - i2400m->fw_hdrs = NULL; - d_fnend(5, dev, "(i2400m %p) = %d\n", i2400m, ret); - return ret; -} - - -/* Refcounted container for firmware data */ -struct i2400m_fw { - struct kref kref; - const struct firmware *fw; -}; - - -static -void i2400m_fw_destroy(struct kref *kref) -{ - struct i2400m_fw *i2400m_fw = - container_of(kref, struct i2400m_fw, kref); - release_firmware(i2400m_fw->fw); - kfree(i2400m_fw); -} - - -static -struct i2400m_fw *i2400m_fw_get(struct i2400m_fw *i2400m_fw) -{ - if (i2400m_fw != NULL && i2400m_fw != (void *) ~0) - kref_get(&i2400m_fw->kref); - return i2400m_fw; -} - - -static -void i2400m_fw_put(struct i2400m_fw *i2400m_fw) -{ - kref_put(&i2400m_fw->kref, i2400m_fw_destroy); -} - - -/** - * i2400m_dev_bootstrap - Bring the device to a known state and upload firmware - * - * @i2400m: device descriptor - * - * Returns: >= 0 if ok, < 0 errno code on error. - * - * This sets up the firmware upload environment, loads the firmware - * file from disk, verifies and then calls the firmware upload process - * per se. - * - * Can be called either from probe, or after a warm reset. Can not be - * called from within an interrupt. All the flow in this code is - * single-threade; all I/Os are synchronous. 
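- *
- * Stripped of the firmware-cache short circuit and the error
- * handling, the image selection loop below reduces to this sketch:
- *
- *   for (itr = 0; (fw_name = i2400m->bus_fw_names[itr]); itr++) {
- *           if (request_firmware(&fw, fw_name, dev) < 0)
- *                   continue;
- *           ret = i2400m_fw_bootstrap(i2400m, fw, flags);
- *           release_firmware(fw);
- *           if (ret >= 0)
- *                   break;
- *   }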
- */ -int i2400m_dev_bootstrap(struct i2400m *i2400m, enum i2400m_bri flags) -{ - int ret, itr; - struct device *dev = i2400m_dev(i2400m); - struct i2400m_fw *i2400m_fw; - const struct firmware *fw; - const char *fw_name; - - d_fnstart(5, dev, "(i2400m %p)\n", i2400m); - - ret = -ENODEV; - spin_lock(&i2400m->rx_lock); - i2400m_fw = i2400m_fw_get(i2400m->fw_cached); - spin_unlock(&i2400m->rx_lock); - if (i2400m_fw == (void *) ~0) { - dev_err(dev, "can't load firmware now!"); - goto out; - } else if (i2400m_fw != NULL) { - dev_info(dev, "firmware %s: loading from cache\n", - i2400m->fw_name); - ret = i2400m_fw_bootstrap(i2400m, i2400m_fw->fw, flags); - i2400m_fw_put(i2400m_fw); - goto out; - } - - /* Load firmware files to memory. */ - for (itr = 0, ret = -ENOENT; ; itr++) { - fw_name = i2400m->bus_fw_names[itr]; - if (fw_name == NULL) { - dev_err(dev, "Could not find a usable firmware image\n"); - break; - } - d_printf(1, dev, "trying firmware %s (%d)\n", fw_name, itr); - ret = request_firmware(&fw, fw_name, dev); - if (ret < 0) { - dev_err(dev, "fw %s: cannot load file: %d\n", - fw_name, ret); - continue; - } - i2400m->fw_name = fw_name; - ret = i2400m_fw_bootstrap(i2400m, fw, flags); - release_firmware(fw); - if (ret >= 0) /* firmware loaded successfully */ - break; - i2400m->fw_name = NULL; - } -out: - d_fnend(5, dev, "(i2400m %p) = %d\n", i2400m, ret); - return ret; -} -EXPORT_SYMBOL_GPL(i2400m_dev_bootstrap); - - -void i2400m_fw_cache(struct i2400m *i2400m) -{ - int result; - struct i2400m_fw *i2400m_fw; - struct device *dev = i2400m_dev(i2400m); - - /* if there is anything there, free it -- now, this'd be weird */ - spin_lock(&i2400m->rx_lock); - i2400m_fw = i2400m->fw_cached; - spin_unlock(&i2400m->rx_lock); - if (i2400m_fw != NULL && i2400m_fw != (void *) ~0) { - i2400m_fw_put(i2400m_fw); - WARN(1, "%s:%u: still cached fw still present?\n", - __func__, __LINE__); - } - - if (i2400m->fw_name == NULL) { - dev_err(dev, "firmware n/a: can't cache\n"); - i2400m_fw = (void *) ~0; - goto out; - } - - i2400m_fw = kzalloc(sizeof(*i2400m_fw), GFP_ATOMIC); - if (i2400m_fw == NULL) - goto out; - kref_init(&i2400m_fw->kref); - result = request_firmware(&i2400m_fw->fw, i2400m->fw_name, dev); - if (result < 0) { - dev_err(dev, "firmware %s: failed to cache: %d\n", - i2400m->fw_name, result); - kfree(i2400m_fw); - i2400m_fw = (void *) ~0; - } else - dev_info(dev, "firmware %s: cached\n", i2400m->fw_name); -out: - spin_lock(&i2400m->rx_lock); - i2400m->fw_cached = i2400m_fw; - spin_unlock(&i2400m->rx_lock); -} - - -void i2400m_fw_uncache(struct i2400m *i2400m) -{ - struct i2400m_fw *i2400m_fw; - - spin_lock(&i2400m->rx_lock); - i2400m_fw = i2400m->fw_cached; - i2400m->fw_cached = NULL; - spin_unlock(&i2400m->rx_lock); - - if (i2400m_fw != NULL && i2400m_fw != (void *) ~0) - i2400m_fw_put(i2400m_fw); -} - diff --git a/drivers/net/wimax/i2400m/i2400m-usb.h b/drivers/net/wimax/i2400m/i2400m-usb.h deleted file mode 100644 index eff4f464a23e..000000000000 --- a/drivers/net/wimax/i2400m/i2400m-usb.h +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Intel Wireless WiMAX Connection 2400m - * USB-specific i2400m driver definitions - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * Intel Corporation
- * Inaky Perez-Gonzalez
- * Yanir Lubetkin
- *  - Initial implementation
- *
- *
- * This driver implements the bus-specific part of the i2400m for
- * USB. Check i2400m.h for a generic driver description.
- *
- * ARCHITECTURE
- *
- * This driver listens to notifications sent from the notification
- * endpoint (in usb-notif.c); when data is ready to read, the code in
- * there schedules a read from the device (usb-rx.c) and then passes
- * the data to the generic RX code (rx.c).
- *
- * When the generic driver needs to send data (network or control), it
- * queues up in the TX FIFO (tx.c) and that will notify the driver
- * through the i2400m->bus_tx_kick() callback
- * (usb-tx.c:i2400mu_bus_tx_kick) which will send the items in the
- * FIFO queue.
- *
- * This driver, as well, implements the USB-specific ops for the generic
- * driver to be able to setup/teardown communication with the device
- * [i2400m_bus_dev_start() and i2400m_bus_dev_stop()], resetting the
- * device [i2400m_bus_reset()] and performing firmware upload
- * [i2400m_bus_bm_cmd() and i2400m_bus_bm_wait_for_ack()].
- */
-
-#ifndef __I2400M_USB_H__
-#define __I2400M_USB_H__
-
-#include "i2400m.h"
-#include
-
-
-/*
- * Error Density Count: cheapo error density (over time) counter
- *
- * Originally by Reinette Chatre
- *
- * Embed a 'struct edc' somewhere. Each time there is a soft or
- * retryable error, call edc_inc() and check if the error top
- * watermark has been reached.
- */
-enum {
-	EDC_MAX_ERRORS = 10,
-	EDC_ERROR_TIMEFRAME = HZ,
-};
-
-/* error density counter */
-struct edc {
-	unsigned long timestart;
-	u16 errorcount;
-};
-
-struct i2400m_endpoint_cfg {
-	unsigned char bulk_out;
-	unsigned char notification;
-	unsigned char reset_cold;
-	unsigned char bulk_in;
-};
-
-static inline void edc_init(struct edc *edc)
-{
-	edc->timestart = jiffies;
-}
-
-/**
- * edc_inc - report a soft error and check if we are over the watermark
- *
- * @edc: pointer to error density counter.
- * @max_err: maximum number of errors we can accept over the timeframe
- * @timeframe: length of the timeframe (in jiffies).
- *
- * Returns: !0 (1) if the maximum number of acceptable errors per
- * timeframe has been exceeded, 0 otherwise.
- *
- * This is a way to determine if the number of acceptable errors per time
- * period has been exceeded. It is not accurate as there are cases in which
- * this scheme will not work, for example if there are periodic occurrences
- * of errors that straddle updates to the start time. This scheme is
- * sufficient for our usage.
- *
- * To use, embed a 'struct edc' somewhere, initialize it with
- * edc_init() and when an error hits:
- *
- * if (do_something_fails_with_a_soft_error) {
- *        if (edc_inc(&my->edc, MAX_ERRORS, MAX_TIMEFRAME))
- *                Oops, hard error, do something about it
- *        else
- *                Retry or ignore, depending on whatever
- * }
- */
-static inline int edc_inc(struct edc *edc, u16 max_err, u16 timeframe)
-{
-	unsigned long now;
-
-	now = jiffies;
-	if (time_after(now, edc->timestart + timeframe)) {
-		edc->errorcount = 1;
-		edc->timestart = now;
-	} else if (++edc->errorcount > max_err) {
-		edc->errorcount = 0;
-		edc->timestart = now;
-		return 1;
-	}
-	return 0;
-}
-
-/* Host-Device interface for USB */
-enum {
-	I2400M_USB_BOOT_RETRIES = 3,
-	I2400MU_MAX_NOTIFICATION_LEN = 256,
-	I2400MU_BLK_SIZE = 16,
-	I2400MU_PL_SIZE_MAX = 0x3EFF,
-
-	/* Device IDs */
-	USB_DEVICE_ID_I6050 = 0x0186,
-	USB_DEVICE_ID_I6050_2 = 0x0188,
-	USB_DEVICE_ID_I6150 = 0x07d6,
-	USB_DEVICE_ID_I6150_2 = 0x07d7,
-	USB_DEVICE_ID_I6150_3 = 0x07d9,
-	USB_DEVICE_ID_I6250 = 0x0187,
-};
-
-
-/**
- * struct i2400mu - descriptor for a USB connected i2400m
- *
- * @i2400m: bus-generic i2400m implementation; has to be first (see
- * its documentation in i2400m.h).
- *
- * @usb_dev: pointer to our USB device
- *
- * @usb_iface: pointer to our USB interface
- *
- * @urb_edc: error density counter; used to keep a density-on-time tab
- * on how many soft (retryable or ignorable) errors we get. If we
- * go over the threshold, we consider the bus transport is failing
- * too much and reset.
- *
- * @notif_urb: URB for receiving notifications from the device.
- *
- * @tx_kthread: thread we use for data TX. We use a thread because in
- * order to do deep power saving and put the device to sleep, we
- * need to call usb_autopm_*() [blocking functions].
- *
- * @tx_wq: waitqueue for the TX kthread to sleep when there is no data
- * to be sent; when more data is available, it is woken up by
- * i2400mu_bus_tx_kick().
- *
- * @rx_kthread: thread we use for data RX. We use a thread because in
- * order to do deep power saving and put the device to sleep, we
- * need to call usb_autopm_*() [blocking functions].
- *
- * @rx_wq: waitqueue for the RX kthread to sleep when there is no data
- * to receive. When data is available, it is woken up by
- * usb-notif.c:i2400mu_notification_grok().
- *
- * @rx_pending_count: number of rx-data-ready notifications that were
- * still not handled by the RX kthread.
- *
- * @rx_size: current RX buffer size that is being used.
- *
- * @rx_size_acc: accumulator of the sizes of the previous read
- * transactions.
- *
- * @rx_size_cnt: number of read transactions accumulated in
- * @rx_size_acc.
- *
- * @do_autopm: disable(0)/enable(>0) calling the
- * usb_autopm_get/put_interface() barriers when executing
- * commands. See doc in i2400mu_suspend() for more information.
- *
- * @rx_size_auto_shrink: if true, the rx_size is shrunk
- * automatically based on the average size of the received
- * transactions. This allows the receive code to allocate smaller
- * chunks of memory and thus reduce pressure on the memory
- * allocator by not wasting so much space. By default it is
- * enabled.
- * - * @debugfs_dentry: hookup for debugfs files. - * These have to be in a separate directory, a child of - * (wimax_dev->debugfs_dentry) so they can be removed when the - * module unloads, as we don't keep each dentry. - */ -struct i2400mu { - struct i2400m i2400m; /* FIRST! See doc */ - - struct usb_device *usb_dev; - struct usb_interface *usb_iface; - struct edc urb_edc; /* Error density counter */ - struct i2400m_endpoint_cfg endpoint_cfg; - - struct urb *notif_urb; - struct task_struct *tx_kthread; - wait_queue_head_t tx_wq; - - struct task_struct *rx_kthread; - wait_queue_head_t rx_wq; - atomic_t rx_pending_count; - size_t rx_size, rx_size_acc, rx_size_cnt; - atomic_t do_autopm; - u8 rx_size_auto_shrink; - - struct dentry *debugfs_dentry; - unsigned i6050:1; /* 1 if this is a 6050 based SKU */ -}; - - -static inline -void i2400mu_init(struct i2400mu *i2400mu) -{ - i2400m_init(&i2400mu->i2400m); - edc_init(&i2400mu->urb_edc); - init_waitqueue_head(&i2400mu->tx_wq); - atomic_set(&i2400mu->rx_pending_count, 0); - init_waitqueue_head(&i2400mu->rx_wq); - i2400mu->rx_size = PAGE_SIZE - sizeof(struct skb_shared_info); - atomic_set(&i2400mu->do_autopm, 1); - i2400mu->rx_size_auto_shrink = 1; -} - -int i2400mu_notification_setup(struct i2400mu *); -void i2400mu_notification_release(struct i2400mu *); - -int i2400mu_rx_setup(struct i2400mu *); -void i2400mu_rx_release(struct i2400mu *); -void i2400mu_rx_kick(struct i2400mu *); - -int i2400mu_tx_setup(struct i2400mu *); -void i2400mu_tx_release(struct i2400mu *); -void i2400mu_bus_tx_kick(struct i2400m *); - -ssize_t i2400mu_bus_bm_cmd_send(struct i2400m *, - const struct i2400m_bootrom_header *, size_t, - int); -ssize_t i2400mu_bus_bm_wait_for_ack(struct i2400m *, - struct i2400m_bootrom_header *, size_t); -#endif /* #ifndef __I2400M_USB_H__ */ diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h deleted file mode 100644 index a3733a6d14f5..000000000000 --- a/drivers/net/wimax/i2400m/i2400m.h +++ /dev/null @@ -1,970 +0,0 @@ -/* - * Intel Wireless WiMAX Connection 2400m - * Declarations for bus-generic internal APIs - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * Intel Corporation
- * Inaky Perez-Gonzalez
- * Yanir Lubetkin
- *  - Initial implementation
- *
- *
- * GENERAL DRIVER ARCHITECTURE
- *
- * The i2400m driver is split in the following two major parts:
- *
- *  - bus specific driver
- *  - bus generic driver (this part)
- *
- * The bus specific driver sets up stuff specific to the bus the
- * device is connected to (USB, PCI, tam-tam...non-authoritative
- * nor binding list) which is basically the device-model management
- * (probe/disconnect, etc), moving data from device to kernel and
- * back, doing the power saving details and resetting the device.
- *
- * For details on each bus-specific driver, see its include file,
- * i2400m-BUSNAME.h
- *
- * The bus-generic functionality break up is:
- *
- *  - Firmware upload: fw.c - takes care of uploading firmware to the
- *    device. bus-specific driver just needs to provide a way to
- *    execute boot-mode commands and to reset the device.
- *
- *  - RX handling: rx.c - receives data from the bus-specific code and
- *    feeds it to the network or WiMAX stack or uses it to modify
- *    the driver state. bus-specific driver only has to receive
- *    frames and pass them to this module.
- *
- *  - TX handling: tx.c - manages the TX FIFO queue and provides means
- *    for the bus-specific TX code to pull data from the FIFO
- *    queue. bus-specific code just pulls frames from this module
- *    to send them to the device.
- *
- *  - netdev glue: netdev.c - interface with Linux networking
- *    stack. Pass around data frames, and configure when the
- *    device is up and running or shutdown (through ifconfig up /
- *    down). Bus-generic only.
- *
- *  - control ops: control.c - implements various commands for
- *    controlling the device. bus-generic only.
- *
- *  - device model glue: driver.c - implements helpers for the
- *    device-model glue done by the bus-specific layer
- *    (setup/release the driver resources), turning the device on
- *    and off, handling the device reboots/resets and a few simple
- *    WiMAX stack ops.
- *
- * Code is also broken up in linux-glue / device-glue.
- *
- * Linux glue contains functions that deal mostly with gluing with the
- * rest of the Linux kernel.
- *
- * Device-glue are functions that deal mostly with the way the device
- * does things and talk the device's language.
- *
- * device-glue code is licensed BSD so other open source OSes can take
- * it to implement their drivers.
- *
- *
- * APIs AND HEADER FILES
- *
- * This bus generic code exports three APIs:
- *
- *  - HDI (host-device interface) definitions common to all busses
- *    (include/linux/wimax/i2400m.h); these can be also used by user
- *    space code.
- *  - internal API for the bus-generic code
- *  - external API for the bus-specific drivers
- *
- *
- * LIFE CYCLE:
- *
- * When the bus-specific driver probes, it allocates a network device
- * with enough space for its data structure, which must contain a
- * &struct i2400m at the top.
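- *
- * A minimal sketch of that containment pattern (the struct and
- * member names here are hypothetical; struct i2400mu in
- * i2400m-usb.h is the real USB example):
- *
- *   struct i2400m_mybus {
- *           struct i2400m i2400m;   has to be first
- *           int bus_private_state;
- *   };
- *
- * so the &struct i2400m can always be recovered from the memory
- * that netdev_priv() hands back.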
- * - * On probe, it needs to fill the i2400m members marked as [fill], as - * well as i2400m->wimax_dev.net_dev and call i2400m_setup(). The - * i2400m driver will only register with the WiMAX and network stacks; - * the only access done to the device is to read the MAC address so we - * can register a network device. - * - * The high-level call flow is: - * - * bus_probe() - * i2400m_setup() - * i2400m->bus_setup() - * boot rom initialization / read mac addr - * network / WiMAX stacks registration - * i2400m_dev_start() - * i2400m->bus_dev_start() - * i2400m_dev_initialize() - * - * The reverse applies for a disconnect() call: - * - * bus_disconnect() - * i2400m_release() - * i2400m_dev_stop() - * i2400m_dev_shutdown() - * i2400m->bus_dev_stop() - * network / WiMAX stack unregistration - * i2400m->bus_release() - * - * At this point, control and data communications are possible. - * - * While the device is up, it might reset. The bus-specific driver has - * to catch that situation and call i2400m_dev_reset_handle() to deal - * with it (reset the internal driver structures and go back to square - * one). - */ - -#ifndef __I2400M_H__ -#define __I2400M_H__ - -#include -#include -#include -#include -#include -#include -#include -#include - -enum { -/* netdev interface */ - /* - * Out of NWG spec (R1_v1.2.2), 3.3.3 ASN Bearer Plane MTU Size - * - * The MTU is 1400 or less - */ - I2400M_MAX_MTU = 1400, -}; - -/* Misc constants */ -enum { - /* Size of the Boot Mode Command buffer */ - I2400M_BM_CMD_BUF_SIZE = 16 * 1024, - I2400M_BM_ACK_BUF_SIZE = 256, -}; - -enum { - /* Maximum number of bus reset can be retried */ - I2400M_BUS_RESET_RETRIES = 3, -}; - -/** - * struct i2400m_poke_table - Hardware poke table for the Intel 2400m - * - * This structure will be used to create a device specific poke table - * to put the device in a consistent state at boot time. - * - * @address: The device address to poke - * - * @data: The data value to poke to the device address - * - */ -struct i2400m_poke_table{ - __le32 address; - __le32 data; -}; - -#define I2400M_FW_POKE(a, d) { \ - .address = cpu_to_le32(a), \ - .data = cpu_to_le32(d) \ -} - - -/** - * i2400m_reset_type - methods to reset a device - * - * @I2400M_RT_WARM: Reset without device disconnection, device handles - * are kept valid but state is back to power on, with firmware - * re-uploaded. - * @I2400M_RT_COLD: Tell the device to disconnect itself from the bus - * and reconnect. Renders all device handles invalid. - * @I2400M_RT_BUS: Tells the bus to reset the device; last measure - * used when both types above don't work. - */ -enum i2400m_reset_type { - I2400M_RT_WARM, /* first measure */ - I2400M_RT_COLD, /* second measure */ - I2400M_RT_BUS, /* call in artillery */ -}; - -struct i2400m_reset_ctx; -struct i2400m_roq; -struct i2400m_barker_db; - -/** - * struct i2400m - descriptor for an Intel 2400m - * - * Members marked with [fill] must be filled out/initialized before - * calling i2400m_setup(). - * - * Note the @bus_setup/@bus_release, @bus_dev_start/@bus_dev_release - * call pairs are very much doing almost the same, and depending on - * the underlying bus, some stuff has to be put in one or the - * other. The idea of setup/release is that they setup the minimal - * amount needed for loading firmware, where us dev_start/stop setup - * the rest needed to do full data/control traffic. - * - * @bus_tx_block_size: [fill] USB imposes a 16 block size, but other - * busses will differ. 
So we have a tx_blk_size variable that the
- * bus layer sets to tell the engine how much of that we need.
- *
- * @bus_tx_room_min: [fill] Minimum room required while allocating
- * TX queue's buffer space for message header. USB requires
- * 16 bytes. Refer to bus specific driver code for details.
- *
- * @bus_pl_size_max: [fill] Maximum payload size.
- *
- * @bus_setup: [optional fill] Function called by the bus-generic code
- * [i2400m_setup()] to setup the basic bus-specific communications
- * to the device needed to load firmware. See LIFE CYCLE above.
- *
- * NOTE: Doesn't need to upload the firmware, as that is taken
- * care of by the bus-generic code.
- *
- * @bus_release: [optional fill] Function called by the bus-generic
- * code [i2400m_release()] to shutdown the basic bus-specific
- * communications to the device needed to load firmware. See
- * LIFE CYCLE above.
- *
- * This function does not need to reset the device, just tear down
- * all the host resources created to handle communication with
- * the device.
- *
- * @bus_dev_start: [optional fill] Function called by the bus-generic
- * code [i2400m_dev_start()] to do things needed to start the
- * device. See LIFE CYCLE above.
- *
- * NOTE: Doesn't need to upload the firmware, as that is taken
- * care of by the bus-generic code.
- *
- * @bus_dev_stop: [optional fill] Function called by the bus-generic
- * code [i2400m_dev_stop()] to do things needed for stopping the
- * device. See LIFE CYCLE above.
- *
- * This function does not need to reset the device, just tear down
- * all the host resources created to handle communication with
- * the device.
- *
- * @bus_tx_kick: [fill] Function called by the bus-generic code to let
- * the bus-specific code know that there is data available in the
- * TX FIFO for transmission to the device.
- *
- * This function cannot sleep.
- *
- * @bus_reset: [fill] Function called by the bus-generic code to reset
- * the device in various ways. Doesn't need to wait for the
- * reset to finish.
- *
- * If warm or cold reset fail, this function is expected to do a
- * bus-specific reset (eg: USB reset) to get the device to a
- * working state (even if it implies device disconnection).
- *
- * Note the warm reset is used by the firmware uploader to
- * reinitialize the device.
- *
- * IMPORTANT: this is called very early in the device setup
- * process, so it cannot rely on common infrastructure being laid
- * out.
- *
- * IMPORTANT: don't call reset on RT_BUS with i2400m->init_mutex
- * held, as the .pre/.post reset handlers will deadlock.
- *
- * @bus_bm_retries: [fill] How many times shall a firmware upload /
- * device initialization be retried? Different models of the same
- * device might need different values, hence it is set by the
- * bus-specific driver. Note this value is used in two places,
- * i2400m_fw_dnload() and __i2400m_dev_start(); they won't become
- * multiplicative (__i2400m_dev_start() calling N times
- * i2400m_fw_dnload() and this trying N times to download the
- * firmware), as __i2400m_dev_start() only retries if the
- * firmware crashed while initializing the device (not in the
- * general case).
- *
- * @bus_bm_cmd_send: [fill] Function called to send a boot-mode
- * command. Flags are defined in 'enum i2400m_bm_cmd_flags'. This
- * is synchronous and has to return 0 if ok or < 0 errno code in
- * any error condition; see the call sketch right after this entry.
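- *
- * A condensed sketch of how the firmware loader drives this op
- * together with @bus_bm_wait_for_ack (from fw.c:i2400m_bm_cmd(),
- * error handling elided):
- *
- *   result = i2400m->bus_bm_cmd_send(i2400m, cmd, cmd_size, flags);
- *   if (result >= 0)
- *           result = i2400m->bus_bm_wait_for_ack(i2400m, ack,
- *                                                ack_size);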
- *
- * @bus_bm_wait_for_ack: [fill] Function called to wait for a
- * boot-mode notification (that can be a response to a previously
- * issued command or an asynchronous one). Will read until all the
- * indicated size is read or timeout. Reading more or less data
- * than asked for is an error condition. Return 0 if ok, < 0 errno
- * code on error.
- *
- * The caller to this function will check if the response is a
- * barker that indicates the device going into reset mode.
- *
- * @bus_fw_names: [fill] a NULL-terminated array with the names of the
- * firmware images to try loading. This is made a list so we can
- * support backward compatibility of firmware releases (eg: if we
- * can't find the default v1.4, we try v1.3). In general, the name
- * should be i2400m-fw-X-VERSION.sbcf, where X is the bus name.
- * The list is tried in order and the first one that loads is
- * used. The fw loader will set i2400m->fw_name to point to the
- * active firmware image.
- *
- * @bus_bm_mac_addr_impaired: [fill] Set to true if the device's MAC
- * address provided in boot mode is kind of broken and needs to
- * be re-read later on.
- *
- * @bus_bm_pokes_table: [fill/optional] A table of device addresses
- * and values that will be poked at device init time to move the
- * device to the correct state for the type of boot/firmware being
- * used. This table MUST be terminated with (0x000000,
- * 0x00000000) or bad things will happen.
- *
- *
- * @wimax_dev: WiMAX generic device for linkage into the kernel WiMAX
- * stack. Due to the way a net_device is allocated, we need to
- * force this to be the first field so that we can get from
- * netdev_priv() the right pointer.
- *
- * @updown: the device is up and ready for transmitting control and
- * data packets. This implies @ready (communication infrastructure
- * with the device is ready) and the device's firmware has been
- * loaded and the device initialized.
- *
- * Write to it only inside a i2400m->init_mutex protected area
- * followed with a wmb(); rmb() before accessing (unless locked
- * inside i2400m->init_mutex). Read access can be loose like that
- * [just using rmb()] because the paths that use this also do
- * other error checks later on.
- *
- * @ready: Communication infrastructure with the device is ready, data
- * frames can start to be passed around (this is lighter than
- * using the WiMAX state for certain hot paths).
- *
- * Write to it only inside a i2400m->init_mutex protected area
- * followed with a wmb(); rmb() before accessing (unless locked
- * inside i2400m->init_mutex). Read access can be loose like that
- * [just using rmb()] because the paths that use this also do
- * other error checks later on.
- *
- * @rx_reorder: 1 if RX reordering is enabled; this can only be
- * set at probe time.
- *
- * @state: device's state (as reported by it)
- *
- * @state_wq: waitqueue that is woken up whenever the state changes
- *
- * @tx_lock: spinlock to protect TX members
- *
- * @tx_buf: FIFO buffer for TX; we queue data here
- *
- * @tx_in: FIFO index for incoming data. Note this doesn't wrap around
- * and it is always greater than @tx_out.
- *
- * @tx_out: FIFO index for outgoing data
- *
- * @tx_msg: current TX message that is active in the FIFO for
- * appending payloads.
- *
- * @tx_sequence: current sequence number for TX messages from the
- * device to the host.
- *
- * @tx_msg_size: size of the current message being transmitted by the
- * bus-specific code.
- * - * @tx_pl_num: total number of payloads sent - * - * @tx_pl_max: maximum number of payloads sent in a TX message - * - * @tx_pl_min: minimum number of payloads sent in a TX message - * - * @tx_num: number of TX messages sent - * - * @tx_size_acc: number of bytes in all TX messages sent - * (this is different from net_dev's statistics as it also counts - * control messages). - * - * @tx_size_min: smallest TX message sent. - * - * @tx_size_max: biggest TX message sent. - * - * @rx_lock: spinlock to protect RX members and rx_roq_refcount. - * - * @rx_pl_num: total number of payloads received - * - * @rx_pl_max: maximum number of payloads received in a RX message - * - * @rx_pl_min: minimum number of payloads received in a RX message - * - * @rx_num: number of RX messages received - * - * @rx_size_acc: number of bytes in all RX messages received - * (this is different from net_dev's statistics as it also counts - * control messages). - * - * @rx_size_min: smallest RX message received. - * - * @rx_size_max: biggest RX message received. - * - * @rx_roq: RX ReOrder queues. (fw >= v1.4) When packets are received - * out of order, the device will ask the driver to hold certain - * packets until the ones that are received out of order can be - * delivered. Then the driver can release them to the host. See - * drivers/net/i2400m/rx.c for details. - * - * @rx_roq_refcount: refcount for rx_roq. This refcounts any access to - * rx_roq, thus preventing rx_roq from being destroyed while it - * is being accessed. rx_roq_refcount is protected by rx_lock. - * - * @rx_reports: reports received from the device that couldn't be - * processed because the driver wasn't ready yet; when ready, - * they are pulled from here and processed. - * - * @rx_report_ws: Work struct used to kick a scan of the RX reports - * list and to process each. - * - * @src_mac_addr: MAC address used as the source of the fake ethernet - * headers. This is generated at i2400m_setup() time and used during - * the life cycle of the instance. See i2400m_fake_eth_header(). - * - * @init_mutex: Mutex used for serializing the device bringup - * sequence; this way if the device reboots in the middle, we - * don't try to do a bringup again while we are tearing down the - * one that failed. - * - * Can't reuse @msg_mutex because from within the bringup sequence - * we need to send messages to the device and thus use @msg_mutex. - * - * @msg_mutex: mutex used to send control commands to the device (we - * only allow one at a time, per host-device interface design). - * - * @msg_completion: used to wait for an ack to a control command sent - * to the device. - * - * @ack_skb: used to store the actual ack to a control command if the - * reception of the command was successful. Otherwise, an ERR_PTR() - * errno code that indicates what failed with the ack reception. - * - * Only valid after @msg_completion is woken up. Only updateable - * if @msg_completion is armed. Only touched by - * i2400m_msg_to_dev(). - * - * Protected by @rx_lock. In theory the command execution flow is - * sequential, but in case the device sends an out-of-phase or - * very delayed response, we need to avoid it trampling current - * execution. - * - * @bm_cmd_buf: boot mode command buffer for composing firmware upload - * commands. - * - * USB can't r/w to stack, vmalloc, etc... As well, we end up - * having to alloc/free a lot to compose commands, so we use these - * for staging and not having to realloc all the time. - * - * This assumes the code always runs serialized.
Only one thread - * can call i2400m_bm_cmd() at the same time. - * - * @bm_ack_buf: boot mode acknowledge buffer for staging reception of - * responses to commands. - * - * See @bm_cmd_buf. - * - * @work_queue: work queue for processing device reports. This - * workqueue cannot be used for processing TX or RX to the device, - * as from it we'll process device reports, which might require - * further communication with the device. - * - * @debugfs_dentry: hookup for debugfs files. - * These have to be in a separate directory, a child of - * (wimax_dev->debugfs_dentry) so they can be removed when the - * module unloads, as we don't keep each dentry. - * - * @fw_name: name of the firmware image that is currently being used. - * - * @fw_version: version of the firmware interface, Major.minor, - * encoded in the high word and low word (major << 16 | minor). - * - * @fw_hdrs: NULL-terminated array of pointers to the firmware - * headers. This is only available during firmware load time. - * - * @fw_cached: Used to cache firmware when the system goes to - * suspend/standby/hibernation (as on resume we can't read it). If - * NULL, no firmware was cached, read it. If ~0, you can't read - * any firmware files (the system didn't come out of suspend yet - * and failed to cache one), so abort; otherwise, a valid cached - * firmware to be used. Access to this variable is protected by - * the spinlock i2400m->rx_lock. - * - * @barker: barker type that the device uses; this is initialized by - * i2400m_is_boot_barker() the first time it is called. Then it - * won't change during the life cycle of the device and every time - * a boot barker is received, it is just verified to be the - * same. - * - * @pm_notifier: used to register for PM events - * - * @bus_reset_retries: counter for the number of bus resets attempted for - * this boot. It's not for tracking the number of bus resets during - * the whole driver life cycle (from insmod to rmmod) but for the - * number of dev_start() calls executed until dev_start() returns a success - * (ie: a good boot means a dev_stop() followed by a successful - * dev_start()). dev_reset_handle() increments this counter whenever - * it is triggering a bus reset. It checks this counter to decide if a - * subsequent bus reset should be retried. dev_reset_handle() retries - * the bus reset until dev_start() succeeds or the counter reaches - * I2400M_BUS_RESET_RETRIES. The counter is cleared to 0 in - * dev_reset_handle() when dev_start() returns a success, - * ie: a successful boot is completed. - * - * @alive: flag to denote if the device *should* be alive. This flag is - * exactly like @updown (see doc for @updown) except reflecting - * the device state *we expect* rather than the actual state as denoted - * by @updown. It is set to 1 whenever @updown is set to 1 in dev_start(). - * Then the device is expected to be alive all the time - * (i2400m->alive remains 1) until the driver is removed. Therefore - * all the device reboot events detected can still be handled properly - * by either dev_reset_handle() or .pre_reset/.post_reset as long as - * the driver is present. It is set to 0 along with @updown in dev_stop(). - * - * @error_recovery: flag to denote if we are ready to take an error recovery. - * 0 for ready to take an error recovery; 1 for not ready. It is - * initialized to 1 in probe() since we don't attempt any error - * recovery during probe(). It is decremented by 1 whenever dev_start() - * succeeds to indicate we are ready to take error recovery from now on.
- * It is checked every time we want to schedule an error recovery. If an - * error recovery is already in place (error_recovery was set to 1), we - * should not schedule another one until the last one is done. - */ -struct i2400m { - struct wimax_dev wimax_dev; /* FIRST! See doc */ - - unsigned updown:1; /* Network device is up or down */ - unsigned boot_mode:1; /* is the device in boot mode? */ - unsigned sboot:1; /* signed or unsigned fw boot */ - unsigned ready:1; /* Device comm infrastructure ready */ - unsigned rx_reorder:1; /* RX reorder is enabled */ - u8 trace_msg_from_user; /* echo rx msgs to 'trace' pipe */ - /* typed u8 so /sys/kernel/debug/u8 can tweak */ - enum i2400m_system_state state; - wait_queue_head_t state_wq; /* Woken up on state updates */ - - size_t bus_tx_block_size; - size_t bus_tx_room_min; - size_t bus_pl_size_max; - unsigned bus_bm_retries; - - int (*bus_setup)(struct i2400m *); - int (*bus_dev_start)(struct i2400m *); - void (*bus_dev_stop)(struct i2400m *); - void (*bus_release)(struct i2400m *); - void (*bus_tx_kick)(struct i2400m *); - int (*bus_reset)(struct i2400m *, enum i2400m_reset_type); - ssize_t (*bus_bm_cmd_send)(struct i2400m *, - const struct i2400m_bootrom_header *, - size_t, int flags); - ssize_t (*bus_bm_wait_for_ack)(struct i2400m *, - struct i2400m_bootrom_header *, size_t); - const char **bus_fw_names; - unsigned bus_bm_mac_addr_impaired:1; - const struct i2400m_poke_table *bus_bm_pokes_table; - - spinlock_t tx_lock; /* protect TX state */ - void *tx_buf; - size_t tx_in, tx_out; - struct i2400m_msg_hdr *tx_msg; - size_t tx_sequence, tx_msg_size; - /* TX stats */ - unsigned tx_pl_num, tx_pl_max, tx_pl_min, - tx_num, tx_size_acc, tx_size_min, tx_size_max; - - /* RX stuff */ - /* protect RX state and rx_roq_refcount */ - spinlock_t rx_lock; - unsigned rx_pl_num, rx_pl_max, rx_pl_min, - rx_num, rx_size_acc, rx_size_min, rx_size_max; - struct i2400m_roq *rx_roq; /* access is refcounted */ - struct kref rx_roq_refcount; /* refcount access to rx_roq */ - u8 src_mac_addr[ETH_HLEN]; - struct list_head rx_reports; /* under rx_lock!
*/ - struct work_struct rx_report_ws; - - struct mutex msg_mutex; /* serialize command execution */ - struct completion msg_completion; - struct sk_buff *ack_skb; /* protected by rx_lock */ - - void *bm_ack_buf; /* for receiving acks over USB */ - void *bm_cmd_buf; /* for issuing commands over USB */ - - struct workqueue_struct *work_queue; - - struct mutex init_mutex; /* protect bringup seq */ - struct i2400m_reset_ctx *reset_ctx; /* protected by init_mutex */ - - struct work_struct wake_tx_ws; - struct sk_buff *wake_tx_skb; - - struct work_struct reset_ws; - const char *reset_reason; - - struct work_struct recovery_ws; - - struct dentry *debugfs_dentry; - const char *fw_name; /* name of the current firmware image */ - unsigned long fw_version; /* version of the firmware interface */ - const struct i2400m_bcf_hdr **fw_hdrs; - struct i2400m_fw *fw_cached; /* protected by rx_lock */ - struct i2400m_barker_db *barker; - - struct notifier_block pm_notifier; - - /* counting bus reset retries in this boot */ - atomic_t bus_reset_retries; - - /* if the device is expected to be alive */ - unsigned alive; - - /* 0 if we are ready for error recovery; 1 if not ready */ - atomic_t error_recovery; - -}; - - -/* - * Bus-generic internal APIs - * ------------------------- - */ - -static inline -struct i2400m *wimax_dev_to_i2400m(struct wimax_dev *wimax_dev) -{ - return container_of(wimax_dev, struct i2400m, wimax_dev); -} - -static inline -struct i2400m *net_dev_to_i2400m(struct net_device *net_dev) -{ - return wimax_dev_to_i2400m(netdev_priv(net_dev)); -} - -/* - * Boot mode support - */ - -/** - * i2400m_bm_cmd_flags - flags to i2400m_bm_cmd() - * - * @I2400M_BM_CMD_RAW: send the command block as-is, without doing any - * extra processing for adding CRC. - */ -enum i2400m_bm_cmd_flags { - I2400M_BM_CMD_RAW = 1 << 2, -}; - -/** - * i2400m_bri - Boot-ROM indicators - * - * Flags for i2400m_bootrom_init() and i2400m_dev_bootstrap() [which - * are passed from things like i2400m_setup()]. Can be combined with - * |. - * - * @I2400M_BRI_SOFT: The device rebooted already and a reboot - * barker was received, proceed directly to ack the boot sequence. - * @I2400M_BRI_NO_REBOOT: Do not reboot the device and proceed - * directly to wait for a reboot barker from the device. - * @I2400M_BRI_MAC_REINIT: We need to reinitialize the boot - * rom after reading the MAC address. This is quite a dirty hack, - * if you ask me -- the device requires the bootrom to be - * initialized after reading the MAC address. - */ -enum i2400m_bri { - I2400M_BRI_SOFT = 1 << 1, - I2400M_BRI_NO_REBOOT = 1 << 2, - I2400M_BRI_MAC_REINIT = 1 << 3, -}; - -void i2400m_bm_cmd_prepare(struct i2400m_bootrom_header *); -int i2400m_dev_bootstrap(struct i2400m *, enum i2400m_bri); -int i2400m_read_mac_addr(struct i2400m *); -int i2400m_bootrom_init(struct i2400m *, enum i2400m_bri); -int i2400m_is_boot_barker(struct i2400m *, const void *, size_t); -static inline -int i2400m_is_d2h_barker(const void *buf) -{ - const __le32 *barker = buf; - return le32_to_cpu(*barker) == I2400M_D2H_MSG_BARKER; -} -void i2400m_unknown_barker(struct i2400m *, const void *, size_t); - -/* Make/grok boot-rom header commands */ - -static inline -__le32 i2400m_brh_command(enum i2400m_brh_opcode opcode, unsigned use_checksum, - unsigned direct_access) -{ - return cpu_to_le32( - I2400M_BRH_SIGNATURE - | (direct_access ? I2400M_BRH_DIRECT_ACCESS : 0) - | I2400M_BRH_RESPONSE_REQUIRED /* response always required */ - | (use_checksum ?
I2400M_BRH_USE_CHECKSUM : 0) - | (opcode & I2400M_BRH_OPCODE_MASK)); -} - -static inline -void i2400m_brh_set_opcode(struct i2400m_bootrom_header *hdr, - enum i2400m_brh_opcode opcode) -{ - hdr->command = cpu_to_le32( - (le32_to_cpu(hdr->command) & ~I2400M_BRH_OPCODE_MASK) - | (opcode & I2400M_BRH_OPCODE_MASK)); -} - -static inline -unsigned i2400m_brh_get_opcode(const struct i2400m_bootrom_header *hdr) -{ - return le32_to_cpu(hdr->command) & I2400M_BRH_OPCODE_MASK; -} - -static inline -unsigned i2400m_brh_get_response(const struct i2400m_bootrom_header *hdr) -{ - return (le32_to_cpu(hdr->command) & I2400M_BRH_RESPONSE_MASK) - >> I2400M_BRH_RESPONSE_SHIFT; -} - -static inline -unsigned i2400m_brh_get_use_checksum(const struct i2400m_bootrom_header *hdr) -{ - return le32_to_cpu(hdr->command) & I2400M_BRH_USE_CHECKSUM; -} - -static inline -unsigned i2400m_brh_get_response_required( - const struct i2400m_bootrom_header *hdr) -{ - return le32_to_cpu(hdr->command) & I2400M_BRH_RESPONSE_REQUIRED; -} - -static inline -unsigned i2400m_brh_get_direct_access(const struct i2400m_bootrom_header *hdr) -{ - return le32_to_cpu(hdr->command) & I2400M_BRH_DIRECT_ACCESS; -} - -static inline -unsigned i2400m_brh_get_signature(const struct i2400m_bootrom_header *hdr) -{ - return (le32_to_cpu(hdr->command) & I2400M_BRH_SIGNATURE_MASK) - >> I2400M_BRH_SIGNATURE_SHIFT; -} - - -/* - * Driver / device setup and internal functions - */ -void i2400m_init(struct i2400m *); -int i2400m_reset(struct i2400m *, enum i2400m_reset_type); -void i2400m_netdev_setup(struct net_device *net_dev); -int i2400m_sysfs_setup(struct device_driver *); -void i2400m_sysfs_release(struct device_driver *); -int i2400m_tx_setup(struct i2400m *); -void i2400m_wake_tx_work(struct work_struct *); -void i2400m_tx_release(struct i2400m *); - -int i2400m_rx_setup(struct i2400m *); -void i2400m_rx_release(struct i2400m *); - -void i2400m_fw_cache(struct i2400m *); -void i2400m_fw_uncache(struct i2400m *); - -void i2400m_net_rx(struct i2400m *, struct sk_buff *, unsigned, const void *, - int); -void i2400m_net_erx(struct i2400m *, struct sk_buff *, enum i2400m_cs); -void i2400m_net_wake_stop(struct i2400m *); -enum i2400m_pt; -int i2400m_tx(struct i2400m *, const void *, size_t, enum i2400m_pt); - -#ifdef CONFIG_DEBUG_FS -void i2400m_debugfs_add(struct i2400m *); -void i2400m_debugfs_rm(struct i2400m *); -#else -static inline void i2400m_debugfs_add(struct i2400m *i2400m) {} -static inline void i2400m_debugfs_rm(struct i2400m *i2400m) {} -#endif - -/* Initialize/shutdown the device */ -int i2400m_dev_initialize(struct i2400m *); -void i2400m_dev_shutdown(struct i2400m *); - -extern struct attribute_group i2400m_dev_attr_group; - - -/* HDI message's payload description handling */ - -static inline -size_t i2400m_pld_size(const struct i2400m_pld *pld) -{ - return I2400M_PLD_SIZE_MASK & le32_to_cpu(pld->val); -} - -static inline -enum i2400m_pt i2400m_pld_type(const struct i2400m_pld *pld) -{ - return (I2400M_PLD_TYPE_MASK & le32_to_cpu(pld->val)) - >> I2400M_PLD_TYPE_SHIFT; -} - -static inline -void i2400m_pld_set(struct i2400m_pld *pld, size_t size, - enum i2400m_pt type) -{ - pld->val = cpu_to_le32( - ((type << I2400M_PLD_TYPE_SHIFT) & I2400M_PLD_TYPE_MASK) - | (size & I2400M_PLD_SIZE_MASK)); -} - - -/* - * API for the bus-specific drivers - * -------------------------------- - */ - -static inline -struct i2400m *i2400m_get(struct i2400m *i2400m) -{ - dev_hold(i2400m->wimax_dev.net_dev); - return i2400m; -} - -static inline -void i2400m_put(struct 
i2400m *i2400m) -{ - dev_put(i2400m->wimax_dev.net_dev); -} - -int i2400m_dev_reset_handle(struct i2400m *, const char *); -int i2400m_pre_reset(struct i2400m *); -int i2400m_post_reset(struct i2400m *); -void i2400m_error_recovery(struct i2400m *); - -/* - * _setup()/_release() are called by the probe/disconnect functions of - * the bus-specific drivers. - */ -int i2400m_setup(struct i2400m *, enum i2400m_bri bm_flags); -void i2400m_release(struct i2400m *); - -int i2400m_rx(struct i2400m *, struct sk_buff *); -struct i2400m_msg_hdr *i2400m_tx_msg_get(struct i2400m *, size_t *); -void i2400m_tx_msg_sent(struct i2400m *); - - -/* - * Utility functions - */ - -static inline -struct device *i2400m_dev(struct i2400m *i2400m) -{ - return i2400m->wimax_dev.net_dev->dev.parent; -} - -int i2400m_msg_check_status(const struct i2400m_l3l4_hdr *, char *, size_t); -int i2400m_msg_size_check(struct i2400m *, const struct i2400m_l3l4_hdr *, - size_t); -struct sk_buff *i2400m_msg_to_dev(struct i2400m *, const void *, size_t); -void i2400m_msg_to_dev_cancel_wait(struct i2400m *, int); -void i2400m_report_hook(struct i2400m *, const struct i2400m_l3l4_hdr *, - size_t); -void i2400m_report_hook_work(struct work_struct *); -int i2400m_cmd_enter_powersave(struct i2400m *); -int i2400m_cmd_exit_idle(struct i2400m *); -struct sk_buff *i2400m_get_device_info(struct i2400m *); -int i2400m_firmware_check(struct i2400m *); -int i2400m_set_idle_timeout(struct i2400m *, unsigned); - -static inline -struct usb_endpoint_descriptor *usb_get_epd(struct usb_interface *iface, int ep) -{ - return &iface->cur_altsetting->endpoint[ep].desc; -} - -int i2400m_op_rfkill_sw_toggle(struct wimax_dev *, enum wimax_rf_state); -void i2400m_report_tlv_rf_switches_status(struct i2400m *, - const struct i2400m_tlv_rf_switches_status *); - -/* - * Helpers for firmware backwards compatibility - * - * As we aim to support at least the firmware version that was - * released with the previous kernel/driver release, some code will be - * conditionally executed depending on the firmware version. On each - * release, the code to support fw releases past the last two ones - * will be purged. - * - * By making it depend on these macros, it is easier to keep a tab - * on what has to go and what not. - */ -static inline -unsigned i2400m_le_v1_3(struct i2400m *i2400m) -{ - /* running fw is v1.3 or lower */ - return i2400m->fw_version <= 0x00090001; -} - -static inline -unsigned i2400m_ge_v1_4(struct i2400m *i2400m) -{ - /* running fw is v1.4 or higher */ - return i2400m->fw_version >= 0x00090002; -} - - -/* - * Do a millisecond-sleep for allowing wireshark to dump all the data - * packets. Used only for debugging. - */ -static inline -void __i2400m_msleep(unsigned ms) -{ -#if 1 -#else - msleep(ms); -#endif -} - - -/* module initialization helpers */ -int i2400m_barker_db_init(const char *); -void i2400m_barker_db_exit(void); - - - -#endif /* #ifndef __I2400M_H__ */ diff --git a/drivers/net/wimax/i2400m/netdev.c b/drivers/net/wimax/i2400m/netdev.c deleted file mode 100644 index a7fcbceb6e6b..000000000000 --- a/drivers/net/wimax/i2400m/netdev.c +++ /dev/null @@ -1,603 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel Wireless WiMAX Connection 2400m - * Glue with the networking stack - * - * Copyright (C) 2007 Intel Corporation - * Yanir Lubetkin - * Inaky Perez-Gonzalez - * - * This implements an ethernet device for the i2400m. - * - * We fake being an ethernet device to simplify the support from user - * space and from the other side.
The world is (sadly) configured to - * take in only Ethernet devices... - * - * Because of this, when using firmwares <= v1.3, there is a - * copy-each-rxed-packet overhead on the RX path. Each IP packet has - * to be reallocated to add an ethernet header (as there is no space - * in what we get from the device). This is a known drawback and - * firmwares >= 1.4 add header space that can be used to insert the - * ethernet header without having to reallocate and copy. - * - * TX error handling is tricky; because we have to FIFO/queue the - * buffers for transmission (as the hardware likes it aggregated), we - * just give the skb to the TX subsystem and by the time it is - * transmitted, we have long forgotten about it. So we just don't care - * too much about it. - * - * Note that when the device is in idle mode with the basestation, we - * need to negotiate coming back up online. That involves negotiation - * and possible user space interaction. Thus, we defer to a workqueue - * to do all that. By default, we only queue a single packet and drop - * the rest, as potentially the time to go back from idle to normal is - * long. - * - * ROADMAP - * - * i2400m_open Called on ifconfig up - * i2400m_stop Called on ifconfig down - * - * i2400m_hard_start_xmit Called by the network stack to send a packet - * i2400m_net_wake_tx Wake up device from basestation-IDLE & TX - * i2400m_wake_tx_work - * i2400m_cmd_exit_idle - * i2400m_tx - * i2400m_net_tx TX a data frame - * i2400m_tx - * - * i2400m_change_mtu Called on ifconfig mtu XXX - * - * i2400m_tx_timeout Called when the device times out - * - * i2400m_net_rx Called by the RX code when a data frame is - * available (firmware <= 1.3) - * i2400m_net_erx Called by the RX code when a data frame is - * available (firmware >= 1.4). - * i2400m_netdev_setup Called to setup all the netdev stuff from - * alloc_netdev. - */ -#include -#include -#include -#include -#include -#include "i2400m.h" - - -#define D_SUBMODULE netdev -#include "debug-levels.h" - -enum { -/* netdev interface */ - /* 20 secs? yep, this is the maximum timeout that the device - * might take to get out of IDLE / negotiate it with the base - * station. We add 1 sec for good measure. */ - I2400M_TX_TIMEOUT = 21 * HZ, - /* - * Experimentation has determined that 20 is a good value - * for minimizing the jitter in the throughput. - */ - I2400M_TX_QLEN = 20, -}; - - -static -int i2400m_open(struct net_device *net_dev) -{ - int result; - struct i2400m *i2400m = net_dev_to_i2400m(net_dev); - struct device *dev = i2400m_dev(i2400m); - - d_fnstart(3, dev, "(net_dev %p [i2400m %p])\n", net_dev, i2400m); - /* Make sure we wait until init is complete...
*/ - mutex_lock(&i2400m->init_mutex); - if (i2400m->updown) - result = 0; - else - result = -EBUSY; - mutex_unlock(&i2400m->init_mutex); - d_fnend(3, dev, "(net_dev %p [i2400m %p]) = %d\n", - net_dev, i2400m, result); - return result; -} - - -static -int i2400m_stop(struct net_device *net_dev) -{ - struct i2400m *i2400m = net_dev_to_i2400m(net_dev); - struct device *dev = i2400m_dev(i2400m); - - d_fnstart(3, dev, "(net_dev %p [i2400m %p])\n", net_dev, i2400m); - i2400m_net_wake_stop(i2400m); - d_fnend(3, dev, "(net_dev %p [i2400m %p]) = 0\n", net_dev, i2400m); - return 0; -} - - -/* - * Wake up the device and transmit a held SKB, then restart the net queue - * - * When the device goes into basestation-idle mode, we need to tell it - * to exit that mode; it will negotiate with the base station, user - * space may have to intervene to rehandshake crypto and then tell us - * when it is ready to transmit the packet we have "queued". Still we - * need to give it some time after it reports being ok. - * - * On error, there is not much we can do. If the error was on TX, we - * still wake the queue up to see if the next packet will be luckier. - * - * If _cmd_exit_idle() fails...well, it could be many things; most - * commonly it is that something else took the device out of IDLE mode - * (for example, the base station). In that case we get an -EILSEQ and - * we are just going to ignore that one. If the device is back to - * connected, then fine -- if it is some other state, the packet will - * be dropped anyway. - */ -void i2400m_wake_tx_work(struct work_struct *ws) -{ - int result; - struct i2400m *i2400m = container_of(ws, struct i2400m, wake_tx_ws); - struct net_device *net_dev = i2400m->wimax_dev.net_dev; - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *skb; - unsigned long flags; - - spin_lock_irqsave(&i2400m->tx_lock, flags); - skb = i2400m->wake_tx_skb; - i2400m->wake_tx_skb = NULL; - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - - d_fnstart(3, dev, "(ws %p i2400m %p skb %p)\n", ws, i2400m, skb); - result = -EINVAL; - if (skb == NULL) { - dev_err(dev, "WAKE&TX: skb disappeared!\n"); - goto out_put; - } - /* If we have, somehow, lost the connection after this was - * queued, don't do anything; this might be because the device got - * reset or just disconnected. */ - if (unlikely(!netif_carrier_ok(net_dev))) - goto out_kfree; - result = i2400m_cmd_exit_idle(i2400m); - if (result == -EILSEQ) - result = 0; - if (result < 0) { - dev_err(dev, "WAKE&TX: device didn't get out of idle: " - "%d - resetting\n", result); - i2400m_reset(i2400m, I2400M_RT_BUS); - goto error; - } - result = wait_event_timeout(i2400m->state_wq, - i2400m->state != I2400M_SS_IDLE, - net_dev->watchdog_timeo - HZ/2); - if (result == 0) - result = -ETIMEDOUT; - if (result < 0) { - dev_err(dev, "WAKE&TX: error waiting for device to exit IDLE: " - "%d - resetting\n", result); - i2400m_reset(i2400m, I2400M_RT_BUS); - goto error; - } - msleep(20); /* device still needs some time or it drops it */ - result = i2400m_tx(i2400m, skb->data, skb->len, I2400M_PT_DATA); -error: - netif_wake_queue(net_dev); -out_kfree: - kfree_skb(skb); /* refcount transferred by _hard_start_xmit() */ -out_put: - i2400m_put(i2400m); - d_fnend(3, dev, "(ws %p i2400m %p skb %p) = void [%d]\n", - ws, i2400m, skb, result); -} - - -/* - * Prepare the data payload TX header - * - * The i2400m expects a 4 byte header in front of a data packet. - * - * Because we pretend to be an ethernet device, this packet comes with - * an ethernet header.
Pull it and push our header. - */ -static -void i2400m_tx_prep_header(struct sk_buff *skb) -{ - struct i2400m_pl_data_hdr *pl_hdr; - skb_pull(skb, ETH_HLEN); - pl_hdr = skb_push(skb, sizeof(*pl_hdr)); - pl_hdr->reserved = 0; -} - - - -/* - * Cleanup resources acquired during i2400m_net_wake_tx() - * - * This is called by __i2400m_dev_stop and means we have to make sure - * the workqueue is flushed of any pending work. - */ -void i2400m_net_wake_stop(struct i2400m *i2400m) -{ - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *wake_tx_skb; - unsigned long flags; - - d_fnstart(3, dev, "(i2400m %p)\n", i2400m); - /* - * See i2400m_hard_start_xmit(), references are taken there and - * here we release them if the packet was still pending. - */ - cancel_work_sync(&i2400m->wake_tx_ws); - - spin_lock_irqsave(&i2400m->tx_lock, flags); - wake_tx_skb = i2400m->wake_tx_skb; - i2400m->wake_tx_skb = NULL; - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - - if (wake_tx_skb) { - i2400m_put(i2400m); - kfree_skb(wake_tx_skb); - } - - d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); -} - - -/* - * TX an skb to an idle device - * - * When the device is in basestation-idle mode, we need to wake it up - * and then TX. So we queue a work_struct for doing so. - * - * We need to get an extra ref for the skb (so it is not dropped), as - * well as be careful not to queue more than one request (won't help - * at all). If more than one request comes or there are errors, we - * just drop the packets (see i2400m_hard_start_xmit()). - */ -static -int i2400m_net_wake_tx(struct i2400m *i2400m, struct net_device *net_dev, - struct sk_buff *skb) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - unsigned long flags; - - d_fnstart(3, dev, "(skb %p net_dev %p)\n", skb, net_dev); - if (net_ratelimit()) { - d_printf(3, dev, "WAKE&NETTX: " - "skb %p sending %d bytes to radio\n", - skb, skb->len); - d_dump(4, dev, skb->data, skb->len); - } - /* We hold a ref count for i2400m and skb, so when - * stopping() the device, we need to cancel that work - * and if pending, release those resources. */ - result = 0; - spin_lock_irqsave(&i2400m->tx_lock, flags); - if (!i2400m->wake_tx_skb) { - netif_stop_queue(net_dev); - i2400m_get(i2400m); - i2400m->wake_tx_skb = skb_get(skb); /* transfer ref count */ - i2400m_tx_prep_header(skb); - result = schedule_work(&i2400m->wake_tx_ws); - WARN_ON(result == 0); - } - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - if (result == 0) { - /* Yes, this happens even if we stopped the - * queue -- blame the queue disciplines that - * queue without looking -- I guess there is a reason - * for that. */ - if (net_ratelimit()) - d_printf(1, dev, "NETTX: device exiting idle, " - "dropping skb %p, queue running %d\n", - skb, netif_queue_stopped(net_dev)); - result = -EBUSY; - } - d_fnend(3, dev, "(skb %p net_dev %p) = %d\n", skb, net_dev, result); - return result; -} - - -/* - * Transmit a packet to the base station on behalf of the network stack. - * - * Returns: 0 if ok, < 0 errno code on error. - * - * We need to pull the ethernet header and add the hardware header, - * which is currently set to all zeroes and reserved.
- */ -static -int i2400m_net_tx(struct i2400m *i2400m, struct net_device *net_dev, - struct sk_buff *skb) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - - d_fnstart(3, dev, "(i2400m %p net_dev %p skb %p)\n", - i2400m, net_dev, skb); - /* FIXME: check eth hdr, only IPv4 is routed by the device as of now */ - netif_trans_update(net_dev); - i2400m_tx_prep_header(skb); - d_printf(3, dev, "NETTX: skb %p sending %d bytes to radio\n", - skb, skb->len); - d_dump(4, dev, skb->data, skb->len); - result = i2400m_tx(i2400m, skb->data, skb->len, I2400M_PT_DATA); - d_fnend(3, dev, "(i2400m %p net_dev %p skb %p) = %d\n", - i2400m, net_dev, skb, result); - return result; -} - - -/* - * Transmit a packet to the base station on behalf of the network stack - * - * - * Returns: NETDEV_TX_OK (always, even in case of error) - * - * In case of error, we just drop it. Reasons: - * - * - we add a hw header to each skb, and if the network stack - * retries, we have no way to know if that skb has it or not. - * - * - network protocols have their own drop-recovery mechanisms - * - * - there is not much else we can do - * - * If the device is idle, we need to wake it up; that is an operation - * that will sleep. See i2400m_net_wake_tx() for details. - */ -static -netdev_tx_t i2400m_hard_start_xmit(struct sk_buff *skb, - struct net_device *net_dev) -{ - struct i2400m *i2400m = net_dev_to_i2400m(net_dev); - struct device *dev = i2400m_dev(i2400m); - int result = -1; - - d_fnstart(3, dev, "(skb %p net_dev %p)\n", skb, net_dev); - - if (skb_cow_head(skb, 0)) - goto drop; - - if (i2400m->state == I2400M_SS_IDLE) - result = i2400m_net_wake_tx(i2400m, net_dev, skb); - else - result = i2400m_net_tx(i2400m, net_dev, skb); - if (result < 0) { -drop: - net_dev->stats.tx_dropped++; - } else { - net_dev->stats.tx_packets++; - net_dev->stats.tx_bytes += skb->len; - } - dev_kfree_skb(skb); - d_fnend(3, dev, "(skb %p net_dev %p) = %d\n", skb, net_dev, result); - return NETDEV_TX_OK; -} - - -static -void i2400m_tx_timeout(struct net_device *net_dev, unsigned int txqueue) -{ - /* - * We might want to kick the device - * - * There is not much we can do though, as the device requires - * that we send the data aggregated. By the time we receive - * this, there might be data pending to be sent or not... - */ - net_dev->stats.tx_errors++; -} - - -/* - * Create a fake ethernet header - * - * For emulating an ethernet device, every received IP header has to - * be prefixed with an ethernet header. Fake it with the given - * protocol. - */ -static -void i2400m_rx_fake_eth_header(struct net_device *net_dev, - void *_eth_hdr, __be16 protocol) -{ - struct i2400m *i2400m = net_dev_to_i2400m(net_dev); - struct ethhdr *eth_hdr = _eth_hdr; - - memcpy(eth_hdr->h_dest, net_dev->dev_addr, sizeof(eth_hdr->h_dest)); - memcpy(eth_hdr->h_source, i2400m->src_mac_addr, - sizeof(eth_hdr->h_source)); - eth_hdr->h_proto = protocol; -} - - -/* - * i2400m_net_rx - pass a network packet to the stack - * - * @i2400m: device instance - * @skb_rx: the skb where the buffer pointed to by @buf is - * @i: 1 if payload is the only one - * @buf: pointer to the buffer containing the data - * @buf_len: buffer's length - * - * This is only used now for the v1.3 firmware. It will be deprecated - * in >= 2.6.31. - * - * Note that due to firmware limitations, we don't have space to add - * an ethernet header, so we need to copy each packet. Firmware - * versions >= v1.4 fix this [see i2400m_net_erx()].
- * - * We just clone the skb and set it up so that its skb->data pointer - * points to "buf" and its length. - * - * Note that if the payload is the last (or the only one) in a - * multi-payload message, we don't clone the SKB but just reuse it. - * - * This function is normally run from a thread context. However, we - * still use netif_rx() instead of netif_receive_skb() as was - * recommended in the mailing list. The reason is that in some stress - * tests, when sending/receiving a lot of data, we seem to hit a - * softlock in the kernel's TCP implementation [around - * tcp_delay_timer()]. Using netif_rx() took care of the issue. - * - * This is, of course, still open to more research on why running - * with netif_receive_skb() hits this softlock. FIXME. - * - * FIXME: currently we don't make any effort to distinguish if what - * we got was an IPv4 or IPv6 header, to set up the protocol field - * correctly. - */ -void i2400m_net_rx(struct i2400m *i2400m, struct sk_buff *skb_rx, - unsigned i, const void *buf, int buf_len) -{ - struct net_device *net_dev = i2400m->wimax_dev.net_dev; - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *skb; - - d_fnstart(2, dev, "(i2400m %p buf %p buf_len %d)\n", - i2400m, buf, buf_len); - if (i) { - skb = skb_get(skb_rx); - d_printf(2, dev, "RX: reusing first payload skb %p\n", skb); - skb_pull(skb, buf - (void *) skb->data); - skb_trim(skb, (void *) skb_end_pointer(skb) - buf); - } else { - /* Yes, this is bad -- a lot of overhead -- see - * comments at the top of the file */ - skb = __netdev_alloc_skb(net_dev, buf_len, GFP_KERNEL); - if (skb == NULL) { - dev_err(dev, "NETRX: no memory to realloc skb\n"); - net_dev->stats.rx_dropped++; - goto error_skb_realloc; - } - skb_put_data(skb, buf, buf_len); - } - i2400m_rx_fake_eth_header(i2400m->wimax_dev.net_dev, - skb->data - ETH_HLEN, - cpu_to_be16(ETH_P_IP)); - skb_set_mac_header(skb, -ETH_HLEN); - skb->dev = i2400m->wimax_dev.net_dev; - skb->protocol = htons(ETH_P_IP); - net_dev->stats.rx_packets++; - net_dev->stats.rx_bytes += buf_len; - d_printf(3, dev, "NETRX: receiving %d bytes to network stack\n", - buf_len); - d_dump(4, dev, buf, buf_len); - netif_rx_ni(skb); /* see notes in function header */ -error_skb_realloc: - d_fnend(2, dev, "(i2400m %p buf %p buf_len %d) = void\n", - i2400m, buf, buf_len); -} - - -/* - * i2400m_net_erx - pass a network packet to the stack (extended version) - * - * @i2400m: device descriptor - * @skb: the skb where the packet is - the skb should be set to point - * at the IP packet; this function will add ethernet headers if - * needed. - * @cs: packet type - * - * This is only used now for firmware >= v1.4. Note it is quite - * similar to i2400m_net_rx() (used only for v1.3 firmware). - * - * This function is normally run from a thread context. However, we - * still use netif_rx() instead of netif_receive_skb() as was - * recommended in the mailing list. The reason is that in some stress - * tests, when sending/receiving a lot of data, we seem to hit a - * softlock in the kernel's TCP implementation [around - * tcp_delay_timer()]. Using netif_rx() took care of the issue. - * - * This is, of course, still open to more research on why running - * with netif_receive_skb() hits this softlock. FIXME.
- */ -void i2400m_net_erx(struct i2400m *i2400m, struct sk_buff *skb, - enum i2400m_cs cs) -{ - struct net_device *net_dev = i2400m->wimax_dev.net_dev; - struct device *dev = i2400m_dev(i2400m); - - d_fnstart(2, dev, "(i2400m %p skb %p [%u] cs %d)\n", - i2400m, skb, skb->len, cs); - switch(cs) { - case I2400M_CS_IPV4_0: - case I2400M_CS_IPV4: - i2400m_rx_fake_eth_header(i2400m->wimax_dev.net_dev, - skb->data - ETH_HLEN, - cpu_to_be16(ETH_P_IP)); - skb_set_mac_header(skb, -ETH_HLEN); - skb->dev = i2400m->wimax_dev.net_dev; - skb->protocol = htons(ETH_P_IP); - net_dev->stats.rx_packets++; - net_dev->stats.rx_bytes += skb->len; - break; - default: - dev_err(dev, "ERX: BUG? CS type %u unsupported\n", cs); - goto error; - - } - d_printf(3, dev, "ERX: receiving %d bytes to the network stack\n", - skb->len); - d_dump(4, dev, skb->data, skb->len); - netif_rx_ni(skb); /* see notes in function header */ -error: - d_fnend(2, dev, "(i2400m %p skb %p [%u] cs %d) = void\n", - i2400m, skb, skb->len, cs); -} - -static const struct net_device_ops i2400m_netdev_ops = { - .ndo_open = i2400m_open, - .ndo_stop = i2400m_stop, - .ndo_start_xmit = i2400m_hard_start_xmit, - .ndo_tx_timeout = i2400m_tx_timeout, -}; - -static void i2400m_get_drvinfo(struct net_device *net_dev, - struct ethtool_drvinfo *info) -{ - struct i2400m *i2400m = net_dev_to_i2400m(net_dev); - - strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); - strlcpy(info->fw_version, i2400m->fw_name ? : "", - sizeof(info->fw_version)); - if (net_dev->dev.parent) - strlcpy(info->bus_info, dev_name(net_dev->dev.parent), - sizeof(info->bus_info)); -} - -static const struct ethtool_ops i2400m_ethtool_ops = { - .get_drvinfo = i2400m_get_drvinfo, - .get_link = ethtool_op_get_link, -}; - -/** - * i2400m_netdev_setup - Setup @net_dev's i2400m private data - * - * Called by alloc_netdev() - */ -void i2400m_netdev_setup(struct net_device *net_dev) -{ - d_fnstart(3, NULL, "(net_dev %p)\n", net_dev); - ether_setup(net_dev); - net_dev->mtu = I2400M_MAX_MTU; - net_dev->min_mtu = 0; - net_dev->max_mtu = I2400M_MAX_MTU; - net_dev->tx_queue_len = I2400M_TX_QLEN; - net_dev->features = - NETIF_F_VLAN_CHALLENGED - | NETIF_F_HIGHDMA; - net_dev->flags = - IFF_NOARP /* i2400m is a pure IP device */ - & (~IFF_BROADCAST /* i2400m is P2P */ - & ~IFF_MULTICAST); - net_dev->watchdog_timeo = I2400M_TX_TIMEOUT; - net_dev->netdev_ops = &i2400m_netdev_ops; - net_dev->ethtool_ops = &i2400m_ethtool_ops; - d_fnend(3, NULL, "(net_dev %p) = void\n", net_dev); -} -EXPORT_SYMBOL_GPL(i2400m_netdev_setup); - diff --git a/drivers/net/wimax/i2400m/op-rfkill.c b/drivers/net/wimax/i2400m/op-rfkill.c deleted file mode 100644 index 5c79f052cad2..000000000000 --- a/drivers/net/wimax/i2400m/op-rfkill.c +++ /dev/null @@ -1,196 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel Wireless WiMAX Connection 2400m - * Implement backend for the WiMAX stack rfkill support - * - * Copyright (C) 2007-2008 Intel Corporation - * Inaky Perez-Gonzalez - * - * The WiMAX kernel stack integrates into RF-Kill and keeps the - * switches' status. We just need to: - * - * - report changes in the HW RF Kill switch [with - * wimax_rfkill_{sw,hw}_report(), which happens when we detect those - * indications coming through hardware reports]. We also do it on - * initialization to let the stack know the initial HW state. - * - * - implement indications from the stack to change the SW RF Kill - * switch (coming from sysfs, the wimax stack or user space).
- */ -#include "i2400m.h" -#include -#include - - - -#define D_SUBMODULE rfkill -#include "debug-levels.h" - -/* - * Return true if the i2400m radio is in the requested wimax_rf_state state - * - */ -static -int i2400m_radio_is(struct i2400m *i2400m, enum wimax_rf_state state) -{ - if (state == WIMAX_RF_OFF) - return i2400m->state == I2400M_SS_RF_OFF - || i2400m->state == I2400M_SS_RF_SHUTDOWN; - else if (state == WIMAX_RF_ON) - /* state == WIMAX_RF_ON */ - return i2400m->state != I2400M_SS_RF_OFF - && i2400m->state != I2400M_SS_RF_SHUTDOWN; - else { - BUG(); - return -EINVAL; /* shut gcc warnings on certain arches */ - } -} - - -/* - * WiMAX stack operation: implement SW RFKill toggling - * - * @wimax_dev: device descriptor - * @skb: skb where the message has been received; skb->data is - * expected to point to the message payload. - * @genl_info: passed by the generic netlink layer - * - * Generic Netlink will call this function when a message is sent from - * userspace to change the software RF-Kill switch status. - * - * This function will set the device's software RF-Kill switch state to - * match what is requested. - * - * NOTE: the i2400m has a strict state machine; we can only set the - * RF-Kill switch when it is on, the HW RF-Kill is on and the - * device is initialized. So we ignore errors steaming from not - * being in the right state (-EILSEQ). - */ -int i2400m_op_rfkill_sw_toggle(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - int result; - struct i2400m *i2400m = wimax_dev_to_i2400m(wimax_dev); - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *ack_skb; - struct { - struct i2400m_l3l4_hdr hdr; - struct i2400m_tlv_rf_operation sw_rf; - } __packed *cmd; - char strerr[32]; - - d_fnstart(4, dev, "(wimax_dev %p state %d)\n", wimax_dev, state); - - result = -ENOMEM; - cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); - if (cmd == NULL) - goto error_alloc; - cmd->hdr.type = cpu_to_le16(I2400M_MT_CMD_RF_CONTROL); - cmd->hdr.length = sizeof(cmd->sw_rf); - cmd->hdr.version = cpu_to_le16(I2400M_L3L4_VERSION); - cmd->sw_rf.hdr.type = cpu_to_le16(I2400M_TLV_RF_OPERATION); - cmd->sw_rf.hdr.length = cpu_to_le16(sizeof(cmd->sw_rf.status)); - switch (state) { - case WIMAX_RF_OFF: /* RFKILL ON, radio OFF */ - cmd->sw_rf.status = cpu_to_le32(2); - break; - case WIMAX_RF_ON: /* RFKILL OFF, radio ON */ - cmd->sw_rf.status = cpu_to_le32(1); - break; - default: - BUG(); - } - - ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd)); - result = PTR_ERR(ack_skb); - if (IS_ERR(ack_skb)) { - dev_err(dev, "Failed to issue 'RF Control' command: %d\n", - result); - goto error_msg_to_dev; - } - result = i2400m_msg_check_status(wimax_msg_data(ack_skb), - strerr, sizeof(strerr)); - if (result < 0) { - dev_err(dev, "'RF Control' (0x%04x) command failed: %d - %s\n", - I2400M_MT_CMD_RF_CONTROL, result, strerr); - goto error_cmd; - } - - /* Now we wait for the state to change to RADIO_OFF or RADIO_ON */ - result = wait_event_timeout( - i2400m->state_wq, i2400m_radio_is(i2400m, state), - 5 * HZ); - if (result == 0) - result = -ETIMEDOUT; - if (result < 0) - dev_err(dev, "Error waiting for device to toggle RF state: " - "%d\n", result); - result = 0; -error_cmd: - kfree_skb(ack_skb); -error_msg_to_dev: -error_alloc: - d_fnend(4, dev, "(wimax_dev %p state %d) = %d\n", - wimax_dev, state, result); - kfree(cmd); - return result; -} - - -/* - * Inform the WiMAX stack of changes in the RF Kill switches reported - * by the device - * - * @i2400m: device descriptor - * @rfss: TLV for RF Switches status; 
already validated - * - * NOTE: the reports on RF switch status cannot be trusted - * or used until the device is in a state of RADIO_OFF - * or greater. - */ -void i2400m_report_tlv_rf_switches_status( - struct i2400m *i2400m, - const struct i2400m_tlv_rf_switches_status *rfss) -{ - struct device *dev = i2400m_dev(i2400m); - enum i2400m_rf_switch_status hw, sw; - enum wimax_st wimax_state; - - sw = le32_to_cpu(rfss->sw_rf_switch); - hw = le32_to_cpu(rfss->hw_rf_switch); - - d_fnstart(3, dev, "(i2400m %p rfss %p [hw %u sw %u])\n", - i2400m, rfss, hw, sw); - /* We only process RF switch events when the device has been - * fully initialized */ - wimax_state = wimax_state_get(&i2400m->wimax_dev); - if (wimax_state < WIMAX_ST_RADIO_OFF) { - d_printf(3, dev, "ignoring RF switches report, state %u\n", - wimax_state); - goto out; - } - switch (sw) { - case I2400M_RF_SWITCH_ON: /* RF Kill disabled (radio on) */ - wimax_report_rfkill_sw(&i2400m->wimax_dev, WIMAX_RF_ON); - break; - case I2400M_RF_SWITCH_OFF: /* RF Kill enabled (radio off) */ - wimax_report_rfkill_sw(&i2400m->wimax_dev, WIMAX_RF_OFF); - break; - default: - dev_err(dev, "HW BUG? Unknown RF SW state 0x%x\n", sw); - } - - switch (hw) { - case I2400M_RF_SWITCH_ON: /* RF Kill disabled (radio on) */ - wimax_report_rfkill_hw(&i2400m->wimax_dev, WIMAX_RF_ON); - break; - case I2400M_RF_SWITCH_OFF: /* RF Kill enabled (radio off) */ - wimax_report_rfkill_hw(&i2400m->wimax_dev, WIMAX_RF_OFF); - break; - default: - dev_err(dev, "HW BUG? Unknown RF HW state 0x%x\n", hw); - } -out: - d_fnend(3, dev, "(i2400m %p rfss %p [hw %u sw %u]) = void\n", - i2400m, rfss, hw, sw); -} diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c deleted file mode 100644 index c9fb619a9e01..000000000000 --- a/drivers/net/wimax/i2400m/rx.c +++ /dev/null @@ -1,1395 +0,0 @@ -/* - * Intel Wireless WiMAX Connection 2400m - * Handle incoming traffic and deliver it to the control or data planes - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * Intel Corporation - * Yanir Lubetkin - * - Initial implementation - * Inaky Perez-Gonzalez - * - Use skb_clone(), break up processing in chunks - * - Split transport/device specific - * - Make buffer size dynamic to exert less memory pressure - * - RX reorder support - * - * This handles the RX path. - * - * We receive an RX message from the bus-specific driver, which - * contains one or more payloads that have potentially different - * destinations (data or control paths). - * - * So we just take that payload from the transport-specific code in - * the form of an skb, break it up in chunks (a cloned skb each in the - * case of network packets) and pass it to netdev or to the - * command/ack handler (and from there to the WiMAX stack). - * - * PROTOCOL FORMAT - * - * The format of the buffer is: - * - * HEADER (struct i2400m_msg_hdr) - * PAYLOAD DESCRIPTOR 0 (struct i2400m_pld) - * PAYLOAD DESCRIPTOR 1 - * ... - * PAYLOAD DESCRIPTOR N - * PAYLOAD 0 (raw bytes) - * PAYLOAD 1 - * ... - * PAYLOAD N - * - * See tx.c for a deeper description of alignment requirements and - * other fun facts. - * - * DATA PACKETS - * - * In firmwares <= v1.3, data packets have no header for RX, but they - * do for TX (currently unused). - * - * In firmware >= 1.4, RX packets have an extended header (16 - * bytes). This header conveys information for management of host - * reordering of packets (the device offloads storage of the packets - * for reordering to the host). Read below for more information. - * - * The header is used as dummy space to emulate an ethernet header and - * thus be able to act as an ethernet device without having to reallocate. - * - * DATA RX REORDERING - * - * Starting in firmware v1.4, the device can deliver packets for - * delivery with special reordering information; this allows it to - * more effectively do packet management when some frames were lost in - * the radio traffic. - * - * Thus, for RX packets that come out of order, the device gives the - * driver enough information to queue them properly and then at some - * point, the signal to deliver the whole (or part) of the queued - * packets to the networking stack. There are 16 such queues. - * - * This only happens when a packet comes in with the "need reorder" - * flag set in the RX header. When that bit is set, the following - * operations might be indicated: - * - * - reset queue: send all queued packets to the OS - * - * - queue: queue a packet - * - * - update ws: update the queue's window start and deliver queued - * packets that meet the criteria - * - * - queue & update ws: queue a packet, update the window start and - * deliver queued packets that meet the criteria - * - * (delivery criteria: the packet's [normalized] sequence number is - * lower than the new [normalized] window start). - * - * See the i2400m_roq_*() functions for details.
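The delivery criteria above boil down to a small piece of modular arithmetic; here is a self-contained sketch (the function name is made up, but the computation mirrors __i2400m_roq_nsn() later in this file):

/* nsn = (sn - ws) % 2048, kept positive */
static unsigned example_roq_nsn(unsigned ws, unsigned sn)
{
	int r = ((int) sn - (int) ws) % 2048;
	if (r < 0)
		r += 2048;	/* C's % takes the dividend's sign */
	return r;
}

/*
 * Example: with ws = 2040 and a wrapped-around sn = 5, we get
 * (5 - 2040) % 2048 = -2035, thus nsn = 13: the packet sorts 13
 * slots past the window start and stays queued until a window
 * start update moves ws past it.
 */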
- * - * ROADMAP - * - * i2400m_rx - * i2400m_rx_msg_hdr_check - * i2400m_rx_pl_descr_check - * i2400m_rx_payload - * i2400m_net_rx - * i2400m_rx_edata - * i2400m_net_erx - * i2400m_roq_reset - * i2400m_net_erx - * i2400m_roq_queue - * __i2400m_roq_queue - * i2400m_roq_update_ws - * __i2400m_roq_update_ws - * i2400m_net_erx - * i2400m_roq_queue_update_ws - * __i2400m_roq_queue - * __i2400m_roq_update_ws - * i2400m_net_erx - * i2400m_rx_ctl - * i2400m_msg_size_check - * i2400m_report_hook_work [in a workqueue] - * i2400m_report_hook - * wimax_msg_to_user - * i2400m_rx_ctl_ack - * wimax_msg_to_user_alloc - * i2400m_rx_trace - * i2400m_msg_size_check - * wimax_msg - */ -#include -#include -#include -#include -#include -#include -#include -#include "i2400m.h" - - -#define D_SUBMODULE rx -#include "debug-levels.h" - -static int i2400m_rx_reorder_disabled; /* 0 (rx reorder enabled) by default */ -module_param_named(rx_reorder_disabled, i2400m_rx_reorder_disabled, int, 0644); -MODULE_PARM_DESC(rx_reorder_disabled, - "If true, RX reordering will be disabled."); - -struct i2400m_report_hook_args { - struct sk_buff *skb_rx; - const struct i2400m_l3l4_hdr *l3l4_hdr; - size_t size; - struct list_head list_node; -}; - - -/* - * Execute i2400m_report_hook in a workqueue - * - * Goes over the list of queued reports in i2400m->rx_reports and - * processes them. - * - * NOTE: refcounts on i2400m are not needed because we flush the - * workqueue this runs on (i2400m->work_queue) before destroying - * i2400m. - */ -void i2400m_report_hook_work(struct work_struct *ws) -{ - struct i2400m *i2400m = container_of(ws, struct i2400m, rx_report_ws); - struct device *dev = i2400m_dev(i2400m); - struct i2400m_report_hook_args *args, *args_next; - LIST_HEAD(list); - unsigned long flags; - - while (1) { - spin_lock_irqsave(&i2400m->rx_lock, flags); - list_splice_init(&i2400m->rx_reports, &list); - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - if (list_empty(&list)) - break; - else - d_printf(1, dev, "processing queued reports\n"); - list_for_each_entry_safe(args, args_next, &list, list_node) { - d_printf(2, dev, "processing queued report %p\n", args); - i2400m_report_hook(i2400m, args->l3l4_hdr, args->size); - kfree_skb(args->skb_rx); - list_del(&args->list_node); - kfree(args); - } - } -} - - -/* - * Flush the list of queued reports - */ -static -void i2400m_report_hook_flush(struct i2400m *i2400m) -{ - struct device *dev = i2400m_dev(i2400m); - struct i2400m_report_hook_args *args, *args_next; - LIST_HEAD(list); - unsigned long flags; - - d_printf(1, dev, "flushing queued reports\n"); - spin_lock_irqsave(&i2400m->rx_lock, flags); - list_splice_init(&i2400m->rx_reports, &list); - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - list_for_each_entry_safe(args, args_next, &list, list_node) { - d_printf(2, dev, "flushing queued report %p\n", args); - kfree_skb(args->skb_rx); - list_del(&args->list_node); - kfree(args); - } -} - - -/* - * Queue a report for later processing - * - * @i2400m: device descriptor - * @skb_rx: skb that contains the payload (for reference counting) - * @l3l4_hdr: pointer to the control - * @size: size of the message - */ -static -void i2400m_report_hook_queue(struct i2400m *i2400m, struct sk_buff *skb_rx, - const void *l3l4_hdr, size_t size) -{ - struct device *dev = i2400m_dev(i2400m); - unsigned long flags; - struct i2400m_report_hook_args *args; - - args = kzalloc(sizeof(*args), GFP_NOIO); - if (args) { - args->skb_rx = skb_get(skb_rx); - args->l3l4_hdr = l3l4_hdr; - args->size = size; - 
spin_lock_irqsave(&i2400m->rx_lock, flags); - list_add_tail(&args->list_node, &i2400m->rx_reports); - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - d_printf(2, dev, "queued report %p\n", args); - rmb(); /* see i2400m->ready's documentation */ - if (likely(i2400m->ready)) /* only send if up */ - queue_work(i2400m->work_queue, &i2400m->rx_report_ws); - } else { - if (printk_ratelimit()) - dev_err(dev, "%s:%u: Can't allocate %zu B\n", - __func__, __LINE__, sizeof(*args)); - } -} - - -/* - * Process an ack to a command - * - * @i2400m: device descriptor - * @payload: pointer to message - * @size: size of the message - * - * Pass the acknodledgment (in an skb) to the thread that is waiting - * for it in i2400m->msg_completion. - * - * We need to coordinate properly with the thread waiting for the - * ack. Check if it is waiting or if it is gone. We loose the spinlock - * to avoid allocating on atomic contexts (yeah, could use GFP_ATOMIC, - * but this is not so speed critical). - */ -static -void i2400m_rx_ctl_ack(struct i2400m *i2400m, - const void *payload, size_t size) -{ - struct device *dev = i2400m_dev(i2400m); - struct wimax_dev *wimax_dev = &i2400m->wimax_dev; - unsigned long flags; - struct sk_buff *ack_skb; - - /* Anyone waiting for an answer? */ - spin_lock_irqsave(&i2400m->rx_lock, flags); - if (i2400m->ack_skb != ERR_PTR(-EINPROGRESS)) { - dev_err(dev, "Huh? reply to command with no waiters\n"); - goto error_no_waiter; - } - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - - ack_skb = wimax_msg_alloc(wimax_dev, NULL, payload, size, GFP_KERNEL); - - /* Check waiter didn't time out waiting for the answer... */ - spin_lock_irqsave(&i2400m->rx_lock, flags); - if (i2400m->ack_skb != ERR_PTR(-EINPROGRESS)) { - d_printf(1, dev, "Huh? waiter for command reply cancelled\n"); - goto error_waiter_cancelled; - } - if (IS_ERR(ack_skb)) - dev_err(dev, "CMD/GET/SET ack: cannot allocate SKB\n"); - i2400m->ack_skb = ack_skb; - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - complete(&i2400m->msg_completion); - return; - -error_waiter_cancelled: - if (!IS_ERR(ack_skb)) - kfree_skb(ack_skb); -error_no_waiter: - spin_unlock_irqrestore(&i2400m->rx_lock, flags); -} - - -/* - * Receive and process a control payload - * - * @i2400m: device descriptor - * @skb_rx: skb that contains the payload (for reference counting) - * @payload: pointer to message - * @size: size of the message - * - * There are two types of control RX messages: reports (asynchronous, - * like your every day interrupts) and 'acks' (reponses to a command, - * get or set request). - * - * If it is a report, we run hooks on it (to extract information for - * things we need to do in the driver) and then pass it over to the - * WiMAX stack to send it to user space. - * - * NOTE: report processing is done in a workqueue specific to the - * generic driver, to avoid deadlocks in the system. - * - * If it is not a report, it is an ack to a previously executed - * command, set or get, so wake up whoever is waiting for it from - * i2400m_msg_to_dev(). i2400m_rx_ctl_ack() takes care of that. - * - * Note that the sizes we pass to other functions from here are the - * sizes of the _l3l4_hdr + payload, not full buffer sizes, as we have - * verified in _msg_size_check() that they are congruent. - * - * For reports: We can't clone the original skb where the data is - * because we need to send this up via netlink; netlink has to add - * headers and we can't overwrite what's preceding the payload...as - * it is another message. 
So we just dup them. - */ -static -void i2400m_rx_ctl(struct i2400m *i2400m, struct sk_buff *skb_rx, - const void *payload, size_t size) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - const struct i2400m_l3l4_hdr *l3l4_hdr = payload; - unsigned msg_type; - - result = i2400m_msg_size_check(i2400m, l3l4_hdr, size); - if (result < 0) { - dev_err(dev, "HW BUG? device sent a bad message: %d\n", - result); - goto error_check; - } - msg_type = le16_to_cpu(l3l4_hdr->type); - d_printf(1, dev, "%s 0x%04x: %zu bytes\n", - msg_type & I2400M_MT_REPORT_MASK ? "REPORT" : "CMD/SET/GET", - msg_type, size); - d_dump(2, dev, l3l4_hdr, size); - if (msg_type & I2400M_MT_REPORT_MASK) { - /* - * Process each report - * - * - has to be run serialized as well - * - * - the handling might force the execution of - * commands. That might cause reentrancy issues with - * bus-specific subdrivers and workqueues, so we - * run it in a separate workqueue. - * - * - when the driver is not yet ready to handle them, - * they are queued and at some point the queue is - * restarted [NOTE: we can't queue SKBs directly, as - * this might be a piece of an SKB, not the whole - * thing, and this is cheaper than cloning the - * SKB]. - * - * Note we don't do refcounting for the device - * structure; this is because before destroying - * 'i2400m', we make sure to flush the - * i2400m->work_queue, so there are no issues. - */ - i2400m_report_hook_queue(i2400m, skb_rx, l3l4_hdr, size); - if (unlikely(i2400m->trace_msg_from_user)) - wimax_msg(&i2400m->wimax_dev, "echo", - l3l4_hdr, size, GFP_KERNEL); - result = wimax_msg(&i2400m->wimax_dev, NULL, l3l4_hdr, size, - GFP_KERNEL); - if (result < 0) - dev_err(dev, "error sending report to userspace: %d\n", - result); - } else /* an ack to a CMD, GET or SET */ - i2400m_rx_ctl_ack(i2400m, payload, size); -error_check: - return; -} - - -/* - * Receive and send up a trace - * - * @i2400m: device descriptor - * @skb_rx: skb that contains the trace (for reference counting) - * @payload: pointer to trace message inside the skb - * @size: size of the message - * - * The i2400m might produce trace information (diagnostics) and we - * send them through a different kernel-to-user pipe (to avoid - * clogging it). - * - * As in i2400m_rx_ctl(), we can't clone the original skb where the - * data is because we need to send this up via netlink; netlink has to - * add headers and we can't overwrite what's preceding the - * payload...as it is another message. So we just dup them. - */ -static -void i2400m_rx_trace(struct i2400m *i2400m, - const void *payload, size_t size) -{ - int result; - struct device *dev = i2400m_dev(i2400m); - struct wimax_dev *wimax_dev = &i2400m->wimax_dev; - const struct i2400m_l3l4_hdr *l3l4_hdr = payload; - unsigned msg_type; - - result = i2400m_msg_size_check(i2400m, l3l4_hdr, size); - if (result < 0) { - dev_err(dev, "HW BUG? device sent a bad trace message: %d\n", - result); - goto error_check; - } - msg_type = le16_to_cpu(l3l4_hdr->type); - d_printf(1, dev, "Trace %s 0x%04x: %zu bytes\n", - msg_type & I2400M_MT_REPORT_MASK ? "REPORT" : "CMD/SET/GET", - msg_type, size); - d_dump(2, dev, l3l4_hdr, size); - result = wimax_msg(wimax_dev, "trace", l3l4_hdr, size, GFP_KERNEL); - if (result < 0) - dev_err(dev, "error sending trace to userspace: %d\n", - result); -error_check: - return; -} - - -/* - * Reorder queue data stored on skb->cb while the skb is queued in the - * reorder queues.
- */ -struct i2400m_roq_data { - unsigned sn; /* Serial number for the skb */ - enum i2400m_cs cs; /* packet type for the skb */ -}; - - -/* - * ReOrder Queue - * - * @ws: Window Start; sequence number where the current window start - * is for this queue - * @queue: the skb queue itself - * @log: circular ring buffer used to log information about the - * reorder process in this queue that can be displayed in case of - * error to help diagnose it. - * - * This is the head for a list of skbs. In the skb->cb member of the - * skb when queued here contains a 'struct i2400m_roq_data' were we - * store the sequence number (sn) and the cs (packet type) coming from - * the RX payload header from the device. - */ -struct i2400m_roq -{ - unsigned ws; - struct sk_buff_head queue; - struct i2400m_roq_log *log; -}; - - -static -void __i2400m_roq_init(struct i2400m_roq *roq) -{ - roq->ws = 0; - skb_queue_head_init(&roq->queue); -} - - -static -unsigned __i2400m_roq_index(struct i2400m *i2400m, struct i2400m_roq *roq) -{ - return ((unsigned long) roq - (unsigned long) i2400m->rx_roq) - / sizeof(*roq); -} - - -/* - * Normalize a sequence number based on the queue's window start - * - * nsn = (sn - ws) % 2048 - * - * Note that if @sn < @roq->ws, we still need a positive number; %'s - * sign is implementation specific, so we normalize it by adding 2048 - * to bring it to be positive. - */ -static -unsigned __i2400m_roq_nsn(struct i2400m_roq *roq, unsigned sn) -{ - int r; - r = ((int) sn - (int) roq->ws) % 2048; - if (r < 0) - r += 2048; - return r; -} - - -/* - * Circular buffer to keep the last N reorder operations - * - * In case something fails, dumb then to try to come up with what - * happened. - */ -enum { - I2400M_ROQ_LOG_LENGTH = 32, -}; - -struct i2400m_roq_log { - struct i2400m_roq_log_entry { - enum i2400m_ro_type type; - unsigned ws, count, sn, nsn, new_ws; - } entry[I2400M_ROQ_LOG_LENGTH]; - unsigned in, out; -}; - - -/* Print a log entry */ -static -void i2400m_roq_log_entry_print(struct i2400m *i2400m, unsigned index, - unsigned e_index, - struct i2400m_roq_log_entry *e) -{ - struct device *dev = i2400m_dev(i2400m); - - switch(e->type) { - case I2400M_RO_TYPE_RESET: - dev_err(dev, "q#%d reset ws %u cnt %u sn %u/%u" - " - new nws %u\n", - index, e->ws, e->count, e->sn, e->nsn, e->new_ws); - break; - case I2400M_RO_TYPE_PACKET: - dev_err(dev, "q#%d queue ws %u cnt %u sn %u/%u\n", - index, e->ws, e->count, e->sn, e->nsn); - break; - case I2400M_RO_TYPE_WS: - dev_err(dev, "q#%d update_ws ws %u cnt %u sn %u/%u" - " - new nws %u\n", - index, e->ws, e->count, e->sn, e->nsn, e->new_ws); - break; - case I2400M_RO_TYPE_PACKET_WS: - dev_err(dev, "q#%d queue_update_ws ws %u cnt %u sn %u/%u" - " - new nws %u\n", - index, e->ws, e->count, e->sn, e->nsn, e->new_ws); - break; - default: - dev_err(dev, "q#%d BUG? 
entry %u - unknown type %u\n", - index, e_index, e->type); - break; - } -} - - -static -void i2400m_roq_log_add(struct i2400m *i2400m, - struct i2400m_roq *roq, enum i2400m_ro_type type, - unsigned ws, unsigned count, unsigned sn, - unsigned nsn, unsigned new_ws) -{ - struct i2400m_roq_log_entry *e; - unsigned cnt_idx; - int index = __i2400m_roq_index(i2400m, roq); - - /* if we run out of space, we eat from the end */ - if (roq->log->in - roq->log->out == I2400M_ROQ_LOG_LENGTH) - roq->log->out++; - cnt_idx = roq->log->in++ % I2400M_ROQ_LOG_LENGTH; - e = &roq->log->entry[cnt_idx]; - - e->type = type; - e->ws = ws; - e->count = count; - e->sn = sn; - e->nsn = nsn; - e->new_ws = new_ws; - - if (d_test(1)) - i2400m_roq_log_entry_print(i2400m, index, cnt_idx, e); -} - - -/* Dump all the entries in the FIFO and reinitialize it */ -static -void i2400m_roq_log_dump(struct i2400m *i2400m, struct i2400m_roq *roq) -{ - unsigned cnt, cnt_idx; - struct i2400m_roq_log_entry *e; - int index = __i2400m_roq_index(i2400m, roq); - - BUG_ON(roq->log->out > roq->log->in); - for (cnt = roq->log->out; cnt < roq->log->in; cnt++) { - cnt_idx = cnt % I2400M_ROQ_LOG_LENGTH; - e = &roq->log->entry[cnt_idx]; - i2400m_roq_log_entry_print(i2400m, index, cnt_idx, e); - memset(e, 0, sizeof(*e)); - } - roq->log->in = roq->log->out = 0; -} - - -/* - * Backbone for the queuing of an skb (by normalized sequence number) - * - * @i2400m: device descriptor - * @roq: reorder queue where to add - * @skb: the skb to add - * @sn: the sequence number of the skb - * @nsn: the normalized sequence number of the skb (pre-computed by the - * caller from the @sn and @roq->ws). - * - * We try first a couple of quick cases: - * - * - the queue is empty - * - the skb would be appended to the queue - * - * These will be the most common operations. - * - * If these fail, then we have to do a sorted insertion in the queue, - * which is the slowest path. - * - * We don't have to acquire a reference count as we are going to own it. - */ -static -void __i2400m_roq_queue(struct i2400m *i2400m, struct i2400m_roq *roq, - struct sk_buff *skb, unsigned sn, unsigned nsn) -{ - struct device *dev = i2400m_dev(i2400m); - struct sk_buff *skb_itr; - struct i2400m_roq_data *roq_data_itr, *roq_data; - unsigned nsn_itr; - - d_fnstart(4, dev, "(i2400m %p roq %p skb %p sn %u nsn %u)\n", - i2400m, roq, skb, sn, nsn); - - roq_data = (struct i2400m_roq_data *) &skb->cb; - BUILD_BUG_ON(sizeof(*roq_data) > sizeof(skb->cb)); - roq_data->sn = sn; - d_printf(3, dev, "ERX: roq %p [ws %u] nsn %d sn %u\n", - roq, roq->ws, nsn, roq_data->sn); - - /* Queues will be empty on not-so-bad environments, so try - * that first */ - if (skb_queue_empty(&roq->queue)) { - d_printf(2, dev, "ERX: roq %p - first one\n", roq); - __skb_queue_head(&roq->queue, skb); - goto out; - } - /* Now try append, as most of the operations will be that */ - skb_itr = skb_peek_tail(&roq->queue); - roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb; - nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn); - /* NSN bounds assumed correct (checked when it was queued) */ - if (nsn >= nsn_itr) { - d_printf(2, dev, "ERX: roq %p - appended after %p (nsn %d sn %u)\n", - roq, skb_itr, nsn_itr, roq_data_itr->sn); - __skb_queue_tail(&roq->queue, skb); - goto out; - } - /* None of the fast paths option worked. Iterate to find the - * right spot where to insert the packet; we know the queue is - * not empty, so we are not the first ones; we also know we - * are not going to be the last ones. 
The list is sorted, so
- * we have to insert before the first entry with an nsn_itr
- * greater than our nsn. */
-	skb_queue_walk(&roq->queue, skb_itr) {
-		roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb;
-		nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn);
-		/* NSN bounds assumed correct (checked when it was queued) */
-		if (nsn_itr > nsn) {
-			d_printf(2, dev, "ERX: roq %p - queued before %p "
-				 "(nsn %d sn %u)\n", roq, skb_itr, nsn_itr,
-				 roq_data_itr->sn);
-			__skb_queue_before(&roq->queue, skb_itr, skb);
-			goto out;
-		}
-	}
-	/* If we get here, that is VERY bad -- print info to help
-	 * diagnose and crash it */
-	dev_err(dev, "SW BUG? failed to insert packet\n");
-	dev_err(dev, "ERX: roq %p [ws %u] skb %p nsn %d sn %u\n",
-		roq, roq->ws, skb, nsn, roq_data->sn);
-	skb_queue_walk(&roq->queue, skb_itr) {
-		roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb;
-		nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn);
-		/* NSN bounds assumed correct (checked when it was queued) */
-		dev_err(dev, "ERX: roq %p skb_itr %p nsn %d sn %u\n",
-			roq, skb_itr, nsn_itr, roq_data_itr->sn);
-	}
-	BUG();
-out:
-	d_fnend(4, dev, "(i2400m %p roq %p skb %p sn %u nsn %d) = void\n",
-		i2400m, roq, skb, sn, nsn);
-}
-
-
-/*
- * Backbone for the update window start operation
- *
- * @i2400m: device descriptor
- * @roq: Reorder queue
- * @sn: New sequence number
- *
- * Updates the window start of a queue; when doing so, it must deliver
- * to the networking stack all the queued skb's whose normalized
- * sequence number is lower than the new normalized window start.
- */
-static
-unsigned __i2400m_roq_update_ws(struct i2400m *i2400m, struct i2400m_roq *roq,
-				unsigned sn)
-{
-	struct device *dev = i2400m_dev(i2400m);
-	struct sk_buff *skb_itr, *tmp_itr;
-	struct i2400m_roq_data *roq_data_itr;
-	unsigned new_nws, nsn_itr;
-
-	new_nws = __i2400m_roq_nsn(roq, sn);
-	/*
-	 * For type 2 (update_window_start) rx messages, there is no
-	 * need to check if the normalized sequence number is greater
-	 * than 1023. Simply insert and deliver all packets to the
-	 * host up to the window start.
-	 */
-	skb_queue_walk_safe(&roq->queue, skb_itr, tmp_itr) {
-		roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb;
-		nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn);
-		/* NSN bounds assumed correct (checked when it was queued) */
-		if (nsn_itr < new_nws) {
-			d_printf(2, dev, "ERX: roq %p - release skb %p "
-				 "(nsn %u/%u new nws %u)\n",
-				 roq, skb_itr, nsn_itr, roq_data_itr->sn,
-				 new_nws);
-			__skb_unlink(skb_itr, &roq->queue);
-			i2400m_net_erx(i2400m, skb_itr, roq_data_itr->cs);
-		}
-		else
-			break;	/* rest of packets all nsn_itr > nws */
-	}
-	roq->ws = sn;
-	return new_nws;
-}
-
-
-/*
- * Reset a queue
- *
- * @i2400m: device descriptor
- * @roq: reorder queue to reset
- *
- * Deliver all the packets and reset the window-start to zero. Name is
- * kind of misleading.
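 *
 * (Worked example of the normalization done by __i2400m_roq_nsn()
 * above, for illustration: with roq->ws == 2040 and an incoming
 * sn == 5 after the counter wraps at 2048, r = (5 - 2040) % 2048
 * truncates to -2035; being negative, 2048 is added, so nsn == 13 --
 * the packet sits 13 slots past the window start and, being < 1024,
 * falls inside the window.)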
- */
-static
-void i2400m_roq_reset(struct i2400m *i2400m, struct i2400m_roq *roq)
-{
-	struct device *dev = i2400m_dev(i2400m);
-	struct sk_buff *skb_itr, *tmp_itr;
-	struct i2400m_roq_data *roq_data_itr;
-
-	d_fnstart(2, dev, "(i2400m %p roq %p)\n", i2400m, roq);
-	i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_RESET,
-			   roq->ws, skb_queue_len(&roq->queue),
-			   ~0, ~0, 0);
-	skb_queue_walk_safe(&roq->queue, skb_itr, tmp_itr) {
-		roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb;
-		d_printf(2, dev, "ERX: roq %p - release skb %p (sn %u)\n",
-			 roq, skb_itr, roq_data_itr->sn);
-		__skb_unlink(skb_itr, &roq->queue);
-		i2400m_net_erx(i2400m, skb_itr, roq_data_itr->cs);
-	}
-	roq->ws = 0;
-	d_fnend(2, dev, "(i2400m %p roq %p) = void\n", i2400m, roq);
-}
-
-
-/*
- * Queue a packet
- *
- * @i2400m: device descriptor
- * @roq: reorder queue where to add the packet
- * @skb: skb containing the packet data
- * @lbn: Last block number of the packet in @skb
- *
- * The hardware is asking the driver to queue a packet for later
- * delivery to the networking stack.
- */
-static
-void i2400m_roq_queue(struct i2400m *i2400m, struct i2400m_roq *roq,
-		      struct sk_buff *skb, unsigned lbn)
-{
-	struct device *dev = i2400m_dev(i2400m);
-	unsigned nsn, len;
-
-	d_fnstart(2, dev, "(i2400m %p roq %p skb %p lbn %u)\n",
-		  i2400m, roq, skb, lbn);
-	len = skb_queue_len(&roq->queue);
-	nsn = __i2400m_roq_nsn(roq, lbn);
-	if (unlikely(nsn >= 1024)) {
-		dev_err(dev, "SW BUG? queue nsn %d (lbn %u ws %u)\n",
-			nsn, lbn, roq->ws);
-		i2400m_roq_log_dump(i2400m, roq);
-		i2400m_reset(i2400m, I2400M_RT_WARM);
-	} else {
-		__i2400m_roq_queue(i2400m, roq, skb, lbn, nsn);
-		i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_PACKET,
-				   roq->ws, len, lbn, nsn, ~0);
-	}
-	d_fnend(2, dev, "(i2400m %p roq %p skb %p lbn %u) = void\n",
-		i2400m, roq, skb, lbn);
-}
-
-
-/*
- * Update the window start in a reorder queue and deliver all skbs
- * with a lower window start
- *
- * @i2400m: device descriptor
- * @roq: Reorder queue
- * @sn: New sequence number
- */
-static
-void i2400m_roq_update_ws(struct i2400m *i2400m, struct i2400m_roq *roq,
-			  unsigned sn)
-{
-	struct device *dev = i2400m_dev(i2400m);
-	unsigned old_ws, nsn, len;
-
-	d_fnstart(2, dev, "(i2400m %p roq %p sn %u)\n", i2400m, roq, sn);
-	old_ws = roq->ws;
-	len = skb_queue_len(&roq->queue);
-	nsn = __i2400m_roq_update_ws(i2400m, roq, sn);
-	i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_WS,
-			   old_ws, len, sn, nsn, roq->ws);
-	d_fnend(2, dev, "(i2400m %p roq %p sn %u) = void\n", i2400m, roq, sn);
-}
-
-
-/*
- * Queue a packet and update the window start
- *
- * @i2400m: device descriptor
- * @roq: reorder queue where to add the packet
- * @skb: skb containing the packet data
- * @sn: sequence number of the packet in @skb
- *
- * Note that unlike i2400m_roq_update_ws(), which sets the new window
- * start to @sn, in here we'll set it to @sn + 1.
- */
-static
-void i2400m_roq_queue_update_ws(struct i2400m *i2400m, struct i2400m_roq *roq,
-				struct sk_buff *skb, unsigned sn)
-{
-	struct device *dev = i2400m_dev(i2400m);
-	unsigned nsn, old_ws, len;
-
-	d_fnstart(2, dev, "(i2400m %p roq %p skb %p sn %u)\n",
-		  i2400m, roq, skb, sn);
-	len = skb_queue_len(&roq->queue);
-	nsn = __i2400m_roq_nsn(roq, sn);
-	/*
-	 * For type 3 (queue_update_window_start) rx messages, there is
-	 * no need to check if the normalized sequence number is greater
-	 * than 1023. Simply insert and deliver all packets to the host
-	 * up to the window start.
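	 *
	 * Illustrative contrast, assuming an empty queue: a type 3
	 * payload with sn == 7 is handed straight to i2400m_net_erx()
	 * and the window start becomes 8 (sn + 1), whereas a bare
	 * type 2 update with sn == 7 would only have released queued
	 * packets with nsn below 7 and left ws == 7.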
- */ - old_ws = roq->ws; - /* If the queue is empty, don't bother as we'd queue - * it and immediately unqueue it -- just deliver it. - */ - if (len == 0) { - struct i2400m_roq_data *roq_data; - roq_data = (struct i2400m_roq_data *) &skb->cb; - i2400m_net_erx(i2400m, skb, roq_data->cs); - } else - __i2400m_roq_queue(i2400m, roq, skb, sn, nsn); - - __i2400m_roq_update_ws(i2400m, roq, sn + 1); - i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_PACKET_WS, - old_ws, len, sn, nsn, roq->ws); - - d_fnend(2, dev, "(i2400m %p roq %p skb %p sn %u) = void\n", - i2400m, roq, skb, sn); -} - - -/* - * This routine destroys the memory allocated for rx_roq, when no - * other thread is accessing it. Access to rx_roq is refcounted by - * rx_roq_refcount, hence memory allocated must be destroyed when - * rx_roq_refcount becomes zero. This routine gets executed when - * rx_roq_refcount becomes zero. - */ -static void i2400m_rx_roq_destroy(struct kref *ref) -{ - unsigned itr; - struct i2400m *i2400m - = container_of(ref, struct i2400m, rx_roq_refcount); - for (itr = 0; itr < I2400M_RO_CIN + 1; itr++) - __skb_queue_purge(&i2400m->rx_roq[itr].queue); - kfree(i2400m->rx_roq[0].log); - kfree(i2400m->rx_roq); - i2400m->rx_roq = NULL; -} - -/* - * Receive and send up an extended data packet - * - * @i2400m: device descriptor - * @skb_rx: skb that contains the extended data packet - * @single_last: 1 if the payload is the only one or the last one of - * the skb. - * @payload: pointer to the packet's data inside the skb - * @size: size of the payload - * - * Starting in v1.4 of the i2400m's firmware, the device can send data - * packets to the host in an extended format that; this incudes a 16 - * byte header (struct i2400m_pl_edata_hdr). Using this header's space - * we can fake ethernet headers for ethernet device emulation without - * having to copy packets around. - * - * This function handles said path. - * - * - * Receive and send up an extended data packet that requires no reordering - * - * @i2400m: device descriptor - * @skb_rx: skb that contains the extended data packet - * @single_last: 1 if the payload is the only one or the last one of - * the skb. - * @payload: pointer to the packet's data (past the actual extended - * data payload header). - * @size: size of the payload - * - * Pass over to the networking stack a data packet that might have - * reordering requirements. - * - * This needs to the decide if the skb in which the packet is - * contained can be reused or if it needs to be cloned. Then it has to - * be trimmed in the edges so that the beginning is the space for eth - * header and then pass it to i2400m_net_erx() for the stack - * - * Assumes the caller has verified the sanity of the payload (size, - * etc) already. - */ -static -void i2400m_rx_edata(struct i2400m *i2400m, struct sk_buff *skb_rx, - unsigned single_last, const void *payload, size_t size) -{ - struct device *dev = i2400m_dev(i2400m); - const struct i2400m_pl_edata_hdr *hdr = payload; - struct net_device *net_dev = i2400m->wimax_dev.net_dev; - struct sk_buff *skb; - enum i2400m_cs cs; - u32 reorder; - unsigned ro_needed, ro_type, ro_cin, ro_sn; - struct i2400m_roq *roq; - struct i2400m_roq_data *roq_data; - unsigned long flags; - - BUILD_BUG_ON(ETH_HLEN > sizeof(*hdr)); - - d_fnstart(2, dev, "(i2400m %p skb_rx %p single %u payload %p " - "size %zu)\n", i2400m, skb_rx, single_last, payload, size); - if (size < sizeof(*hdr)) { - dev_err(dev, "ERX: HW BUG? 
message with short header (%zu " - "vs %zu bytes expected)\n", size, sizeof(*hdr)); - goto error; - } - - if (single_last) { - skb = skb_get(skb_rx); - d_printf(3, dev, "ERX: skb %p reusing\n", skb); - } else { - skb = skb_clone(skb_rx, GFP_KERNEL); - if (skb == NULL) { - dev_err(dev, "ERX: no memory to clone skb\n"); - net_dev->stats.rx_dropped++; - goto error_skb_clone; - } - d_printf(3, dev, "ERX: skb %p cloned from %p\n", skb, skb_rx); - } - /* now we have to pull and trim so that the skb points to the - * beginning of the IP packet; the netdev part will add the - * ethernet header as needed - we know there is enough space - * because we checked in i2400m_rx_edata(). */ - skb_pull(skb, payload + sizeof(*hdr) - (void *) skb->data); - skb_trim(skb, (void *) skb_end_pointer(skb) - payload - sizeof(*hdr)); - - reorder = le32_to_cpu(hdr->reorder); - ro_needed = reorder & I2400M_RO_NEEDED; - cs = hdr->cs; - if (ro_needed) { - ro_type = (reorder >> I2400M_RO_TYPE_SHIFT) & I2400M_RO_TYPE; - ro_cin = (reorder >> I2400M_RO_CIN_SHIFT) & I2400M_RO_CIN; - ro_sn = (reorder >> I2400M_RO_SN_SHIFT) & I2400M_RO_SN; - - spin_lock_irqsave(&i2400m->rx_lock, flags); - if (i2400m->rx_roq == NULL) { - kfree_skb(skb); /* rx_roq is already destroyed */ - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - goto error; - } - roq = &i2400m->rx_roq[ro_cin]; - kref_get(&i2400m->rx_roq_refcount); - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - - roq_data = (struct i2400m_roq_data *) &skb->cb; - roq_data->sn = ro_sn; - roq_data->cs = cs; - d_printf(2, dev, "ERX: reorder needed: " - "type %u cin %u [ws %u] sn %u/%u len %zuB\n", - ro_type, ro_cin, roq->ws, ro_sn, - __i2400m_roq_nsn(roq, ro_sn), size); - d_dump(2, dev, payload, size); - switch(ro_type) { - case I2400M_RO_TYPE_RESET: - i2400m_roq_reset(i2400m, roq); - kfree_skb(skb); /* no data here */ - break; - case I2400M_RO_TYPE_PACKET: - i2400m_roq_queue(i2400m, roq, skb, ro_sn); - break; - case I2400M_RO_TYPE_WS: - i2400m_roq_update_ws(i2400m, roq, ro_sn); - kfree_skb(skb); /* no data here */ - break; - case I2400M_RO_TYPE_PACKET_WS: - i2400m_roq_queue_update_ws(i2400m, roq, skb, ro_sn); - break; - default: - dev_err(dev, "HW BUG? unknown reorder type %u\n", ro_type); - } - - spin_lock_irqsave(&i2400m->rx_lock, flags); - kref_put(&i2400m->rx_roq_refcount, i2400m_rx_roq_destroy); - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - } - else - i2400m_net_erx(i2400m, skb, cs); -error_skb_clone: -error: - d_fnend(2, dev, "(i2400m %p skb_rx %p single %u payload %p " - "size %zu) = void\n", i2400m, skb_rx, single_last, payload, size); -} - - -/* - * Act on a received payload - * - * @i2400m: device instance - * @skb_rx: skb where the transaction was received - * @single_last: 1 this is the only payload or the last one (so the - * skb can be reused instead of cloned). - * @pld: payload descriptor - * @payload: payload data - * - * Upon reception of a payload, look at its guts in the payload - * descriptor and decide what to do with it. If it is a single payload - * skb or if the last skb is a data packet, the skb will be referenced - * and modified (so it doesn't have to be cloned). 
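 *
 * For reference, this is how the caller computes @single_last (the
 * exact expression used by i2400m_rx() further down):
 *
 *	single_last = num_pls == 1 || i == num_pls - 1;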
- */ -static -void i2400m_rx_payload(struct i2400m *i2400m, struct sk_buff *skb_rx, - unsigned single_last, const struct i2400m_pld *pld, - const void *payload) -{ - struct device *dev = i2400m_dev(i2400m); - size_t pl_size = i2400m_pld_size(pld); - enum i2400m_pt pl_type = i2400m_pld_type(pld); - - d_printf(7, dev, "RX: received payload type %u, %zu bytes\n", - pl_type, pl_size); - d_dump(8, dev, payload, pl_size); - - switch (pl_type) { - case I2400M_PT_DATA: - d_printf(3, dev, "RX: data payload %zu bytes\n", pl_size); - i2400m_net_rx(i2400m, skb_rx, single_last, payload, pl_size); - break; - case I2400M_PT_CTRL: - i2400m_rx_ctl(i2400m, skb_rx, payload, pl_size); - break; - case I2400M_PT_TRACE: - i2400m_rx_trace(i2400m, payload, pl_size); - break; - case I2400M_PT_EDATA: - d_printf(3, dev, "ERX: data payload %zu bytes\n", pl_size); - i2400m_rx_edata(i2400m, skb_rx, single_last, payload, pl_size); - break; - default: /* Anything else shouldn't come to the host */ - if (printk_ratelimit()) - dev_err(dev, "RX: HW BUG? unexpected payload type %u\n", - pl_type); - } -} - - -/* - * Check a received transaction's message header - * - * @i2400m: device descriptor - * @msg_hdr: message header - * @buf_size: size of the received buffer - * - * Check that the declarations done by a RX buffer message header are - * sane and consistent with the amount of data that was received. - */ -static -int i2400m_rx_msg_hdr_check(struct i2400m *i2400m, - const struct i2400m_msg_hdr *msg_hdr, - size_t buf_size) -{ - int result = -EIO; - struct device *dev = i2400m_dev(i2400m); - if (buf_size < sizeof(*msg_hdr)) { - dev_err(dev, "RX: HW BUG? message with short header (%zu " - "vs %zu bytes expected)\n", buf_size, sizeof(*msg_hdr)); - goto error; - } - if (msg_hdr->barker != cpu_to_le32(I2400M_D2H_MSG_BARKER)) { - dev_err(dev, "RX: HW BUG? message received with unknown " - "barker 0x%08x (buf_size %zu bytes)\n", - le32_to_cpu(msg_hdr->barker), buf_size); - goto error; - } - if (msg_hdr->num_pls == 0) { - dev_err(dev, "RX: HW BUG? zero payload packets in message\n"); - goto error; - } - if (le16_to_cpu(msg_hdr->num_pls) > I2400M_MAX_PLS_IN_MSG) { - dev_err(dev, "RX: HW BUG? message contains more payload " - "than maximum; ignoring.\n"); - goto error; - } - result = 0; -error: - return result; -} - - -/* - * Check a payload descriptor against the received data - * - * @i2400m: device descriptor - * @pld: payload descriptor - * @pl_itr: offset (in bytes) in the received buffer the payload is - * located - * @buf_size: size of the received buffer - * - * Given a payload descriptor (part of a RX buffer), check it is sane - * and that the data it declares fits in the buffer. - */ -static -int i2400m_rx_pl_descr_check(struct i2400m *i2400m, - const struct i2400m_pld *pld, - size_t pl_itr, size_t buf_size) -{ - int result = -EIO; - struct device *dev = i2400m_dev(i2400m); - size_t pl_size = i2400m_pld_size(pld); - enum i2400m_pt pl_type = i2400m_pld_type(pld); - - if (pl_size > i2400m->bus_pl_size_max) { - dev_err(dev, "RX: HW BUG? payload @%zu: size %zu is " - "bigger than maximum %zu; ignoring message\n", - pl_itr, pl_size, i2400m->bus_pl_size_max); - goto error; - } - if (pl_itr + pl_size > buf_size) { /* enough? */ - dev_err(dev, "RX: HW BUG? payload @%zu: size %zu " - "goes beyond the received buffer " - "size (%zu bytes); ignoring message\n", - pl_itr, pl_size, buf_size); - goto error; - } - if (pl_type >= I2400M_PT_ILLEGAL) { - dev_err(dev, "RX: HW BUG? 
illegal payload type %u; " - "ignoring message\n", pl_type); - goto error; - } - result = 0; -error: - return result; -} - - -/** - * i2400m_rx - Receive a buffer of data from the device - * - * @i2400m: device descriptor - * @skb: skbuff where the data has been received - * - * Parse in a buffer of data that contains an RX message sent from the - * device. See the file header for the format. Run all checks on the - * buffer header, then run over each payload's descriptors, verify - * their consistency and act on each payload's contents. If - * everything is successful, update the device's statistics. - * - * Note: You need to set the skb to contain only the length of the - * received buffer; for that, use skb_trim(skb, RECEIVED_SIZE). - * - * Returns: - * - * 0 if ok, < 0 errno on error - * - * If ok, this function owns now the skb and the caller DOESN'T have - * to run kfree_skb() on it. However, on error, the caller still owns - * the skb and it is responsible for releasing it. - */ -int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) -{ - int i, result; - struct device *dev = i2400m_dev(i2400m); - const struct i2400m_msg_hdr *msg_hdr; - size_t pl_itr, pl_size; - unsigned long flags; - unsigned num_pls, single_last, skb_len; - - skb_len = skb->len; - d_fnstart(4, dev, "(i2400m %p skb %p [size %u])\n", - i2400m, skb, skb_len); - msg_hdr = (void *) skb->data; - result = i2400m_rx_msg_hdr_check(i2400m, msg_hdr, skb_len); - if (result < 0) - goto error_msg_hdr_check; - result = -EIO; - num_pls = le16_to_cpu(msg_hdr->num_pls); - /* Check payload descriptor(s) */ - pl_itr = struct_size(msg_hdr, pld, num_pls); - pl_itr = ALIGN(pl_itr, I2400M_PL_ALIGN); - if (pl_itr > skb_len) { /* got all the payload descriptors? */ - dev_err(dev, "RX: HW BUG? message too short (%u bytes) for " - "%u payload descriptors (%zu each, total %zu)\n", - skb_len, num_pls, sizeof(msg_hdr->pld[0]), pl_itr); - goto error_pl_descr_short; - } - /* Walk each payload payload--check we really got it */ - for (i = 0; i < num_pls; i++) { - /* work around old gcc warnings */ - pl_size = i2400m_pld_size(&msg_hdr->pld[i]); - result = i2400m_rx_pl_descr_check(i2400m, &msg_hdr->pld[i], - pl_itr, skb_len); - if (result < 0) - goto error_pl_descr_check; - single_last = num_pls == 1 || i == num_pls - 1; - i2400m_rx_payload(i2400m, skb, single_last, &msg_hdr->pld[i], - skb->data + pl_itr); - pl_itr += ALIGN(pl_size, I2400M_PL_ALIGN); - cond_resched(); /* Don't monopolize */ - } - kfree_skb(skb); - /* Update device statistics */ - spin_lock_irqsave(&i2400m->rx_lock, flags); - i2400m->rx_pl_num += i; - if (i > i2400m->rx_pl_max) - i2400m->rx_pl_max = i; - if (i < i2400m->rx_pl_min) - i2400m->rx_pl_min = i; - i2400m->rx_num++; - i2400m->rx_size_acc += skb_len; - if (skb_len < i2400m->rx_size_min) - i2400m->rx_size_min = skb_len; - if (skb_len > i2400m->rx_size_max) - i2400m->rx_size_max = skb_len; - spin_unlock_irqrestore(&i2400m->rx_lock, flags); -error_pl_descr_check: -error_pl_descr_short: -error_msg_hdr_check: - d_fnend(4, dev, "(i2400m %p skb %p [size %u]) = %d\n", - i2400m, skb, skb_len, result); - return result; -} -EXPORT_SYMBOL_GPL(i2400m_rx); - - -void i2400m_unknown_barker(struct i2400m *i2400m, - const void *buf, size_t size) -{ - struct device *dev = i2400m_dev(i2400m); - char prefix[64]; - const __le32 *barker = buf; - dev_err(dev, "RX: HW BUG? 
unknown barker %08x, "
-		"dropping %zu bytes\n", le32_to_cpu(*barker), size);
-	snprintf(prefix, sizeof(prefix), "%s %s: ",
-		 dev_driver_string(dev), dev_name(dev));
-	if (size > 64) {
-		print_hex_dump(KERN_ERR, prefix, DUMP_PREFIX_OFFSET,
-			       8, 4, buf, 64, 0);
-		printk(KERN_ERR "%s... (only first 64 bytes "
-		       "dumped)\n", prefix);
-	} else
-		print_hex_dump(KERN_ERR, prefix, DUMP_PREFIX_OFFSET,
-			       8, 4, buf, size, 0);
-}
-EXPORT_SYMBOL(i2400m_unknown_barker);
-
-
-/*
- * Initialize the RX queue and infrastructure
- *
- * This sets up all the RX reordering infrastructures, which will not
- * be used if reordering is not enabled or if the firmware does not
- * support it. The device is told to do reordering in
- * i2400m_dev_initialize(), where it also looks at the value of the
- * i2400m->rx_reorder switch before taking a decision.
- *
- * Note we allocate the roq queues in one chunk and the actual logging
- * support for them in another one, and then we set up the pointers
- * from the first to the second.
- */
-int i2400m_rx_setup(struct i2400m *i2400m)
-{
-	int result = 0;
-
-	i2400m->rx_reorder = i2400m_rx_reorder_disabled ? 0 : 1;
-	if (i2400m->rx_reorder) {
-		unsigned itr;
-		struct i2400m_roq_log *rd;
-
-		result = -ENOMEM;
-
-		i2400m->rx_roq = kcalloc(I2400M_RO_CIN + 1,
-					 sizeof(i2400m->rx_roq[0]), GFP_KERNEL);
-		if (i2400m->rx_roq == NULL)
-			goto error_roq_alloc;
-
-		rd = kcalloc(I2400M_RO_CIN + 1, sizeof(*i2400m->rx_roq[0].log),
-			     GFP_KERNEL);
-		if (rd == NULL) {
-			result = -ENOMEM;
-			goto error_roq_log_alloc;
-		}
-
-		for (itr = 0; itr < I2400M_RO_CIN + 1; itr++) {
-			__i2400m_roq_init(&i2400m->rx_roq[itr]);
-			i2400m->rx_roq[itr].log = &rd[itr];
-		}
-		kref_init(&i2400m->rx_roq_refcount);
-	}
-	return 0;
-
-error_roq_log_alloc:
-	kfree(i2400m->rx_roq);
-error_roq_alloc:
-	return result;
-}
-
-
-/* Tear down the RX queue and infrastructure */
-void i2400m_rx_release(struct i2400m *i2400m)
-{
-	unsigned long flags;
-
-	if (i2400m->rx_reorder) {
-		spin_lock_irqsave(&i2400m->rx_lock, flags);
-		kref_put(&i2400m->rx_roq_refcount, i2400m_rx_roq_destroy);
-		spin_unlock_irqrestore(&i2400m->rx_lock, flags);
-	}
-	/* at this point, nothing can be received... */
-	i2400m_report_hook_flush(i2400m);
-}
diff --git a/drivers/net/wimax/i2400m/sysfs.c b/drivers/net/wimax/i2400m/sysfs.c
deleted file mode 100644
index 895ee265909b..000000000000
--- a/drivers/net/wimax/i2400m/sysfs.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Wireless WiMAX Connection 2400m
- * Sysfs interfaces to show driver and device information
- *
- * Copyright (C) 2007 Intel Corporation
- * Inaky Perez-Gonzalez
- */
-
-#include
-#include
-#include
-#include
-#include "i2400m.h"
-
-
-#define D_SUBMODULE sysfs
-#include "debug-levels.h"
-
-
-/*
- * Set the idle timeout (msecs)
- *
- * FIXME: eventually this should be a common WiMAX stack method, but
- * would like to wait to see how other devices manage it.
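 *
 * Per the checks below, accepted values are 0 (disable) or multiples
 * of 100 ms between 100 and 300000; e.g. writing "5000" sets a 5 s
 * idle timeout, while "250" is rejected with -EINVAL.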
- */ -static -ssize_t i2400m_idle_timeout_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t size) -{ - ssize_t result; - struct i2400m *i2400m = net_dev_to_i2400m(to_net_dev(dev)); - unsigned val; - - result = -EINVAL; - if (sscanf(buf, "%u\n", &val) != 1) - goto error_no_unsigned; - if (val != 0 && (val < 100 || val > 300000 || val % 100 != 0)) { - dev_err(dev, "idle_timeout: %u: invalid msecs specification; " - "valid values are 0, 100-300000 in 100 increments\n", - val); - goto error_bad_value; - } - result = i2400m_set_idle_timeout(i2400m, val); - if (result >= 0) - result = size; -error_no_unsigned: -error_bad_value: - return result; -} - -static -DEVICE_ATTR_WO(i2400m_idle_timeout); - -static -struct attribute *i2400m_dev_attrs[] = { - &dev_attr_i2400m_idle_timeout.attr, - NULL, -}; - -struct attribute_group i2400m_dev_attr_group = { - .name = NULL, /* we want them in the same directory */ - .attrs = i2400m_dev_attrs, -}; diff --git a/drivers/net/wimax/i2400m/tx.c b/drivers/net/wimax/i2400m/tx.c deleted file mode 100644 index 1255302e251e..000000000000 --- a/drivers/net/wimax/i2400m/tx.c +++ /dev/null @@ -1,1011 +0,0 @@ -/* - * Intel Wireless WiMAX Connection 2400m - * Generic (non-bus specific) TX handling - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * Intel Corporation - * Yanir Lubetkin - * - Initial implementation - * - * Intel Corporation - * Inaky Perez-Gonzalez - * - Rewritten to use a single FIFO to lower the memory allocation - * pressure and optimize cache hits when copying to the queue, as - * well as splitting out bus-specific code. - * - * - * Implements data transmission to the device; this is done through a - * software FIFO, as data/control frames can be coalesced (while the - * device is reading the previous tx transaction, others accumulate). - * - * A FIFO is used because at the end it is resource-cheaper that trying - * to implement scatter/gather over USB. 
As well, most traffic is going - * to be download (vs upload). - * - * The format for sending/receiving data to/from the i2400m is - * described in detail in rx.c:PROTOCOL FORMAT. In here we implement - * the transmission of that. This is split between a bus-independent - * part that just prepares everything and a bus-specific part that - * does the actual transmission over the bus to the device (in the - * bus-specific driver). - * - * - * The general format of a device-host transaction is MSG-HDR, PLD1, - * PLD2...PLDN, PL1, PL2,...PLN, PADDING. - * - * Because we need the send payload descriptors and then payloads and - * because it is kind of expensive to do scatterlists in USB (one URB - * per node), it becomes cheaper to append all the data to a FIFO - * (copying to a FIFO potentially in cache is cheaper). - * - * Then the bus-specific code takes the parts of that FIFO that are - * written and passes them to the device. - * - * So the concepts to keep in mind there are: - * - * We use a FIFO to queue the data in a linear buffer. We first append - * a MSG-HDR, space for I2400M_TX_PLD_MAX payload descriptors and then - * go appending payloads until we run out of space or of payload - * descriptors. Then we append padding to make the whole transaction a - * multiple of i2400m->bus_tx_block_size (as defined by the bus layer). - * - * - A TX message: a combination of a message header, payload - * descriptors and payloads. - * - * Open: it is marked as active (i2400m->tx_msg is valid) and we - * can keep adding payloads to it. - * - * Closed: we are not appending more payloads to this TX message - * (exahusted space in the queue, too many payloads or - * whichever). We have appended padding so the whole message - * length is aligned to i2400m->bus_tx_block_size (as set by the - * bus/transport layer). - * - * - Most of the time we keep a TX message open to which we append - * payloads. - * - * - If we are going to append and there is no more space (we are at - * the end of the FIFO), we close the message, mark the rest of the - * FIFO space unusable (skip_tail), create a new message at the - * beginning of the FIFO (if there is space) and append the message - * there. - * - * This is because we need to give linear TX messages to the bus - * engine. So we don't write a message to the remaining FIFO space - * until the tail and continue at the head of it. - * - * - We overload one of the fields in the message header to use it as - * 'size' of the TX message, so we can iterate over them. It also - * contains a flag that indicates if we have to skip it or not. - * When we send the buffer, we update that to its real on-the-wire - * value. - * - * - The MSG-HDR PLD1...PLD2 stuff has to be a size multiple of 16. - * - * It follows that if MSG-HDR says we have N messages, the whole - * header + descriptors is 16 + 4*N; for those to be a multiple of - * 16, it follows that N can be 4, 8, 12, ... (32, 48, 64, 80... - * bytes). - * - * So if we have only 1 payload, we have to submit a header that in - * all truth has space for 4. - * - * The implication is that we reserve space for 12 (64 bytes); but - * if we fill up only (eg) 2, our header becomes 32 bytes only. So - * the TX engine has to shift those 32 bytes of msg header and 2 - * payloads and padding so that right after it the payloads start - * and the TX engine has to know about that. - * - * It is cheaper to move the header up than the whole payloads down. - * - * We do this in i2400m_tx_close(). See 'i2400m_msg_hdr->offset'. 
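 *
 *   Worked example with the numbers above: with 2 payloads the
 *   header is 16 + 2 * 4 = 24 bytes, which aligns up to 32; as 64
 *   bytes were reserved (room for 12 descriptors), the header is
 *   moved up by an offset of 64 - 32 = 32 bytes so that it ends
 *   right where the payloads begin.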
- * - * - Each payload has to be size-padded to 16 bytes; before appending - * it, we just do it. - * - * - The whole message has to be padded to i2400m->bus_tx_block_size; - * we do this at close time. Thus, when reserving space for the - * payload, we always make sure there is also free space for this - * padding that sooner or later will happen. - * - * When we append a message, we tell the bus specific code to kick in - * TXs. It will TX (in parallel) until the buffer is exhausted--hence - * the lockin we do. The TX code will only send a TX message at the - * time (which remember, might contain more than one payload). Of - * course, when the bus-specific driver attempts to TX a message that - * is still open, it gets closed first. - * - * Gee, this is messy; well a picture. In the example below we have a - * partially full FIFO, with a closed message ready to be delivered - * (with a moved message header to make sure it is size-aligned to - * 16), TAIL room that was unusable (and thus is marked with a message - * header that says 'skip this') and at the head of the buffer, an - * incomplete message with a couple of payloads. - * - * N ___________________________________________________ - * | | - * | TAIL room | - * | | - * | msg_hdr to skip (size |= 0x80000) | - * |---------------------------------------------------|------- - * | | /|\ - * | | | - * | TX message padding | | - * | | | - * | | | - * |- - - - - - - - - - - - - - - - - - - - - - - - - -| | - * | | | - * | payload 1 | | - * | | N * tx_block_size - * | | | - * |- - - - - - - - - - - - - - - - - - - - - - - - - -| | - * | | | - * | payload 1 | | - * | | | - * | | | - * |- - - - - - - - - - - - - - - - - - - - - - - - - -|- -|- - - - - * | padding 3 /|\ | | /|\ - * | padding 2 | | | | - * | pld 1 32 bytes (2 * 16) | | | - * | pld 0 | | | | - * | moved msg_hdr \|/ | \|/ | - * |- - - - - - - - - - - - - - - - - - - - - - - - - -|- - - | - * | | _PLD_SIZE - * | unused | | - * | | | - * |- - - - - - - - - - - - - - - - - - - - - - - - - -| | - * | msg_hdr (size X) [this message is closed] | \|/ - * |===================================================|========== <=== OUT - * | | - * | | - * | | - * | Free rooom | - * | | - * | | - * | | - * | | - * | | - * | | - * | | - * | | - * | | - * |===================================================|========== <=== IN - * | | - * | | - * | | - * | | - * | payload 1 | - * | | - * | | - * |- - - - - - - - - - - - - - - - - - - - - - - - - -| - * | | - * | payload 0 | - * | | - * | | - * |- - - - - - - - - - - - - - - - - - - - - - - - - -| - * | pld 11 /|\ | - * | ... | | - * | pld 1 64 bytes (2 * 16) | - * | pld 0 | | - * | msg_hdr (size X) \|/ [message is open] | - * 0 --------------------------------------------------- - * - * - * ROADMAP - * - * i2400m_tx_setup() Called by i2400m_setup - * i2400m_tx_release() Called by i2400m_release() - * - * i2400m_tx() Called to send data or control frames - * i2400m_tx_fifo_push() Allocates append-space in the FIFO - * i2400m_tx_new() Opens a new message in the FIFO - * i2400m_tx_fits() Checks if a new payload fits in the message - * i2400m_tx_close() Closes an open message in the FIFO - * i2400m_tx_skip_tail() Marks unusable FIFO tail space - * i2400m->bus_tx_kick() - * - * Now i2400m->bus_tx_kick() is the the bus-specific driver backend - * implementation; that would do: - * - * i2400m->bus_tx_kick() - * i2400m_tx_msg_get() Gets first message ready to go - * ...sends it... 
- * i2400m_tx_msg_sent() Ack the message is sent; repeat from - * _tx_msg_get() until it returns NULL - * (FIFO empty). - */ -#include -#include -#include -#include "i2400m.h" - - -#define D_SUBMODULE tx -#include "debug-levels.h" - -enum { - /** - * TX Buffer size - * - * Doc says maximum transaction is 16KiB. If we had 16KiB en - * route and 16KiB being queued, it boils down to needing - * 32KiB. - * 32KiB is insufficient for 1400 MTU, hence increasing - * tx buffer size to 64KiB. - */ - I2400M_TX_BUF_SIZE = 65536, - /** - * Message header and payload descriptors have to be 16 - * aligned (16 + 4 * N = 16 * M). If we take that average sent - * packets are MTU size (~1400-~1500) it follows that we could - * fit at most 10-11 payloads in one transaction. To meet the - * alignment requirement, that means we need to leave space - * for 12 (64 bytes). To simplify, we leave space for that. If - * at the end there are less, we pad up to the nearest - * multiple of 16. - */ - /* - * According to Intel Wimax i3200, i5x50 and i6x50 specification - * documents, the maximum number of payloads per message can be - * up to 60. Increasing the number of payloads to 60 per message - * helps to accommodate smaller payloads in a single transaction. - */ - I2400M_TX_PLD_MAX = 60, - I2400M_TX_PLD_SIZE = sizeof(struct i2400m_msg_hdr) - + I2400M_TX_PLD_MAX * sizeof(struct i2400m_pld), - I2400M_TX_SKIP = 0x80000000, - /* - * According to Intel Wimax i3200, i5x50 and i6x50 specification - * documents, the maximum size of each message can be up to 16KiB. - */ - I2400M_TX_MSG_SIZE = 16384, -}; - -#define TAIL_FULL ((void *)~(unsigned long)NULL) - -/* - * Calculate how much tail room is available - * - * Note the trick here. This path is ONLY caleed for Case A (see - * i2400m_tx_fifo_push() below), where we have: - * - * Case A - * N ___________ - * | tail room | - * | | - * |<- IN ->| - * | | - * | data | - * | | - * |<- OUT ->| - * | | - * | head room | - * 0 ----------- - * - * When calculating the tail_room, tx_in might get to be zero if - * i2400m->tx_in is right at the end of the buffer (really full - * buffer) if there is no head room. In this case, tail_room would be - * I2400M_TX_BUF_SIZE, although it is actually zero. Hence the final - * mod (%) operation. However, when doing this kind of optimization, - * i2400m->tx_in being zero would fail, so we treat is an a special - * case. - */ -static inline -size_t __i2400m_tx_tail_room(struct i2400m *i2400m) -{ - size_t tail_room; - size_t tx_in; - - if (unlikely(i2400m->tx_in == 0)) - return I2400M_TX_BUF_SIZE; - tx_in = i2400m->tx_in % I2400M_TX_BUF_SIZE; - tail_room = I2400M_TX_BUF_SIZE - tx_in; - tail_room %= I2400M_TX_BUF_SIZE; - return tail_room; -} - - -/* - * Allocate @size bytes in the TX fifo, return a pointer to it - * - * @i2400m: device descriptor - * @size: size of the buffer we need to allocate - * @padding: ensure that there is at least this many bytes of free - * contiguous space in the fifo. This is needed because later on - * we might need to add padding. - * @try_head: specify either to allocate head room or tail room space - * in the TX FIFO. This boolean is required to avoids a system hang - * due to an infinite loop caused by i2400m_tx_fifo_push(). - * The caller must always try to allocate tail room space first by - * calling this routine with try_head = 0. 
In case if there - * is not enough tail room space but there is enough head room space, - * (i2400m_tx_fifo_push() returns TAIL_FULL) try to allocate head - * room space, by calling this routine again with try_head = 1. - * - * Returns: - * - * Pointer to the allocated space. NULL if there is no - * space. TAIL_FULL if there is no space at the tail but there is at - * the head (Case B below). - * - * These are the two basic cases we need to keep an eye for -- it is - * much better explained in linux/kernel/kfifo.c, but this code - * basically does the same. No rocket science here. - * - * Case A Case B - * N ___________ ___________ - * | tail room | | data | - * | | | | - * |<- IN ->| |<- OUT ->| - * | | | | - * | data | | room | - * | | | | - * |<- OUT ->| |<- IN ->| - * | | | | - * | head room | | data | - * 0 ----------- ----------- - * - * We allocate only *contiguous* space. - * - * We can allocate only from 'room'. In Case B, it is simple; in case - * A, we only try from the tail room; if it is not enough, we just - * fail and return TAIL_FULL and let the caller figure out if we wants to - * skip the tail room and try to allocate from the head. - * - * There is a corner case, wherein i2400m_tx_new() can get into - * an infinite loop calling i2400m_tx_fifo_push(). - * In certain situations, tx_in would have reached on the top of TX FIFO - * and i2400m_tx_tail_room() returns 0, as described below: - * - * N ___________ tail room is zero - * |<- IN ->| - * | | - * | | - * | | - * | data | - * |<- OUT ->| - * | | - * | | - * | head room | - * 0 ----------- - * During such a time, where tail room is zero in the TX FIFO and if there - * is a request to add a payload to TX FIFO, which calls: - * i2400m_tx() - * ->calls i2400m_tx_close() - * ->calls i2400m_tx_skip_tail() - * goto try_new; - * ->calls i2400m_tx_new() - * |----> [try_head:] - * infinite loop | ->calls i2400m_tx_fifo_push() - * | if (tail_room < needed) - * | if (head_room => needed) - * | return TAIL_FULL; - * |<---- goto try_head; - * - * i2400m_tx() calls i2400m_tx_close() to close the message, since there - * is no tail room to accommodate the payload and calls - * i2400m_tx_skip_tail() to skip the tail space. Now i2400m_tx() calls - * i2400m_tx_new() to allocate space for new message header calling - * i2400m_tx_fifo_push() that returns TAIL_FULL, since there is no tail space - * to accommodate the message header, but there is enough head space. - * The i2400m_tx_new() keeps re-retrying by calling i2400m_tx_fifo_push() - * ending up in a loop causing system freeze. - * - * This corner case is avoided by using a try_head boolean, - * as an argument to i2400m_tx_fifo_push(). - * - * Note: - * - * Assumes i2400m->tx_lock is taken, and we use that as a barrier - * - * The indexes keep increasing and we reset them to zero when we - * pop data off the queue - */ -static -void *i2400m_tx_fifo_push(struct i2400m *i2400m, size_t size, - size_t padding, bool try_head) -{ - struct device *dev = i2400m_dev(i2400m); - size_t room, tail_room, needed_size; - void *ptr; - - needed_size = size + padding; - room = I2400M_TX_BUF_SIZE - (i2400m->tx_in - i2400m->tx_out); - if (room < needed_size) { /* this takes care of Case B */ - d_printf(2, dev, "fifo push %zu/%zu: no space\n", - size, padding); - return NULL; - } - /* Is there space at the tail? 
*/ - tail_room = __i2400m_tx_tail_room(i2400m); - if (!try_head && tail_room < needed_size) { - /* - * If the tail room space is not enough to push the message - * in the TX FIFO, then there are two possibilities: - * 1. There is enough head room space to accommodate - * this message in the TX FIFO. - * 2. There is not enough space in the head room and - * in tail room of the TX FIFO to accommodate the message. - * In the case (1), return TAIL_FULL so that the caller - * can figure out, if the caller wants to push the message - * into the head room space. - * In the case (2), return NULL, indicating that the TX FIFO - * cannot accommodate the message. - */ - if (room - tail_room >= needed_size) { - d_printf(2, dev, "fifo push %zu/%zu: tail full\n", - size, padding); - return TAIL_FULL; /* There might be head space */ - } else { - d_printf(2, dev, "fifo push %zu/%zu: no head space\n", - size, padding); - return NULL; /* There is no space */ - } - } - ptr = i2400m->tx_buf + i2400m->tx_in % I2400M_TX_BUF_SIZE; - d_printf(2, dev, "fifo push %zu/%zu: at @%zu\n", size, padding, - i2400m->tx_in % I2400M_TX_BUF_SIZE); - i2400m->tx_in += size; - return ptr; -} - - -/* - * Mark the tail of the FIFO buffer as 'to-skip' - * - * We should never hit the BUG_ON() because all the sizes we push to - * the FIFO are padded to be a multiple of 16 -- the size of *msg - * (I2400M_PL_PAD for the payloads, I2400M_TX_PLD_SIZE for the - * header). - * - * Tail room can get to be zero if a message was opened when there was - * space only for a header. _tx_close() will mark it as to-skip (as it - * will have no payloads) and there will be no more space to flush, so - * nothing has to be done here. This is probably cheaper than ensuring - * in _tx_new() that there is some space for payloads...as we could - * always possibly hit the same problem if the payload wouldn't fit. - * - * Note: - * - * Assumes i2400m->tx_lock is taken, and we use that as a barrier - * - * This path is only taken for Case A FIFO situations [see - * i2400m_tx_fifo_push()] - */ -static -void i2400m_tx_skip_tail(struct i2400m *i2400m) -{ - struct device *dev = i2400m_dev(i2400m); - size_t tx_in = i2400m->tx_in % I2400M_TX_BUF_SIZE; - size_t tail_room = __i2400m_tx_tail_room(i2400m); - struct i2400m_msg_hdr *msg = i2400m->tx_buf + tx_in; - if (unlikely(tail_room == 0)) - return; - BUG_ON(tail_room < sizeof(*msg)); - msg->size = tail_room | I2400M_TX_SKIP; - d_printf(2, dev, "skip tail: skipping %zu bytes @%zu\n", - tail_room, tx_in); - i2400m->tx_in += tail_room; -} - - -/* - * Check if a skb will fit in the TX queue's current active TX - * message (if there are still descriptors left unused). - * - * Returns: - * 0 if the message won't fit, 1 if it will. - * - * Note: - * - * Assumes a TX message is active (i2400m->tx_msg). - * - * Assumes i2400m->tx_lock is taken, and we use that as a barrier - */ -static -unsigned i2400m_tx_fits(struct i2400m *i2400m) -{ - struct i2400m_msg_hdr *msg_hdr = i2400m->tx_msg; - return le16_to_cpu(msg_hdr->num_pls) < I2400M_TX_PLD_MAX; - -} - - -/* - * Start a new TX message header in the queue. - * - * Reserve memory from the base FIFO engine and then just initialize - * the message header. - * - * We allocate the biggest TX message header we might need (one that'd - * fit I2400M_TX_PLD_MAX payloads) -- when it is closed it will be - * 'ironed it out' and the unneeded parts removed. 
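 *
 * In short, the allocation pattern callers are expected to follow (a
 * sketch of the try_head retry implemented below):
 *
 *	tx_msg = i2400m_tx_fifo_push(i2400m, size, padding, false);
 *	if (tx_msg == TAIL_FULL) {	(tail exhausted, head has room)
 *		i2400m_tx_skip_tail(i2400m);
 *		tx_msg = i2400m_tx_fifo_push(i2400m, size, padding, true);
 *	}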
- * - * NOTE: - * - * Assumes that the previous message is CLOSED (eg: either - * there was none or 'i2400m_tx_close()' was called on it). - * - * Assumes i2400m->tx_lock is taken, and we use that as a barrier - */ -static -void i2400m_tx_new(struct i2400m *i2400m) -{ - struct device *dev = i2400m_dev(i2400m); - struct i2400m_msg_hdr *tx_msg; - bool try_head = false; - BUG_ON(i2400m->tx_msg != NULL); - /* - * In certain situations, TX queue might have enough space to - * accommodate the new message header I2400M_TX_PLD_SIZE, but - * might not have enough space to accommodate the payloads. - * Adding bus_tx_room_min padding while allocating a new TX message - * increases the possibilities of including at least one payload of the - * size <= bus_tx_room_min. - */ -try_head: - tx_msg = i2400m_tx_fifo_push(i2400m, I2400M_TX_PLD_SIZE, - i2400m->bus_tx_room_min, try_head); - if (tx_msg == NULL) - goto out; - else if (tx_msg == TAIL_FULL) { - i2400m_tx_skip_tail(i2400m); - d_printf(2, dev, "new TX message: tail full, trying head\n"); - try_head = true; - goto try_head; - } - memset(tx_msg, 0, I2400M_TX_PLD_SIZE); - tx_msg->size = I2400M_TX_PLD_SIZE; -out: - i2400m->tx_msg = tx_msg; - d_printf(2, dev, "new TX message: %p @%zu\n", - tx_msg, (void *) tx_msg - i2400m->tx_buf); -} - - -/* - * Finalize the current TX message header - * - * Sets the message header to be at the proper location depending on - * how many descriptors we have (check documentation at the file's - * header for more info on that). - * - * Appends padding bytes to make sure the whole TX message (counting - * from the 'relocated' message header) is aligned to - * tx_block_size. We assume the _append() code has left enough space - * in the FIFO for that. If there are no payloads, just pass, as it - * won't be transferred. - * - * The amount of padding bytes depends on how many payloads are in the - * TX message, as the "msg header and payload descriptors" will be - * shifted up in the buffer. - */ -static -void i2400m_tx_close(struct i2400m *i2400m) -{ - struct device *dev = i2400m_dev(i2400m); - struct i2400m_msg_hdr *tx_msg = i2400m->tx_msg; - struct i2400m_msg_hdr *tx_msg_moved; - size_t aligned_size, padding, hdr_size; - void *pad_buf; - unsigned num_pls; - - if (tx_msg->size & I2400M_TX_SKIP) /* a skipper? nothing to do */ - goto out; - num_pls = le16_to_cpu(tx_msg->num_pls); - /* We can get this situation when a new message was started - * and there was no space to add payloads before hitting the - tail (and taking padding into consideration). */ - if (num_pls == 0) { - tx_msg->size |= I2400M_TX_SKIP; - goto out; - } - /* Relocate the message header - * - * Find the current header size, align it to 16 and if we need - * to move it so the tail is next to the payloads, move it and - * set the offset. - * - * If it moved, this header is good only for transmission; the - * original one (it is kept if we moved) is still used to - * figure out where the next TX message starts (and where the - * offset to the moved header is). - */ - hdr_size = struct_size(tx_msg, pld, le16_to_cpu(tx_msg->num_pls)); - hdr_size = ALIGN(hdr_size, I2400M_PL_ALIGN); - tx_msg->offset = I2400M_TX_PLD_SIZE - hdr_size; - tx_msg_moved = (void *) tx_msg + tx_msg->offset; - memmove(tx_msg_moved, tx_msg, hdr_size); - tx_msg_moved->size -= tx_msg->offset; - /* - * Now figure out how much we have to add to the (moved!) - * message so the size is a multiple of i2400m->bus_tx_block_size. 
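	 *
	 * For instance (assuming a bus-specific block size of 256
	 * bytes and a moved-message size of 300 bytes): ALIGN(300,
	 * 256) yields 512, so padding = 512 - 300 = 212 bytes, which
	 * get filled with 0xad below.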
- */ - aligned_size = ALIGN(tx_msg_moved->size, i2400m->bus_tx_block_size); - padding = aligned_size - tx_msg_moved->size; - if (padding > 0) { - pad_buf = i2400m_tx_fifo_push(i2400m, padding, 0, 0); - if (WARN_ON(pad_buf == NULL || pad_buf == TAIL_FULL)) { - /* This should not happen -- append should verify - * there is always space left at least to append - * tx_block_size */ - dev_err(dev, - "SW BUG! Possible data leakage from memory the " - "device should not read for padding - " - "size %lu aligned_size %zu tx_buf %p in " - "%zu out %zu\n", - (unsigned long) tx_msg_moved->size, - aligned_size, i2400m->tx_buf, i2400m->tx_in, - i2400m->tx_out); - } else - memset(pad_buf, 0xad, padding); - } - tx_msg_moved->padding = cpu_to_le16(padding); - tx_msg_moved->size += padding; - if (tx_msg != tx_msg_moved) - tx_msg->size += padding; -out: - i2400m->tx_msg = NULL; -} - - -/** - * i2400m_tx - send the data in a buffer to the device - * - * @buf: pointer to the buffer to transmit - * - * @buf_len: buffer size - * - * @pl_type: type of the payload we are sending. - * - * Returns: - * 0 if ok, < 0 errno code on error (-ENOSPC, if there is no more - * room for the message in the queue). - * - * Appends the buffer to the TX FIFO and notifies the bus-specific - * part of the driver that there is new data ready to transmit. - * Once this function returns, the buffer has been copied, so it can - * be reused. - * - * The steps followed to append are explained in detail in the file - * header. - * - * Whenever we write to a message, we increase msg->size, so it - * reflects exactly how big the message is. This is needed so that if - * we concatenate two messages before they can be sent, the code that - * sends the messages can find the boundaries (and it will replace the - * size with the real barker before sending). - * - * Note: - * - * Cold and warm reset payloads need to be sent as a single - * payload, so we handle that. - */ -int i2400m_tx(struct i2400m *i2400m, const void *buf, size_t buf_len, - enum i2400m_pt pl_type) -{ - int result = -ENOSPC; - struct device *dev = i2400m_dev(i2400m); - unsigned long flags; - size_t padded_len; - void *ptr; - bool try_head = false; - unsigned is_singleton = pl_type == I2400M_PT_RESET_WARM - || pl_type == I2400M_PT_RESET_COLD; - - d_fnstart(3, dev, "(i2400m %p skb %p [%zu bytes] pt %u)\n", - i2400m, buf, buf_len, pl_type); - padded_len = ALIGN(buf_len, I2400M_PL_ALIGN); - d_printf(5, dev, "padded_len %zd buf_len %zd\n", padded_len, buf_len); - /* If there is no current TX message, create one; if the - * current one is out of payload slots or we have a singleton, - * close it and start a new one */ - spin_lock_irqsave(&i2400m->tx_lock, flags); - /* If tx_buf is NULL, device is shutdown */ - if (i2400m->tx_buf == NULL) { - result = -ESHUTDOWN; - goto error_tx_new; - } -try_new: - if (unlikely(i2400m->tx_msg == NULL)) - i2400m_tx_new(i2400m); - else if (unlikely(!i2400m_tx_fits(i2400m) - || (is_singleton && i2400m->tx_msg->num_pls != 0))) { - d_printf(2, dev, "closing TX message (fits %u singleton " - "%u num_pls %u)\n", i2400m_tx_fits(i2400m), - is_singleton, i2400m->tx_msg->num_pls); - i2400m_tx_close(i2400m); - i2400m_tx_new(i2400m); - } - if (i2400m->tx_msg == NULL) - goto error_tx_new; - /* - * Check if this skb will fit in the TX queue's current active - * TX message. The total message size must not exceed the maximum - * size of each message I2400M_TX_MSG_SIZE. If it exceeds, - * close the current message and push this skb into the new message. 
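	 *
	 * E.g., with I2400M_TX_MSG_SIZE == 16384 as defined above: a
	 * current message of 15000 bytes cannot take a 2000 byte
	 * padded payload (17000 > 16384), so the message is closed
	 * and the payload goes into a freshly opened one.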
- */ - if (i2400m->tx_msg->size + padded_len > I2400M_TX_MSG_SIZE) { - d_printf(2, dev, "TX: message too big, going new\n"); - i2400m_tx_close(i2400m); - i2400m_tx_new(i2400m); - } - if (i2400m->tx_msg == NULL) - goto error_tx_new; - /* So we have a current message header; now append space for - * the message -- if there is not enough, try the head */ - ptr = i2400m_tx_fifo_push(i2400m, padded_len, - i2400m->bus_tx_block_size, try_head); - if (ptr == TAIL_FULL) { /* Tail is full, try head */ - d_printf(2, dev, "pl append: tail full\n"); - i2400m_tx_close(i2400m); - i2400m_tx_skip_tail(i2400m); - try_head = true; - goto try_new; - } else if (ptr == NULL) { /* All full */ - result = -ENOSPC; - d_printf(2, dev, "pl append: all full\n"); - } else { /* Got space, copy it, set padding */ - struct i2400m_msg_hdr *tx_msg = i2400m->tx_msg; - unsigned num_pls = le16_to_cpu(tx_msg->num_pls); - memcpy(ptr, buf, buf_len); - memset(ptr + buf_len, 0xad, padded_len - buf_len); - i2400m_pld_set(&tx_msg->pld[num_pls], buf_len, pl_type); - d_printf(3, dev, "pld 0x%08x (type 0x%1x len 0x%04zx)\n", - le32_to_cpu(tx_msg->pld[num_pls].val), - pl_type, buf_len); - tx_msg->num_pls = cpu_to_le16(num_pls + 1); - tx_msg->size += padded_len; - d_printf(2, dev, "TX: appended %zu b (up to %u b) pl #%u\n", - padded_len, tx_msg->size, num_pls+1); - d_printf(2, dev, - "TX: appended hdr @%zu %zu b pl #%u @%zu %zu/%zu b\n", - (void *)tx_msg - i2400m->tx_buf, (size_t)tx_msg->size, - num_pls+1, ptr - i2400m->tx_buf, buf_len, padded_len); - result = 0; - if (is_singleton) - i2400m_tx_close(i2400m); - } -error_tx_new: - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - /* kick the bus layer in most cases (it might free space), - * except when the TX subsystem is down */ - if (likely(result != -ESHUTDOWN)) - i2400m->bus_tx_kick(i2400m); - d_fnend(3, dev, "(i2400m %p skb %p [%zu bytes] pt %u) = %d\n", - i2400m, buf, buf_len, pl_type, result); - return result; -} -EXPORT_SYMBOL_GPL(i2400m_tx); - - -/** - * i2400m_tx_msg_get - Get the first TX message in the FIFO to start sending it - * - * @i2400m: device descriptor - * - * @bus_size: where to place the size of the TX message - * - * Called by the bus-specific driver to get the first TX message in - * the FIFO that is ready for transmission. - * - * It sets the state in @i2400m to indicate the bus-specific driver is - * transferring that message (i2400m->tx_msg_size). - * - * Once the transfer is completed, call i2400m_tx_msg_sent(). - * - * Notes: - * - * The size of the TX message to be transmitted might be smaller than - * that of the TX message in the FIFO (in case the header was - * shorter). Hence, we copy it in @bus_size, for the bus layer to - * use. We keep the message's size in i2400m->tx_msg_size so that - * when the bus layer is done transferring we know how much to - * advance the FIFO. - * - * We collect statistics here as all the data is available and we - * assume it is going to work [see i2400m_tx_msg_sent()]. - */ -struct i2400m_msg_hdr *i2400m_tx_msg_get(struct i2400m *i2400m, - size_t *bus_size) -{ - struct device *dev = i2400m_dev(i2400m); - struct i2400m_msg_hdr *tx_msg, *tx_msg_moved; - unsigned long flags, pls; - - d_fnstart(3, dev, "(i2400m %p bus_size %p)\n", i2400m, bus_size); - spin_lock_irqsave(&i2400m->tx_lock, flags); - tx_msg_moved = NULL; - if (i2400m->tx_buf == NULL) - goto out_unlock; -skip: - tx_msg_moved = NULL; - if (i2400m->tx_in == i2400m->tx_out) { /* Empty FIFO?
*/ - i2400m->tx_in = 0; - i2400m->tx_out = 0; - d_printf(2, dev, "TX: FIFO empty: resetting\n"); - goto out_unlock; - } - tx_msg = i2400m->tx_buf + i2400m->tx_out % I2400M_TX_BUF_SIZE; - if (tx_msg->size & I2400M_TX_SKIP) { /* skip? */ - d_printf(2, dev, "TX: skip: msg @%zu (%zu b)\n", - i2400m->tx_out % I2400M_TX_BUF_SIZE, - (size_t) tx_msg->size & ~I2400M_TX_SKIP); - i2400m->tx_out += tx_msg->size & ~I2400M_TX_SKIP; - goto skip; - } - - if (tx_msg->num_pls == 0) { /* No payloads? */ - if (tx_msg == i2400m->tx_msg) { /* open, we are done */ - d_printf(2, dev, - "TX: FIFO empty: open msg w/o payloads @%zu\n", - (void *) tx_msg - i2400m->tx_buf); - tx_msg = NULL; - goto out_unlock; - } else { /* closed, skip it */ - d_printf(2, dev, - "TX: skip msg w/o payloads @%zu (%zu b)\n", - (void *) tx_msg - i2400m->tx_buf, - (size_t) tx_msg->size); - i2400m->tx_out += tx_msg->size & ~I2400M_TX_SKIP; - goto skip; - } - } - if (tx_msg == i2400m->tx_msg) /* open msg? */ - i2400m_tx_close(i2400m); - - /* Now we have a valid TX message (with payloads) to TX */ - tx_msg_moved = (void *) tx_msg + tx_msg->offset; - i2400m->tx_msg_size = tx_msg->size; - *bus_size = tx_msg_moved->size; - d_printf(2, dev, "TX: pid %d msg hdr at @%zu offset +@%zu " - "size %zu bus_size %zu\n", - current->pid, (void *) tx_msg - i2400m->tx_buf, - (size_t) tx_msg->offset, (size_t) tx_msg->size, - (size_t) tx_msg_moved->size); - tx_msg_moved->barker = cpu_to_le32(I2400M_H2D_PREVIEW_BARKER); - tx_msg_moved->sequence = cpu_to_le32(i2400m->tx_sequence++); - - pls = le16_to_cpu(tx_msg_moved->num_pls); - i2400m->tx_pl_num += pls; /* Update stats */ - if (pls > i2400m->tx_pl_max) - i2400m->tx_pl_max = pls; - if (pls < i2400m->tx_pl_min) - i2400m->tx_pl_min = pls; - i2400m->tx_num++; - i2400m->tx_size_acc += *bus_size; - if (*bus_size < i2400m->tx_size_min) - i2400m->tx_size_min = *bus_size; - if (*bus_size > i2400m->tx_size_max) - i2400m->tx_size_max = *bus_size; -out_unlock: - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - d_fnend(3, dev, "(i2400m %p bus_size %p [%zu]) = %p\n", - i2400m, bus_size, *bus_size, tx_msg_moved); - return tx_msg_moved; -} -EXPORT_SYMBOL_GPL(i2400m_tx_msg_get); - - -/** - * i2400m_tx_msg_sent - indicate the transmission of a TX message - * - * @i2400m: device descriptor - * - * Called by the bus-specific driver when a message has been sent; - * this pops it from the FIFO and, as that frees up space, starts the - * queue in case it was stopped. - * - * Should be called even if the message send failed and we are - * dropping this TX message. - */ -void i2400m_tx_msg_sent(struct i2400m *i2400m) -{ - unsigned n; - unsigned long flags; - struct device *dev = i2400m_dev(i2400m); - - d_fnstart(3, dev, "(i2400m %p)\n", i2400m); - spin_lock_irqsave(&i2400m->tx_lock, flags); - if (i2400m->tx_buf == NULL) - goto out_unlock; - i2400m->tx_out += i2400m->tx_msg_size; - d_printf(2, dev, "TX: sent %zu b\n", (size_t) i2400m->tx_msg_size); - i2400m->tx_msg_size = 0; - BUG_ON(i2400m->tx_out > i2400m->tx_in); - /* level the FIFO markers off */ - n = i2400m->tx_out / I2400M_TX_BUF_SIZE; - i2400m->tx_out %= I2400M_TX_BUF_SIZE; - i2400m->tx_in -= n * I2400M_TX_BUF_SIZE; -out_unlock: - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); -} -EXPORT_SYMBOL_GPL(i2400m_tx_msg_sent); - - -/** - * i2400m_tx_setup - Initialize the TX queue and infrastructure - * - * Make sure we reset the TX sequence to zero, as when this function - * is called, the firmware has just been restarted.
Same rationale - * for tx_in, tx_out, tx_msg_size and tx_msg. We reset them since - * the memory for the TX queue is reallocated. - */ -int i2400m_tx_setup(struct i2400m *i2400m) -{ - int result = 0; - void *tx_buf; - unsigned long flags; - - /* Do this here only once -- it can't be done in - * i2400m_hard_start_xmit(), as we'd cause race conditions if - * the work struct was scheduled on another CPU */ - INIT_WORK(&i2400m->wake_tx_ws, i2400m_wake_tx_work); - - tx_buf = kmalloc(I2400M_TX_BUF_SIZE, GFP_ATOMIC); - if (tx_buf == NULL) { - result = -ENOMEM; - goto error_kmalloc; - } - - /* - * Fail the build if we can't fit at least two maximum size messages - * on the TX FIFO [one being delivered while one is constructed]. - */ - BUILD_BUG_ON(2 * I2400M_TX_MSG_SIZE > I2400M_TX_BUF_SIZE); - spin_lock_irqsave(&i2400m->tx_lock, flags); - i2400m->tx_sequence = 0; - i2400m->tx_in = 0; - i2400m->tx_out = 0; - i2400m->tx_msg_size = 0; - i2400m->tx_msg = NULL; - i2400m->tx_buf = tx_buf; - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - /* Huh? the bus layer has to define this... */ - BUG_ON(i2400m->bus_tx_block_size == 0); -error_kmalloc: - return result; - -} - - -/** - * i2400m_tx_release - Tear down the TX queue and infrastructure - */ -void i2400m_tx_release(struct i2400m *i2400m) -{ - unsigned long flags; - spin_lock_irqsave(&i2400m->tx_lock, flags); - kfree(i2400m->tx_buf); - i2400m->tx_buf = NULL; - spin_unlock_irqrestore(&i2400m->tx_lock, flags); -} diff --git a/drivers/net/wimax/i2400m/usb-debug-levels.h b/drivers/net/wimax/i2400m/usb-debug-levels.h deleted file mode 100644 index b6f7335de765..000000000000 --- a/drivers/net/wimax/i2400m/usb-debug-levels.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel Wireless WiMAX Connection 2400m - * Debug levels control file for the i2400m-usb module - * - * Copyright (C) 2007-2008 Intel Corporation - * Inaky Perez-Gonzalez - */ -#ifndef __debug_levels__h__ -#define __debug_levels__h__ - -/* Maximum compile and run time debug level for all submodules */ -#define D_MODULENAME i2400m_usb -#define D_MASTER CONFIG_WIMAX_I2400M_DEBUG_LEVEL - -#include - -/* List of all the enabled modules */ -enum d_module { - D_SUBMODULE_DECLARE(usb), - D_SUBMODULE_DECLARE(fw), - D_SUBMODULE_DECLARE(notif), - D_SUBMODULE_DECLARE(rx), - D_SUBMODULE_DECLARE(tx), -}; - - -#endif /* #ifndef __debug_levels__h__ */ diff --git a/drivers/net/wimax/i2400m/usb-fw.c b/drivers/net/wimax/i2400m/usb-fw.c deleted file mode 100644 index 27ab233650d5..000000000000 --- a/drivers/net/wimax/i2400m/usb-fw.c +++ /dev/null @@ -1,365 +0,0 @@ -/* - * Intel Wireless WiMAX Connection 2400m - * Firmware uploader's USB specifics - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission.
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * Intel Corporation - * Yanir Lubetkin - * Inaky Perez-Gonzalez - * - Initial implementation - * - * Inaky Perez-Gonzalez - * - bus generic/specific split - * - * THE PROCEDURE - * - * See fw.c for the generic description of this procedure. - * - * This file implements only the USB specifics. It boils down to how - * to send a command and wait for an acknowledgement from the - * device. - * - * This code (and process) is single threaded. It assumes it is the - * only thread poking around (guaranteed by fw.c). - * - * COMMAND EXECUTION - * - * A write URB is posted with the buffer to the bulk output endpoint. - * - * ACK RECEPTION - * - * We just post a URB to the notification endpoint and wait for - * data. We repeat until we get all the data we expect (as indicated - * by the call from the bus generic code). - * - * The data is not read from the bulk in endpoint for boot mode. - * - * ROADMAP - * - * i2400mu_bus_bm_cmd_send - * i2400m_bm_cmd_prepare... - * i2400mu_tx_bulk_out - * - * i2400mu_bus_bm_wait_for_ack - * i2400m_notif_submit - */ -#include -#include -#include "i2400m-usb.h" - - -#define D_SUBMODULE fw -#include "usb-debug-levels.h" - - -/* - * Synchronous write to the device - * - * Takes care of updating EDC counts and thus handles device errors. - */ -static -ssize_t i2400mu_tx_bulk_out(struct i2400mu *i2400mu, void *buf, size_t buf_size) -{ - int result; - struct device *dev = &i2400mu->usb_iface->dev; - int len; - struct usb_endpoint_descriptor *epd; - int pipe, do_autopm = 1; - - result = usb_autopm_get_interface(i2400mu->usb_iface); - if (result < 0) { - dev_err(dev, "BM-CMD: can't get autopm: %d\n", result); - do_autopm = 0; - } - epd = usb_get_epd(i2400mu->usb_iface, i2400mu->endpoint_cfg.bulk_out); - pipe = usb_sndbulkpipe(i2400mu->usb_dev, epd->bEndpointAddress); -retry: - result = usb_bulk_msg(i2400mu->usb_dev, pipe, buf, buf_size, &len, 200); - switch (result) { - case 0: - if (len != buf_size) { - dev_err(dev, "BM-CMD: short write (%u B vs %zu " - "expected)\n", len, buf_size); - result = -EIO; - break; - } - result = len; - break; - case -EPIPE: - /* - * Stall -- maybe the device is choking with our - * requests. Clear it and give it some time. If they - * happen too often, it might be another symptom, so we - * reset. - * - * No error handling for usb_clear_halt(); if it - * works, the retry works; if it fails, this switch - * does the error handling for us.
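(Aside: a hedged sketch of the edc_inc() contract this file leans on throughout -- the real error-density counter lives in i2400m-usb.h, and the names and fields below are illustrative only. The idea: count errors inside a time window and report true once they pile up faster than allowed, so callers can escalate from retry to device reset.)

#include <linux/jiffies.h>

struct edc_sketch {
	unsigned long timestart;	/* jiffies when the current window opened */
	unsigned int errorcount;	/* errors seen inside the window */
};

static bool edc_inc_sketch(struct edc_sketch *edc,
			   unsigned int max_err, unsigned int timeframe)
{
	unsigned long now = jiffies;

	if (time_after(now, edc->timestart + timeframe)) {
		edc->errorcount = 1;	/* stale window: restart the count */
		edc->timestart = now;
		return false;
	}
	return ++edc->errorcount > max_err;	/* trip: too many, too fast */
}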
- */ - if (edc_inc(&i2400mu->urb_edc, - 10 * EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) { - dev_err(dev, "BM-CMD: too many stalls in " - "URB; resetting device\n"); - usb_queue_reset_device(i2400mu->usb_iface); - } else { - usb_clear_halt(i2400mu->usb_dev, pipe); - msleep(10); /* give the device some time */ - goto retry; - } - fallthrough; - case -EINVAL: /* while removing driver */ - case -ENODEV: /* dev disconnect ... */ - case -ENOENT: /* just ignore it */ - case -ESHUTDOWN: /* and exit */ - case -ECONNRESET: - result = -ESHUTDOWN; - break; - case -ETIMEDOUT: /* bah... */ - break; - default: /* any other? */ - if (edc_inc(&i2400mu->urb_edc, - EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) { - dev_err(dev, "BM-CMD: maximum errors in " - "URB exceeded; resetting device\n"); - usb_queue_reset_device(i2400mu->usb_iface); - result = -ENODEV; - break; - } - dev_err(dev, "BM-CMD: URB error %d, retrying\n", - result); - goto retry; - } - if (do_autopm) - usb_autopm_put_interface(i2400mu->usb_iface); - return result; -} - - -/* - * Send a boot-mode command over the bulk-out pipe - * - * Command can be a raw command, which requires no preparation (and - * which might not even be following the command format). Checks that - * the right amount of data was transferred. - * - * To satisfy USB requirements (no onstack, vmalloc or in data segment - * buffers), we copy the command to i2400m->bm_cmd_buf and send it from - * there. - * - * @flags: pass thru from i2400m_bm_cmd() - * @return: cmd_size if ok, < 0 errno code on error. - */ -ssize_t i2400mu_bus_bm_cmd_send(struct i2400m *i2400m, - const struct i2400m_bootrom_header *_cmd, - size_t cmd_size, int flags) -{ - ssize_t result; - struct device *dev = i2400m_dev(i2400m); - struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); - int opcode = _cmd == NULL ? -1 : i2400m_brh_get_opcode(_cmd); - struct i2400m_bootrom_header *cmd; - size_t cmd_size_a = ALIGN(cmd_size, 16); /* USB restriction */ - - d_fnstart(8, dev, "(i2400m %p cmd %p size %zu)\n", - i2400m, _cmd, cmd_size); - result = -E2BIG; - if (cmd_size > I2400M_BM_CMD_BUF_SIZE) - goto error_too_big; - if (_cmd != i2400m->bm_cmd_buf) - memmove(i2400m->bm_cmd_buf, _cmd, cmd_size); - cmd = i2400m->bm_cmd_buf; - if (cmd_size_a > cmd_size) /* Zero pad space */ - memset(i2400m->bm_cmd_buf + cmd_size, 0, cmd_size_a - cmd_size); - if ((flags & I2400M_BM_CMD_RAW) == 0) { - if (WARN_ON(i2400m_brh_get_response_required(cmd) == 0)) - dev_warn(dev, "SW BUG: response_required == 0\n"); - i2400m_bm_cmd_prepare(cmd); - } - result = i2400mu_tx_bulk_out(i2400mu, i2400m->bm_cmd_buf, cmd_size); - if (result < 0) { - dev_err(dev, "boot-mode cmd %d: cannot send: %zd\n", - opcode, result); - goto error_cmd_send; - } - if (result != cmd_size) { /* all was transferred? 
*/ - dev_err(dev, "boot-mode cmd %d: incomplete transfer " - "(%zd vs %zu submitted)\n", opcode, result, cmd_size); - result = -EIO; - goto error_cmd_size; - } -error_cmd_size: -error_cmd_send: -error_too_big: - d_fnend(8, dev, "(i2400m %p cmd %p size %zu) = %zd\n", - i2400m, _cmd, cmd_size, result); - return result; -} - - -static -void __i2400mu_bm_notif_cb(struct urb *urb) -{ - complete(urb->context); -} - - -/* - * submit a read to the notification endpoint - * - * @i2400m: device descriptor - * @urb: urb to use - * @completion: completion variable to complete when done - * - * Data is always read to i2400m->bm_ack_buf - */ -static -int i2400mu_notif_submit(struct i2400mu *i2400mu, struct urb *urb, - struct completion *completion) -{ - struct i2400m *i2400m = &i2400mu->i2400m; - struct usb_endpoint_descriptor *epd; - int pipe; - - epd = usb_get_epd(i2400mu->usb_iface, - i2400mu->endpoint_cfg.notification); - pipe = usb_rcvintpipe(i2400mu->usb_dev, epd->bEndpointAddress); - usb_fill_int_urb(urb, i2400mu->usb_dev, pipe, - i2400m->bm_ack_buf, I2400M_BM_ACK_BUF_SIZE, - __i2400mu_bm_notif_cb, completion, - epd->bInterval); - return usb_submit_urb(urb, GFP_KERNEL); -} - - -/* - * Read an ack from the notification endpoint - * - * @i2400m: - * @_ack: pointer to where to store the read data - * @ack_size: how many bytes we should read - * - * Returns: < 0 errno code on error; otherwise, amount of received bytes. - * - * Submits a notification read, appends the read data to the given ack - * buffer and then repeats (until @ack_size bytes have been - * received). - */ -ssize_t i2400mu_bus_bm_wait_for_ack(struct i2400m *i2400m, - struct i2400m_bootrom_header *_ack, - size_t ack_size) -{ - ssize_t result = -ENOMEM; - struct device *dev = i2400m_dev(i2400m); - struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); - struct urb notif_urb; - void *ack = _ack; - size_t offset, len; - long val; - int do_autopm = 1; - DECLARE_COMPLETION_ONSTACK(notif_completion); - - d_fnstart(8, dev, "(i2400m %p ack %p size %zu)\n", - i2400m, ack, ack_size); - BUG_ON(_ack == i2400m->bm_ack_buf); - result = usb_autopm_get_interface(i2400mu->usb_iface); - if (result < 0) { - dev_err(dev, "BM-ACK: can't get autopm: %d\n", (int) result); - do_autopm = 0; - } - usb_init_urb(¬if_urb); /* ready notifications */ - usb_get_urb(¬if_urb); - offset = 0; - while (offset < ack_size) { - init_completion(¬if_completion); - result = i2400mu_notif_submit(i2400mu, ¬if_urb, - ¬if_completion); - if (result < 0) - goto error_notif_urb_submit; - val = wait_for_completion_interruptible_timeout( - ¬if_completion, HZ); - if (val == 0) { - result = -ETIMEDOUT; - usb_kill_urb(¬if_urb); /* Timedout */ - goto error_notif_wait; - } - if (val == -ERESTARTSYS) { - result = -EINTR; /* Interrupted */ - usb_kill_urb(¬if_urb); - goto error_notif_wait; - } - result = notif_urb.status; /* How was the ack? */ - switch (result) { - case 0: - break; - case -EINVAL: /* while removing driver */ - case -ENODEV: /* dev disconnect ... */ - case -ENOENT: /* just ignore it */ - case -ESHUTDOWN: /* and exit */ - case -ECONNRESET: - result = -ESHUTDOWN; - goto error_dev_gone; - default: /* any other? 
*/ - usb_kill_urb(¬if_urb); /* Timedout */ - if (edc_inc(&i2400mu->urb_edc, - EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) - goto error_exceeded; - dev_err(dev, "BM-ACK: URB error %d, " - "retrying\n", notif_urb.status); - continue; /* retry */ - } - if (notif_urb.actual_length == 0) { - d_printf(6, dev, "ZLP received, retrying\n"); - continue; - } - /* Got data, append it to the buffer */ - len = min(ack_size - offset, (size_t) notif_urb.actual_length); - memcpy(ack + offset, i2400m->bm_ack_buf, len); - offset += len; - } - result = offset; -error_notif_urb_submit: -error_notif_wait: -error_dev_gone: -out: - if (do_autopm) - usb_autopm_put_interface(i2400mu->usb_iface); - d_fnend(8, dev, "(i2400m %p ack %p size %zu) = %ld\n", - i2400m, ack, ack_size, (long) result); - usb_put_urb(¬if_urb); - return result; - -error_exceeded: - dev_err(dev, "bm: maximum errors in notification URB exceeded; " - "resetting device\n"); - usb_queue_reset_device(i2400mu->usb_iface); - goto out; -} diff --git a/drivers/net/wimax/i2400m/usb-notif.c b/drivers/net/wimax/i2400m/usb-notif.c deleted file mode 100644 index 5d429f816125..000000000000 --- a/drivers/net/wimax/i2400m/usb-notif.c +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Intel Wireless WiMAX Connection 2400m over USB - * Notification handling - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * Intel Corporation - * Yanir Lubetkin - * Inaky Perez-Gonzalez - * - Initial implementation - * - * - * The notification endpoint is active when the device is not in boot - * mode; in here we just read and get notifications; based on those, - * we act to either reinitialize the device after a reboot or to - * submit a RX request. 
- * - * ROADMAP - * - * i2400mu_notification_setup() - * - * i2400mu_notification_release() - * - * i2400mu_notification_cb() Called when a URB is ready - * i2400mu_notification_grok() - * i2400m_is_boot_barker() - * i2400m_dev_reset_handle() - * i2400mu_rx_kick() - */ -#include -#include -#include "i2400m-usb.h" - - -#define D_SUBMODULE notif -#include "usb-debug-levels.h" - - -static const -__le32 i2400m_ZERO_BARKER[4] = { 0, 0, 0, 0 }; - - -/* - * Process a received notification - * - * In normal operation mode, we can only receive two types of payloads - * on the notification endpoint: - * - * - a reboot barker: we do a bootstrap (the device has reset). - * - * - a block of zeroes: there is pending data in the IN endpoint - */ -static -int i2400mu_notification_grok(struct i2400mu *i2400mu, const void *buf, - size_t buf_len) -{ - int ret; - struct device *dev = &i2400mu->usb_iface->dev; - struct i2400m *i2400m = &i2400mu->i2400m; - - d_fnstart(4, dev, "(i2400m %p buf %p buf_len %zu)\n", - i2400mu, buf, buf_len); - ret = -EIO; - if (buf_len < sizeof(i2400m_ZERO_BARKER)) - /* Not a bug, just ignore */ - goto error_bad_size; - ret = 0; - if (!memcmp(i2400m_ZERO_BARKER, buf, sizeof(i2400m_ZERO_BARKER))) { - i2400mu_rx_kick(i2400mu); - goto out; - } - ret = i2400m_is_boot_barker(i2400m, buf, buf_len); - if (unlikely(ret >= 0)) - ret = i2400m_dev_reset_handle(i2400m, "device rebooted"); - else /* Unknown or unexpected data in the notif message */ - i2400m_unknown_barker(i2400m, buf, buf_len); -error_bad_size: -out: - d_fnend(4, dev, "(i2400m %p buf %p buf_len %zu) = %d\n", - i2400mu, buf, buf_len, ret); - return ret; -} - - -/* - * URB callback for the notification endpoint - * - * @urb: the urb received from the notification endpoint - * - * This function will just process the USB side of the transaction, - * checking everything is fine, pass the processing to - * i2400mu_notification_grok() and resubmit the URB. - */ -static -void i2400mu_notification_cb(struct urb *urb) -{ - int ret; - struct i2400mu *i2400mu = urb->context; - struct device *dev = &i2400mu->usb_iface->dev; - - d_fnstart(4, dev, "(urb %p status %d actual_length %d)\n", - urb, urb->status, urb->actual_length); - ret = urb->status; - switch (ret) { - case 0: - ret = i2400mu_notification_grok(i2400mu, urb->transfer_buffer, - urb->actual_length); - if (ret == -EIO && edc_inc(&i2400mu->urb_edc, EDC_MAX_ERRORS, - EDC_ERROR_TIMEFRAME)) - goto error_exceeded; - if (ret == -ENOMEM) /* uff...power cycle? shutdown? */ - goto error_exceeded; - break; - case -EINVAL: /* while removing driver */ - case -ENODEV: /* dev disconnect ... */ - case -ENOENT: /* ditto */ - case -ESHUTDOWN: /* URB killed */ - case -ECONNRESET: /* disconnection */ - goto out; /* Notify around */ - default: /* Some error? */ - if (edc_inc(&i2400mu->urb_edc, - EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) - goto error_exceeded; - dev_err(dev, "notification: URB error %d, retrying\n", - urb->status); - } - usb_mark_last_busy(i2400mu->usb_dev); - ret = usb_submit_urb(i2400mu->notif_urb, GFP_ATOMIC); - switch (ret) { - case 0: - case -EINVAL: /* while removing driver */ - case -ENODEV: /* dev disconnect ... */ - case -ENOENT: /* ditto */ - case -ESHUTDOWN: /* URB killed */ - case -ECONNRESET: /* disconnection */ - break; /* just ignore */ - default: /* Some error?
*/ - dev_err(dev, "notification: cannot submit URB: %d\n", ret); - goto error_submit; - } - d_fnend(4, dev, "(urb %p status %d actual_length %d) = void\n", - urb, urb->status, urb->actual_length); - return; - -error_exceeded: - dev_err(dev, "maximum errors in notification URB exceeded; " - "resetting device\n"); -error_submit: - usb_queue_reset_device(i2400mu->usb_iface); -out: - d_fnend(4, dev, "(urb %p status %d actual_length %d) = void\n", - urb, urb->status, urb->actual_length); -} - - -/* - * setup the notification endpoint - * - * @i2400m: device descriptor - * - * This procedure prepares the notification urb and handler for receiving - * unsolicited barkers from the device. - */ -int i2400mu_notification_setup(struct i2400mu *i2400mu) -{ - struct device *dev = &i2400mu->usb_iface->dev; - int usb_pipe, ret = 0; - struct usb_endpoint_descriptor *epd; - char *buf; - - d_fnstart(4, dev, "(i2400m %p)\n", i2400mu); - buf = kmalloc(I2400MU_MAX_NOTIFICATION_LEN, GFP_KERNEL | GFP_DMA); - if (buf == NULL) { - ret = -ENOMEM; - goto error_buf_alloc; - } - - i2400mu->notif_urb = usb_alloc_urb(0, GFP_KERNEL); - if (!i2400mu->notif_urb) { - ret = -ENOMEM; - goto error_alloc_urb; - } - epd = usb_get_epd(i2400mu->usb_iface, - i2400mu->endpoint_cfg.notification); - usb_pipe = usb_rcvintpipe(i2400mu->usb_dev, epd->bEndpointAddress); - usb_fill_int_urb(i2400mu->notif_urb, i2400mu->usb_dev, usb_pipe, - buf, I2400MU_MAX_NOTIFICATION_LEN, - i2400mu_notification_cb, i2400mu, epd->bInterval); - ret = usb_submit_urb(i2400mu->notif_urb, GFP_KERNEL); - if (ret != 0) { - dev_err(dev, "notification: cannot submit URB: %d\n", ret); - goto error_submit; - } - d_fnend(4, dev, "(i2400m %p) = %d\n", i2400mu, ret); - return ret; - -error_submit: - usb_free_urb(i2400mu->notif_urb); -error_alloc_urb: - kfree(buf); -error_buf_alloc: - d_fnend(4, dev, "(i2400m %p) = %d\n", i2400mu, ret); - return ret; -} - - -/* - * Tear down of the notification mechanism - * - * @i2400m: device descriptor - * - * Kill the interrupt endpoint urb, free any allocated resources. - * - * We need to check if we have done it before as for example, - * _suspend() call this; if after a suspend() we get a _disconnect() - * (as the case is when hibernating), nothing bad happens. - */ -void i2400mu_notification_release(struct i2400mu *i2400mu) -{ - struct device *dev = &i2400mu->usb_iface->dev; - - d_fnstart(4, dev, "(i2400mu %p)\n", i2400mu); - if (i2400mu->notif_urb != NULL) { - usb_kill_urb(i2400mu->notif_urb); - kfree(i2400mu->notif_urb->transfer_buffer); - usb_free_urb(i2400mu->notif_urb); - i2400mu->notif_urb = NULL; - } - d_fnend(4, dev, "(i2400mu %p)\n", i2400mu); -} diff --git a/drivers/net/wimax/i2400m/usb-rx.c b/drivers/net/wimax/i2400m/usb-rx.c deleted file mode 100644 index 5b64bda7d9e7..000000000000 --- a/drivers/net/wimax/i2400m/usb-rx.c +++ /dev/null @@ -1,462 +0,0 @@ -/* - * Intel Wireless WiMAX Connection 2400m - * USB RX handling - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * Intel Corporation - * Yanir Lubetkin - * - Initial implementation - * Inaky Perez-Gonzalez - * - Use skb_clone(), break up processing in chunks - * - Split transport/device specific - * - Make buffer size dynamic to exert less memory pressure - * - * - * This handles the RX path on USB. - * - * When a notification is received that says 'there is RX data ready', - * we call i2400mu_rx_kick(); that wakes up the RX kthread, which - * reads a buffer from USB and passes it to i2400m_rx() in the generic - * handling code. The RX buffer has a specific format that is - * described in rx.c. - * - * We use a kernel thread in a loop because: - * - * - we want to be able to call the USB power management get/put - * functions (blocking) before each transaction. - * - * - We might get a lot of notifications and we don't want to submit - * a zillion reads; by serializing, we are throttling. - * - * - RX data processing can get heavy enough so that it is not - * appropriate for doing it in the USB callback; thus we run it in a - * process context. - * - * We provide a read buffer of an arbitrary size (short of a page); if - * the callback reports -EOVERFLOW, it means it was too small, so we - * just double the size and retry (being careful to append, as - * sometimes the device provided some data). Every now and then we - * check if the average received size is smaller than half of the - * current buffer size and if so, we halve it. In the end, the size of - * the preallocated buffer should follow the average received - * transaction size, adapting dynamically to it. - * - * ROADMAP - * - * i2400mu_rx_kick() Called from notif.c when we get a - * 'data ready' notification - * i2400mu_rxd() Kernel RX daemon - * i2400mu_rx() Receive USB data - * i2400m_rx() Send data to generic i2400m RX handling - * - * i2400mu_rx_setup() called from i2400mu_bus_dev_start() - * - * i2400mu_rx_release() called from i2400mu_bus_dev_stop() - */ -#include -#include -#include -#include "i2400m-usb.h" - - -#define D_SUBMODULE rx -#include "usb-debug-levels.h" - -/* - * Dynamic RX size - * - * We can't let the rx_size be a multiple of 512 bytes (the RX - * endpoint's max packet size). On some USB host controllers (we - * haven't been able to fully characterize which), if the device is - * about to send (for example) X bytes and we only post a buffer to - * receive n*512, it will fail to mark that as babble (so that - * i2400mu_rx() [case -EOVERFLOW] can resize the buffer and get the - * rest).
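(In sketch form, the sizing rule just described -- a condensed illustration assuming the 512-byte bulk-in max packet size, not the driver's literal helper:)

static size_t rx_size_adjust(size_t rx_size)
{
	const size_t max_pkt_size = 512;	/* bulk-in wMaxPacketSize */

	/* A buffer that is an exact multiple of the max packet size
	 * defeats babble (-EOVERFLOW) detection on some host
	 * controllers, so knock it off the multiple. */
	if (rx_size % max_pkt_size == 0)
		rx_size -= 8;
	return rx_size;
}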
- * - * So on growing or shrinking, if it is a multiple of the - * maxpacketsize, we remove some (instead of increasing some, so in a - * buddy allocator we try to waste less space). - * - * Note we also need a hook for this on i2400mu_rx() -- when we do the - * first read, we are sure we won't hit this spot because - * i2400mu->rx_size has been set properly. However, if we have to - * double because of -EOVERFLOW, when we launch the read to get the - * rest of the data, we *have* to make sure that also is not a - * multiple of the max_pkt_size. - */ - -static -size_t i2400mu_rx_size_grow(struct i2400mu *i2400mu) -{ - struct device *dev = &i2400mu->usb_iface->dev; - size_t rx_size; - const size_t max_pkt_size = 512; - - rx_size = 2 * i2400mu->rx_size; - if (rx_size % max_pkt_size == 0) { - rx_size -= 8; - d_printf(1, dev, - "RX: expected size grew to %zu [adjusted -8] " - "from %zu\n", - rx_size, i2400mu->rx_size); - } else - d_printf(1, dev, - "RX: expected size grew to %zu from %zu\n", - rx_size, i2400mu->rx_size); - return rx_size; -} - - -static -void i2400mu_rx_size_maybe_shrink(struct i2400mu *i2400mu) -{ - const size_t max_pkt_size = 512; - struct device *dev = &i2400mu->usb_iface->dev; - - if (unlikely(i2400mu->rx_size_cnt >= 100 - && i2400mu->rx_size_auto_shrink)) { - size_t avg_rx_size = - i2400mu->rx_size_acc / i2400mu->rx_size_cnt; - size_t new_rx_size = i2400mu->rx_size / 2; - if (avg_rx_size < new_rx_size) { - if (new_rx_size % max_pkt_size == 0) { - new_rx_size -= 8; - d_printf(1, dev, - "RX: expected size shrank to %zu " - "[adjusted -8] from %zu\n", - new_rx_size, i2400mu->rx_size); - } else - d_printf(1, dev, - "RX: expected size shrank to %zu " - "from %zu\n", - new_rx_size, i2400mu->rx_size); - i2400mu->rx_size = new_rx_size; - i2400mu->rx_size_cnt = 0; - i2400mu->rx_size_acc = i2400mu->rx_size; - } - } -} - -/* - * Receive a message with payloads from the USB bus into an skb - * - * @i2400mu: USB device descriptor - * @rx_skb: skb where to place the received message - * - * Deals with all the USB-specifics of receiving, dynamically - * increasing the buffer size if so needed. Returns the payload in the - * skb, ready to process. On a zero-length packet, we retry. - * - * On soft USB errors, we retry (until they become too frequent and - * then are promoted to hard); on hard USB errors, we reset the - * device. On other errors (e.g. skb reallocation failure), we just - * drop the buffer and hope the next invocation solves it. - * - * Returns: pointer to the skb if ok, ERR_PTR on error. - * NOTE: this function might realloc the skb (if it is too small), - * so always update with the one returned. - * ERR_PTR() is < 0 on error. - * Will return NULL if it cannot reallocate -- this can be - * considered a transient retryable error. - */ -static -struct sk_buff *i2400mu_rx(struct i2400mu *i2400mu, struct sk_buff *rx_skb) -{ - int result = 0; - struct device *dev = &i2400mu->usb_iface->dev; - int usb_pipe, read_size, rx_size, do_autopm; - struct usb_endpoint_descriptor *epd; - const size_t max_pkt_size = 512; - - d_fnstart(4, dev, "(i2400mu %p)\n", i2400mu); - do_autopm = atomic_read(&i2400mu->do_autopm); - result = do_autopm ?
- usb_autopm_get_interface(i2400mu->usb_iface) : 0; - if (result < 0) { - dev_err(dev, "RX: can't get autopm: %d\n", result); - do_autopm = 0; - } - epd = usb_get_epd(i2400mu->usb_iface, i2400mu->endpoint_cfg.bulk_in); - usb_pipe = usb_rcvbulkpipe(i2400mu->usb_dev, epd->bEndpointAddress); -retry: - rx_size = skb_end_pointer(rx_skb) - rx_skb->data - rx_skb->len; - if (unlikely(rx_size % max_pkt_size == 0)) { - rx_size -= 8; - d_printf(1, dev, "RX: rx_size adapted to %d [-8]\n", rx_size); - } - result = usb_bulk_msg( - i2400mu->usb_dev, usb_pipe, rx_skb->data + rx_skb->len, - rx_size, &read_size, 200); - usb_mark_last_busy(i2400mu->usb_dev); - switch (result) { - case 0: - if (read_size == 0) - goto retry; /* ZLP, just resubmit */ - skb_put(rx_skb, read_size); - break; - case -EPIPE: - /* - * Stall -- maybe the device is choking with our - * requests. Clear it and give it some time. If they - * happen too often, it might be another symptom, so we - * reset. - * - * No error handling for usb_clear_halt(); if it - * works, the retry works; if it fails, this switch - * does the error handling for us. - */ - if (edc_inc(&i2400mu->urb_edc, - 10 * EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) { - dev_err(dev, "RX: too many stalls in " - "URB; resetting device\n"); - goto do_reset; - } - usb_clear_halt(i2400mu->usb_dev, usb_pipe); - msleep(10); /* give the device some time */ - goto retry; - case -EINVAL: /* while removing driver */ - case -ENODEV: /* dev disconnect ... */ - case -ENOENT: /* just ignore it */ - case -ESHUTDOWN: - case -ECONNRESET: - break; - case -EOVERFLOW: { /* too small, reallocate */ - struct sk_buff *new_skb; - rx_size = i2400mu_rx_size_grow(i2400mu); - if (rx_size <= (1 << 16)) /* cap it */ - i2400mu->rx_size = rx_size; - else if (printk_ratelimit()) { - dev_err(dev, "BUG? rx_size up to %d\n", rx_size); - result = -EINVAL; - goto out; - } - skb_put(rx_skb, read_size); - new_skb = skb_copy_expand(rx_skb, 0, rx_size - rx_skb->len, - GFP_KERNEL); - if (new_skb == NULL) { - kfree_skb(rx_skb); - rx_skb = NULL; - goto out; /* drop it...*/ - } - kfree_skb(rx_skb); - rx_skb = new_skb; - i2400mu->rx_size_cnt = 0; - i2400mu->rx_size_acc = i2400mu->rx_size; - d_printf(1, dev, "RX: size changed to %d, received %d, " - "copied %d, capacity %ld\n", - rx_size, read_size, rx_skb->len, - (long) skb_end_offset(new_skb)); - goto retry; - } - /* In most cases, it happens due to the hardware scheduling a - * read when there was no data - unfortunately, we have no way - * to tell this timeout from a USB timeout. So we just ignore - * it. */ - case -ETIMEDOUT: - dev_err(dev, "RX: timeout: %d\n", result); - result = 0; - break; - default: /* Any error */ - if (edc_inc(&i2400mu->urb_edc, - EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) - goto error_reset; - dev_err(dev, "RX: error receiving URB: %d, retrying\n", result); - goto retry; - } -out: - if (do_autopm) - usb_autopm_put_interface(i2400mu->usb_iface); - d_fnend(4, dev, "(i2400mu %p) = %p\n", i2400mu, rx_skb); - return rx_skb; - -error_reset: - dev_err(dev, "RX: maximum errors in URB exceeded; " - "resetting device\n"); -do_reset: - usb_queue_reset_device(i2400mu->usb_iface); - rx_skb = ERR_PTR(result); - goto out; -} - - -/* - * Kernel thread for USB reception of data - * - * This thread waits for a kick; once kicked, it will allocate an skb - * and receive a single message into it from USB (using - * i2400mu_rx()). Once received, it is passed to the generic i2400m RX - * code for processing.
- * - * When done processing, it runs some dirty statistics to check if - * the average size of the last 100 received messages is smaller than - * half of the current RX buffer size. In that case, the RX buffer - * size is halved. This helps lower the pressure on the memory - * allocator. - * - * Hard errors force the thread to exit. - */ -static -int i2400mu_rxd(void *_i2400mu) -{ - int result = 0; - struct i2400mu *i2400mu = _i2400mu; - struct i2400m *i2400m = &i2400mu->i2400m; - struct device *dev = &i2400mu->usb_iface->dev; - struct net_device *net_dev = i2400m->wimax_dev.net_dev; - size_t pending; - int rx_size; - struct sk_buff *rx_skb; - unsigned long flags; - - d_fnstart(4, dev, "(i2400mu %p)\n", i2400mu); - spin_lock_irqsave(&i2400m->rx_lock, flags); - BUG_ON(i2400mu->rx_kthread != NULL); - i2400mu->rx_kthread = current; - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - while (1) { - d_printf(2, dev, "RX: waiting for messages\n"); - pending = 0; - wait_event_interruptible( - i2400mu->rx_wq, - (kthread_should_stop() /* check this first! */ - || (pending = atomic_read(&i2400mu->rx_pending_count))) - ); - if (kthread_should_stop()) - break; - if (pending == 0) - continue; - rx_size = i2400mu->rx_size; - d_printf(2, dev, "RX: reading up to %d bytes\n", rx_size); - rx_skb = __netdev_alloc_skb(net_dev, rx_size, GFP_KERNEL); - if (rx_skb == NULL) { - dev_err(dev, "RX: can't allocate skb [%d bytes]\n", - rx_size); - msleep(50); /* give it some time? */ - continue; - } - - /* Receive the message with the payloads */ - rx_skb = i2400mu_rx(i2400mu, rx_skb); - result = PTR_ERR(rx_skb); - if (IS_ERR(rx_skb)) - goto out; - atomic_dec(&i2400mu->rx_pending_count); - if (rx_skb == NULL || rx_skb->len == 0) { - /* some "ignorable" condition */ - kfree_skb(rx_skb); - continue; - } - - /* Deliver the message to the generic i2400m code */ - i2400mu->rx_size_cnt++; - i2400mu->rx_size_acc += rx_skb->len; - result = i2400m_rx(i2400m, rx_skb); - if (result == -EIO - && edc_inc(&i2400mu->urb_edc, - EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) { - goto error_reset; - } - - /* Maybe adjust RX buffer size */ - i2400mu_rx_size_maybe_shrink(i2400mu); - } - result = 0; -out: - spin_lock_irqsave(&i2400m->rx_lock, flags); - i2400mu->rx_kthread = NULL; - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - d_fnend(4, dev, "(i2400mu %p) = %d\n", i2400mu, result); - return result; - -error_reset: - dev_err(dev, "RX: maximum errors in received buffer exceeded; " - "resetting device\n"); - usb_queue_reset_device(i2400mu->usb_iface); - goto out; -} - - -/* - * Start reading from the device - * - * @i2400mu: device instance - * - * Notify the RX thread that there is data pending.
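(A minimal sketch of that kick/wait handshake, with names simplified from the surrounding code: the notification path bumps a pending counter and wakes the RX kthread, which sleeps until there is work or it is told to stop.)

static void rx_kick_sketch(atomic_t *pending, wait_queue_head_t *wq)
{
	atomic_inc(pending);	/* one more 'data ready' event */
	wake_up_all(wq);	/* wake the RX kthread */
}

/* consumer side, inside the kthread loop:
 *
 *	wait_event_interruptible(*wq,
 *				 kthread_should_stop()
 *				 || atomic_read(pending));
 */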
- */ -void i2400mu_rx_kick(struct i2400mu *i2400mu) -{ - struct i2400m *i2400m = &i2400mu->i2400m; - struct device *dev = &i2400mu->usb_iface->dev; - - d_fnstart(3, dev, "(i2400mu %p)\n", i2400m); - atomic_inc(&i2400mu->rx_pending_count); - wake_up_all(&i2400mu->rx_wq); - d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); -} - - -int i2400mu_rx_setup(struct i2400mu *i2400mu) -{ - int result = 0; - struct i2400m *i2400m = &i2400mu->i2400m; - struct device *dev = &i2400mu->usb_iface->dev; - struct wimax_dev *wimax_dev = &i2400m->wimax_dev; - struct task_struct *kthread; - - kthread = kthread_run(i2400mu_rxd, i2400mu, "%s-rx", - wimax_dev->name); - /* the kthread function sets i2400mu->rx_thread */ - if (IS_ERR(kthread)) { - result = PTR_ERR(kthread); - dev_err(dev, "RX: cannot start thread: %d\n", result); - } - return result; -} - - -void i2400mu_rx_release(struct i2400mu *i2400mu) -{ - unsigned long flags; - struct i2400m *i2400m = &i2400mu->i2400m; - struct device *dev = i2400m_dev(i2400m); - struct task_struct *kthread; - - spin_lock_irqsave(&i2400m->rx_lock, flags); - kthread = i2400mu->rx_kthread; - i2400mu->rx_kthread = NULL; - spin_unlock_irqrestore(&i2400m->rx_lock, flags); - if (kthread) - kthread_stop(kthread); - else - d_printf(1, dev, "RX: kthread had already exited\n"); -} - diff --git a/drivers/net/wimax/i2400m/usb-tx.c b/drivers/net/wimax/i2400m/usb-tx.c deleted file mode 100644 index 3ba9d70cca1b..000000000000 --- a/drivers/net/wimax/i2400m/usb-tx.c +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Intel Wireless WiMAX Connection 2400m - * USB specific TX handling - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * Intel Corporation - * Yanir Lubetkin - * - Initial implementation - * Inaky Perez-Gonzalez - * - Split transport/device specific - * - * - * Takes the TX messages in the i2400m's driver TX FIFO and sends them - * to the device until there are no more. - * - * If we fail sending the message, we just drop it. 
There isn't much - * we can do at this point. We could also retry, but the USB stack has - * already retried and still failed, so there is not much of a - * point. Besides, most of the traffic is network traffic, which has - * its own recovery methods for dropped packets. - * - * For sending we just obtain a FIFO buffer to send, send it to the - * USB bulk out, tell the TX FIFO code we have sent it; query for - * another one, etc... until done. - * - * We use a thread so we can call usb_autopm_get_interface() and - * usb_autopm_put_interface() for each transaction; this way when the - * device goes idle, it will suspend. It also has less overhead than a - * dedicated workqueue, as it is being used for a single task. - * - * ROADMAP - * - * i2400mu_tx_setup() - * i2400mu_tx_release() - * - * i2400mu_bus_tx_kick() - Called by the tx.c code when there - * is new data in the FIFO. - * i2400mu_txd() - * i2400m_tx_msg_get() - * i2400m_tx_msg_sent() - */ -#include "i2400m-usb.h" - - -#define D_SUBMODULE tx -#include "usb-debug-levels.h" - - -/* - * Get the next TX message in the TX FIFO and send it to the device - * - * Note that any iteration consumes a message to be sent, no matter if - * it succeeds or fails (we have no real way to retry or complain). - * - * Return: 0 if ok, < 0 errno code on hard error. - */ -static -int i2400mu_tx(struct i2400mu *i2400mu, struct i2400m_msg_hdr *tx_msg, - size_t tx_msg_size) -{ - int result = 0; - struct i2400m *i2400m = &i2400mu->i2400m; - struct device *dev = &i2400mu->usb_iface->dev; - int usb_pipe, sent_size, do_autopm; - struct usb_endpoint_descriptor *epd; - - d_fnstart(4, dev, "(i2400mu %p)\n", i2400mu); - do_autopm = atomic_read(&i2400mu->do_autopm); - result = do_autopm ? - usb_autopm_get_interface(i2400mu->usb_iface) : 0; - if (result < 0) { - dev_err(dev, "TX: can't get autopm: %d\n", result); - do_autopm = 0; - } - epd = usb_get_epd(i2400mu->usb_iface, i2400mu->endpoint_cfg.bulk_out); - usb_pipe = usb_sndbulkpipe(i2400mu->usb_dev, epd->bEndpointAddress); -retry: - result = usb_bulk_msg(i2400mu->usb_dev, usb_pipe, - tx_msg, tx_msg_size, &sent_size, 200); - usb_mark_last_busy(i2400mu->usb_dev); - switch (result) { - case 0: - if (sent_size != tx_msg_size) { /* Too short? drop it */ - dev_err(dev, "TX: short write (%d B vs %zu " - "expected)\n", sent_size, tx_msg_size); - result = -EIO; - } - break; - case -EPIPE: - /* - * Stall -- maybe the device is choking with our - * requests. Clear it and give it some time. If they - * happen too often, it might be another symptom, so we - * reset. - * - * No error handling for usb_clear_halt(); if it - * works, the retry works; if it fails, this switch - * does the error handling for us. - */ - if (edc_inc(&i2400mu->urb_edc, - 10 * EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) { - dev_err(dev, "TX: too many stalls in " - "URB; resetting device\n"); - usb_queue_reset_device(i2400mu->usb_iface); - } else { - usb_clear_halt(i2400mu->usb_dev, usb_pipe); - msleep(10); /* give the device some time */ - goto retry; - } - fallthrough; - case -EINVAL: /* while removing driver */ - case -ENODEV: /* dev disconnect ... */ - case -ENOENT: /* just ignore it */ - case -ESHUTDOWN: /* and exit */ - case -ECONNRESET: - result = -ESHUTDOWN; - break; - default: /* Some error? */ - if (edc_inc(&i2400mu->urb_edc, - EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) { - dev_err(dev, "TX: maximum errors in URB " - "exceeded; resetting device\n"); - usb_queue_reset_device(i2400mu->usb_iface); - } else { - dev_err(dev, "TX: cannot send URB; retrying. 
" - "tx_msg @%zu %zu B [%d sent]: %d\n", - (void *) tx_msg - i2400m->tx_buf, - tx_msg_size, sent_size, result); - goto retry; - } - } - if (do_autopm) - usb_autopm_put_interface(i2400mu->usb_iface); - d_fnend(4, dev, "(i2400mu %p) = result\n", i2400mu); - return result; -} - - -/* - * Get the next TX message in the TX FIFO and send it to the device - * - * Note we exit the loop if i2400mu_tx() fails; that function only - * fails on hard error (failing to tx a buffer not being one of them, - * see its doc). - * - * Return: 0 - */ -static -int i2400mu_txd(void *_i2400mu) -{ - struct i2400mu *i2400mu = _i2400mu; - struct i2400m *i2400m = &i2400mu->i2400m; - struct device *dev = &i2400mu->usb_iface->dev; - struct i2400m_msg_hdr *tx_msg; - size_t tx_msg_size; - unsigned long flags; - - d_fnstart(4, dev, "(i2400mu %p)\n", i2400mu); - - spin_lock_irqsave(&i2400m->tx_lock, flags); - BUG_ON(i2400mu->tx_kthread != NULL); - i2400mu->tx_kthread = current; - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - - while (1) { - d_printf(2, dev, "TX: waiting for messages\n"); - tx_msg = NULL; - wait_event_interruptible( - i2400mu->tx_wq, - (kthread_should_stop() /* check this first! */ - || (tx_msg = i2400m_tx_msg_get(i2400m, &tx_msg_size))) - ); - if (kthread_should_stop()) - break; - WARN_ON(tx_msg == NULL); /* should not happen...*/ - d_printf(2, dev, "TX: submitting %zu bytes\n", tx_msg_size); - d_dump(5, dev, tx_msg, tx_msg_size); - /* Yeah, we ignore errors ... not much we can do */ - i2400mu_tx(i2400mu, tx_msg, tx_msg_size); - i2400m_tx_msg_sent(i2400m); /* ack it, advance the FIFO */ - } - - spin_lock_irqsave(&i2400m->tx_lock, flags); - i2400mu->tx_kthread = NULL; - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - - d_fnend(4, dev, "(i2400mu %p)\n", i2400mu); - return 0; -} - - -/* - * i2400m TX engine notifies us that there is data in the FIFO ready - * for TX - * - * If there is a URB in flight, don't do anything; when it finishes, - * it will see there is data in the FIFO and send it. Else, just - * submit a write. 
- */ -void i2400mu_bus_tx_kick(struct i2400m *i2400m) -{ - struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); - struct device *dev = &i2400mu->usb_iface->dev; - - d_fnstart(3, dev, "(i2400m %p)\n", i2400m); - wake_up_all(&i2400mu->tx_wq); - d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); -} - - -int i2400mu_tx_setup(struct i2400mu *i2400mu) -{ - int result = 0; - struct i2400m *i2400m = &i2400mu->i2400m; - struct device *dev = &i2400mu->usb_iface->dev; - struct wimax_dev *wimax_dev = &i2400m->wimax_dev; - struct task_struct *kthread; - - kthread = kthread_run(i2400mu_txd, i2400mu, "%s-tx", - wimax_dev->name); - /* the kthread function sets i2400mu->tx_kthread */ - if (IS_ERR(kthread)) { - result = PTR_ERR(kthread); - dev_err(dev, "TX: cannot start thread: %d\n", result); - } - return result; -} - -void i2400mu_tx_release(struct i2400mu *i2400mu) -{ - unsigned long flags; - struct i2400m *i2400m = &i2400mu->i2400m; - struct device *dev = i2400m_dev(i2400m); - struct task_struct *kthread; - - spin_lock_irqsave(&i2400m->tx_lock, flags); - kthread = i2400mu->tx_kthread; - i2400mu->tx_kthread = NULL; - spin_unlock_irqrestore(&i2400m->tx_lock, flags); - if (kthread) - kthread_stop(kthread); - else - d_printf(1, dev, "TX: kthread had already exited\n"); -} diff --git a/drivers/net/wimax/i2400m/usb.c b/drivers/net/wimax/i2400m/usb.c deleted file mode 100644 index b684e97ac976..000000000000 --- a/drivers/net/wimax/i2400m/usb.c +++ /dev/null @@ -1,764 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel Wireless WiMAX Connection 2400m - * Linux driver model glue for USB device, reset & fw upload - * - * Copyright (C) 2007-2008 Intel Corporation - * Inaky Perez-Gonzalez - * Yanir Lubetkin - * - * See i2400m-usb.h for a general description of this driver. - * - * This file implements driver model glue, and hook-ups for the - * generic driver to implement the bus-specific functions (device - * communication setup/tear down, firmware upload and resetting). - * - * ROADMAP - * - * i2400mu_probe() - * alloc_netdev()... - * i2400mu_netdev_setup() - * i2400mu_init() - * i2400m_netdev_setup() - * i2400m_setup()...
- * - * i2400mu_disconnect - * i2400m_release() - * free_netdev() - * - * i2400mu_suspend() - * i2400m_cmd_enter_powersave() - * i2400mu_notification_release() - * - * i2400mu_resume() - * i2400mu_notification_setup() - * - * i2400mu_bus_dev_start() Called by i2400m_dev_start() [who is - * i2400mu_tx_setup() called by i2400m_setup()] - * i2400mu_rx_setup() - * i2400mu_notification_setup() - * - * i2400mu_bus_dev_stop() Called by i2400m_dev_stop() [who is - * i2400mu_notification_release() called by i2400m_release()] - * i2400mu_rx_release() - * i2400mu_tx_release() - * - * i2400mu_bus_reset() Called by i2400m_reset - * __i2400mu_reset() - * __i2400mu_send_barker() - * usb_reset_device() - */ -#include "i2400m-usb.h" -#include -#include -#include -#include - - -#define D_SUBMODULE usb -#include "usb-debug-levels.h" - -static char i2400mu_debug_params[128]; -module_param_string(debug, i2400mu_debug_params, sizeof(i2400mu_debug_params), - 0644); -MODULE_PARM_DESC(debug, - "String of space-separated NAME:VALUE pairs, where NAMEs " - "are the different debug submodules and VALUE are the " - "initial debug value to set."); - -/* Our firmware file name */ -static const char *i2400mu_bus_fw_names_5x50[] = { -#define I2400MU_FW_FILE_NAME_v1_5 "i2400m-fw-usb-1.5.sbcf" - I2400MU_FW_FILE_NAME_v1_5, -#define I2400MU_FW_FILE_NAME_v1_4 "i2400m-fw-usb-1.4.sbcf" - I2400MU_FW_FILE_NAME_v1_4, - NULL, -}; - - -static const char *i2400mu_bus_fw_names_6050[] = { -#define I6050U_FW_FILE_NAME_v1_5 "i6050-fw-usb-1.5.sbcf" - I6050U_FW_FILE_NAME_v1_5, - NULL, -}; - - -static -int i2400mu_bus_dev_start(struct i2400m *i2400m) -{ - int result; - struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); - struct device *dev = &i2400mu->usb_iface->dev; - - d_fnstart(3, dev, "(i2400m %p)\n", i2400m); - result = i2400mu_tx_setup(i2400mu); - if (result < 0) - goto error_usb_tx_setup; - result = i2400mu_rx_setup(i2400mu); - if (result < 0) - goto error_usb_rx_setup; - result = i2400mu_notification_setup(i2400mu); - if (result < 0) - goto error_notif_setup; - d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); - return result; - -error_notif_setup: - i2400mu_rx_release(i2400mu); -error_usb_rx_setup: - i2400mu_tx_release(i2400mu); -error_usb_tx_setup: - d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); - return result; -} - - -static -void i2400mu_bus_dev_stop(struct i2400m *i2400m) -{ - struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); - struct device *dev = &i2400mu->usb_iface->dev; - - d_fnstart(3, dev, "(i2400m %p)\n", i2400m); - i2400mu_notification_release(i2400mu); - i2400mu_rx_release(i2400mu); - i2400mu_tx_release(i2400mu); - d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); -} - - -/* - * Sends a barker buffer to the device - * - * This helper will allocate a kmalloced buffer and use it to transmit - * (then free it). Reason for this is that other arches cannot use - * stack/vmalloc/text areas for DMA transfers. - * - * Error recovery here is simpler: anything is considered a hard error - * and will move the reset code to use a last-resort bus-based reset. 
- */ -static -int __i2400mu_send_barker(struct i2400mu *i2400mu, - const __le32 *barker, - size_t barker_size, - unsigned endpoint) -{ - struct usb_endpoint_descriptor *epd = NULL; - int pipe, actual_len, ret; - struct device *dev = &i2400mu->usb_iface->dev; - void *buffer; - int do_autopm = 1; - - ret = usb_autopm_get_interface(i2400mu->usb_iface); - if (ret < 0) { - dev_err(dev, "RESET: can't get autopm: %d\n", ret); - do_autopm = 0; - } - ret = -ENOMEM; - buffer = kmalloc(barker_size, GFP_KERNEL); - if (buffer == NULL) - goto error_kzalloc; - epd = usb_get_epd(i2400mu->usb_iface, endpoint); - pipe = usb_sndbulkpipe(i2400mu->usb_dev, epd->bEndpointAddress); - memcpy(buffer, barker, barker_size); -retry: - ret = usb_bulk_msg(i2400mu->usb_dev, pipe, buffer, barker_size, - &actual_len, 200); - switch (ret) { - case 0: - if (actual_len != barker_size) { /* Too short? drop it */ - dev_err(dev, "E: %s: short write (%d B vs %zu " - "expected)\n", - __func__, actual_len, barker_size); - ret = -EIO; - } - break; - case -EPIPE: - /* - * Stall -- maybe the device is choking with our - * requests. Clear it and give it some time. If they - * happen to often, it might be another symptom, so we - * reset. - * - * No error handling for usb_clear_halt(0; if it - * works, the retry works; if it fails, this switch - * does the error handling for us. - */ - if (edc_inc(&i2400mu->urb_edc, - 10 * EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) { - dev_err(dev, "E: %s: too many stalls in " - "URB; resetting device\n", __func__); - usb_queue_reset_device(i2400mu->usb_iface); - /* fallthrough */ - } else { - usb_clear_halt(i2400mu->usb_dev, pipe); - msleep(10); /* give the device some time */ - goto retry; - } - fallthrough; - case -EINVAL: /* while removing driver */ - case -ENODEV: /* dev disconnect ... */ - case -ENOENT: /* just ignore it */ - case -ESHUTDOWN: /* and exit */ - case -ECONNRESET: - ret = -ESHUTDOWN; - break; - default: /* Some error? */ - if (edc_inc(&i2400mu->urb_edc, - EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) { - dev_err(dev, "E: %s: maximum errors in URB " - "exceeded; resetting device\n", - __func__); - usb_queue_reset_device(i2400mu->usb_iface); - } else { - dev_warn(dev, "W: %s: cannot send URB: %d\n", - __func__, ret); - goto retry; - } - } - kfree(buffer); -error_kzalloc: - if (do_autopm) - usb_autopm_put_interface(i2400mu->usb_iface); - return ret; -} - - -/* - * Reset a device at different levels (warm, cold or bus) - * - * @i2400m: device descriptor - * @reset_type: soft, warm or bus reset (I2400M_RT_WARM/SOFT/BUS) - * - * Warm and cold resets get a USB reset if they fail. - * - * Warm reset: - * - * The device will be fully reset internally, but won't be - * disconnected from the USB bus (so no reenumeration will - * happen). Firmware upload will be necessary. - * - * The device will send a reboot barker in the notification endpoint - * that will trigger the driver to reinitialize the state - * automatically from notif.c:i2400m_notification_grok() into - * i2400m_dev_bootstrap_delayed(). - * - * Cold and bus (USB) reset: - * - * The device will be fully reset internally, disconnected from the - * USB bus an a reenumeration will happen. Firmware upload will be - * necessary. Thus, we don't do any locking or struct - * reinitialization, as we are going to be fully disconnected and - * reenumerated. - * - * Note we need to return -ENODEV if a warm reset was requested and we - * had to resort to a bus reset. See i2400m_op_reset(), wimax_reset() - * and wimax_dev->op_reset. 
- * - * WARNING: no driver state saved/fixed - */ -static -int i2400mu_bus_reset(struct i2400m *i2400m, enum i2400m_reset_type rt) -{ - int result; - struct i2400mu *i2400mu = - container_of(i2400m, struct i2400mu, i2400m); - struct device *dev = i2400m_dev(i2400m); - static const __le32 i2400m_WARM_BOOT_BARKER[4] = { - cpu_to_le32(I2400M_WARM_RESET_BARKER), - cpu_to_le32(I2400M_WARM_RESET_BARKER), - cpu_to_le32(I2400M_WARM_RESET_BARKER), - cpu_to_le32(I2400M_WARM_RESET_BARKER), - }; - static const __le32 i2400m_COLD_BOOT_BARKER[4] = { - cpu_to_le32(I2400M_COLD_RESET_BARKER), - cpu_to_le32(I2400M_COLD_RESET_BARKER), - cpu_to_le32(I2400M_COLD_RESET_BARKER), - cpu_to_le32(I2400M_COLD_RESET_BARKER), - }; - - d_fnstart(3, dev, "(i2400m %p rt %u)\n", i2400m, rt); - if (rt == I2400M_RT_WARM) - result = __i2400mu_send_barker( - i2400mu, i2400m_WARM_BOOT_BARKER, - sizeof(i2400m_WARM_BOOT_BARKER), - i2400mu->endpoint_cfg.bulk_out); - else if (rt == I2400M_RT_COLD) - result = __i2400mu_send_barker( - i2400mu, i2400m_COLD_BOOT_BARKER, - sizeof(i2400m_COLD_BOOT_BARKER), - i2400mu->endpoint_cfg.reset_cold); - else if (rt == I2400M_RT_BUS) { - result = usb_reset_device(i2400mu->usb_dev); - switch (result) { - case 0: - case -EINVAL: /* device is gone */ - case -ENODEV: - case -ENOENT: - case -ESHUTDOWN: - result = 0; - break; /* We assume the device is disconnected */ - default: - dev_err(dev, "USB reset failed (%d), giving up!\n", - result); - } - } else { - result = -EINVAL; /* shut gcc up in certain arches */ - BUG(); - } - if (result < 0 - && result != -EINVAL /* device is gone */ - && rt != I2400M_RT_BUS) { - /* - * Things failed -- resort to lower level reset, that - * we queue in another context; the reason for this is - * that the pre and post reset functionality requires - * the i2400m->init_mutex; RT_WARM and RT_COLD can - * come from areas where i2400m->init_mutex is taken. - */ - dev_err(dev, "%s reset failed (%d); trying USB reset\n", - rt == I2400M_RT_WARM ? "warm" : "cold", result); - usb_queue_reset_device(i2400mu->usb_iface); - result = -ENODEV; - } - d_fnend(3, dev, "(i2400m %p rt %u) = %d\n", i2400m, rt, result); - return result; -} - -static void i2400mu_get_drvinfo(struct net_device *net_dev, - struct ethtool_drvinfo *info) -{ - struct i2400m *i2400m = net_dev_to_i2400m(net_dev); - struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); - struct usb_device *udev = i2400mu->usb_dev; - - strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); - strlcpy(info->fw_version, i2400m->fw_name ? 
: "", - sizeof(info->fw_version)); - usb_make_path(udev, info->bus_info, sizeof(info->bus_info)); -} - -static const struct ethtool_ops i2400mu_ethtool_ops = { - .get_drvinfo = i2400mu_get_drvinfo, - .get_link = ethtool_op_get_link, -}; - -static -void i2400mu_netdev_setup(struct net_device *net_dev) -{ - struct i2400m *i2400m = net_dev_to_i2400m(net_dev); - struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); - i2400mu_init(i2400mu); - i2400m_netdev_setup(net_dev); - net_dev->ethtool_ops = &i2400mu_ethtool_ops; -} - - -/* - * Debug levels control; see debug.h - */ -struct d_level D_LEVEL[] = { - D_SUBMODULE_DEFINE(usb), - D_SUBMODULE_DEFINE(fw), - D_SUBMODULE_DEFINE(notif), - D_SUBMODULE_DEFINE(rx), - D_SUBMODULE_DEFINE(tx), -}; -size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); - -static -void i2400mu_debugfs_add(struct i2400mu *i2400mu) -{ - struct dentry *dentry = i2400mu->i2400m.wimax_dev.debugfs_dentry; - - dentry = debugfs_create_dir("i2400m-usb", dentry); - i2400mu->debugfs_dentry = dentry; - - d_level_register_debugfs("dl_", usb, dentry); - d_level_register_debugfs("dl_", fw, dentry); - d_level_register_debugfs("dl_", notif, dentry); - d_level_register_debugfs("dl_", rx, dentry); - d_level_register_debugfs("dl_", tx, dentry); - - /* Don't touch these if you don't know what you are doing */ - debugfs_create_u8("rx_size_auto_shrink", 0600, dentry, - &i2400mu->rx_size_auto_shrink); - - debugfs_create_size_t("rx_size", 0600, dentry, &i2400mu->rx_size); -} - - -static struct device_type i2400mu_type = { - .name = "wimax", -}; - -/* - * Probe a i2400m interface and register it - * - * @iface: USB interface to link to - * @id: USB class/subclass/protocol id - * @returns: 0 if ok, < 0 errno code on error. - * - * Alloc a net device, initialize the bus-specific details and then - * calls the bus-generic initialization routine. That will register - * the wimax and netdev devices, upload the firmware [using - * _bus_bm_*()], call _bus_dev_start() to finalize the setup of the - * communication with the device and then will start to talk to it to - * finnish setting it up. - */ -static -int i2400mu_probe(struct usb_interface *iface, - const struct usb_device_id *id) -{ - int result; - struct net_device *net_dev; - struct device *dev = &iface->dev; - struct i2400m *i2400m; - struct i2400mu *i2400mu; - struct usb_device *usb_dev = interface_to_usbdev(iface); - - if (iface->cur_altsetting->desc.bNumEndpoints < 4) - return -ENODEV; - - if (usb_dev->speed != USB_SPEED_HIGH) - dev_err(dev, "device not connected as high speed\n"); - - /* Allocate instance [calls i2400m_netdev_setup() on it]. */ - result = -ENOMEM; - net_dev = alloc_netdev(sizeof(*i2400mu), "wmx%d", NET_NAME_UNKNOWN, - i2400mu_netdev_setup); - if (net_dev == NULL) { - dev_err(dev, "no memory for network device instance\n"); - goto error_alloc_netdev; - } - SET_NETDEV_DEV(net_dev, dev); - SET_NETDEV_DEVTYPE(net_dev, &i2400mu_type); - i2400m = net_dev_to_i2400m(net_dev); - i2400mu = container_of(i2400m, struct i2400mu, i2400m); - i2400m->wimax_dev.net_dev = net_dev; - i2400mu->usb_dev = usb_get_dev(usb_dev); - i2400mu->usb_iface = iface; - usb_set_intfdata(iface, i2400mu); - - i2400m->bus_tx_block_size = I2400MU_BLK_SIZE; - /* - * Room required in the Tx queue for USB message to accommodate - * a smallest payload while allocating header space is 16 bytes. - * Adding this room for the new tx message increases the - * possibilities of including any payload with size <= 16 bytes. 
- */ - i2400m->bus_tx_room_min = I2400MU_BLK_SIZE; - i2400m->bus_pl_size_max = I2400MU_PL_SIZE_MAX; - i2400m->bus_setup = NULL; - i2400m->bus_dev_start = i2400mu_bus_dev_start; - i2400m->bus_dev_stop = i2400mu_bus_dev_stop; - i2400m->bus_release = NULL; - i2400m->bus_tx_kick = i2400mu_bus_tx_kick; - i2400m->bus_reset = i2400mu_bus_reset; - i2400m->bus_bm_retries = I2400M_USB_BOOT_RETRIES; - i2400m->bus_bm_cmd_send = i2400mu_bus_bm_cmd_send; - i2400m->bus_bm_wait_for_ack = i2400mu_bus_bm_wait_for_ack; - i2400m->bus_bm_mac_addr_impaired = 0; - - switch (id->idProduct) { - case USB_DEVICE_ID_I6050: - case USB_DEVICE_ID_I6050_2: - case USB_DEVICE_ID_I6150: - case USB_DEVICE_ID_I6150_2: - case USB_DEVICE_ID_I6150_3: - case USB_DEVICE_ID_I6250: - i2400mu->i6050 = 1; - break; - default: - break; - } - - if (i2400mu->i6050) { - i2400m->bus_fw_names = i2400mu_bus_fw_names_6050; - i2400mu->endpoint_cfg.bulk_out = 0; - i2400mu->endpoint_cfg.notification = 3; - i2400mu->endpoint_cfg.reset_cold = 2; - i2400mu->endpoint_cfg.bulk_in = 1; - } else { - i2400m->bus_fw_names = i2400mu_bus_fw_names_5x50; - i2400mu->endpoint_cfg.bulk_out = 0; - i2400mu->endpoint_cfg.notification = 1; - i2400mu->endpoint_cfg.reset_cold = 2; - i2400mu->endpoint_cfg.bulk_in = 3; - } -#ifdef CONFIG_PM - iface->needs_remote_wakeup = 1; /* autosuspend (15s delay) */ - device_init_wakeup(dev, 1); - pm_runtime_set_autosuspend_delay(&usb_dev->dev, 15000); - usb_enable_autosuspend(usb_dev); -#endif - - result = i2400m_setup(i2400m, I2400M_BRI_MAC_REINIT); - if (result < 0) { - dev_err(dev, "cannot setup device: %d\n", result); - goto error_setup; - } - i2400mu_debugfs_add(i2400mu); - return 0; - -error_setup: - usb_set_intfdata(iface, NULL); - usb_put_dev(i2400mu->usb_dev); - free_netdev(net_dev); -error_alloc_netdev: - return result; -} - - -/* - * Disconnect a i2400m from the system. - * - * i2400m_stop() has been called before, so al the rx and tx contexts - * have been taken down already. Make sure the queue is stopped, - * unregister netdev and i2400m, free and kill. - */ -static -void i2400mu_disconnect(struct usb_interface *iface) -{ - struct i2400mu *i2400mu = usb_get_intfdata(iface); - struct i2400m *i2400m = &i2400mu->i2400m; - struct net_device *net_dev = i2400m->wimax_dev.net_dev; - struct device *dev = &iface->dev; - - d_fnstart(3, dev, "(iface %p i2400m %p)\n", iface, i2400m); - - debugfs_remove_recursive(i2400mu->debugfs_dentry); - i2400m_release(i2400m); - usb_set_intfdata(iface, NULL); - usb_put_dev(i2400mu->usb_dev); - free_netdev(net_dev); - d_fnend(3, dev, "(iface %p i2400m %p) = void\n", iface, i2400m); -} - - -/* - * Get the device ready for USB port or system standby and hibernation - * - * USB port and system standby are handled the same. - * - * When the system hibernates, the USB device is powered down and then - * up, so we don't really have to do much here, as it will be seen as - * a reconnect. Still for simplicity we consider this case the same as - * suspend, so that the device has a chance to do notify the base - * station (if connected). - * - * So at the end, the three cases require common handling. - * - * If at the time of this call the device's firmware is not loaded, - * nothing has to be done. Note we can be "loose" about not reading - * i2400m->updown under i2400m->init_mutex. If it happens to change - * inmediately, other parts of the call flow will fail and effectively - * catch it. 
- * - * If the firmware is loaded, we need to: - * - * - tell the device to go into host interface power save mode, wait - * for it to ack - * - * This is quite more interesting than it is; we need to execute a - * command, but this time, we don't want the code in usb-{tx,rx}.c - * to call the usb_autopm_get/put_interface() barriers as it'd - * deadlock, so we need to decrement i2400mu->do_autopm, that acts - * as a poor man's semaphore. Ugly, but it works. - * - * As well, the device might refuse going to sleep for whichever - * reason. In this case we just fail. For system suspend/hibernate, - * we *can't* fail. We check PMSG_IS_AUTO to see if the - * suspend call comes from the USB stack or from the system and act - * in consequence. - * - * - stop the notification endpoint polling - */ -static -int i2400mu_suspend(struct usb_interface *iface, pm_message_t pm_msg) -{ - int result = 0; - struct device *dev = &iface->dev; - struct i2400mu *i2400mu = usb_get_intfdata(iface); - unsigned is_autosuspend = 0; - struct i2400m *i2400m = &i2400mu->i2400m; - -#ifdef CONFIG_PM - if (PMSG_IS_AUTO(pm_msg)) - is_autosuspend = 1; -#endif - - d_fnstart(3, dev, "(iface %p pm_msg %u)\n", iface, pm_msg.event); - rmb(); /* see i2400m->updown's documentation */ - if (i2400m->updown == 0) - goto no_firmware; - if (i2400m->state == I2400M_SS_DATA_PATH_CONNECTED && is_autosuspend) { - /* ugh -- the device is connected and this suspend - * request is an autosuspend one (not a system standby - * / hibernate). - * - * The only way the device can go to standby is if the - * link with the base station is in IDLE mode; that - * were the case, we'd be in status - * I2400M_SS_CONNECTED_IDLE. But we are not. - * - * If we *tell* him to go power save now, it'll reset - * as a precautionary measure, so if this is an - * autosuspend thing, say no and it'll come back - * later, when the link is IDLE - */ - result = -EBADF; - d_printf(1, dev, "fw up, link up, not-idle, autosuspend: " - "not entering powersave\n"); - goto error_not_now; - } - d_printf(1, dev, "fw up: entering powersave\n"); - atomic_dec(&i2400mu->do_autopm); - result = i2400m_cmd_enter_powersave(i2400m); - atomic_inc(&i2400mu->do_autopm); - if (result < 0 && !is_autosuspend) { - /* System suspend, can't fail */ - dev_err(dev, "failed to suspend, will reset on resume\n"); - result = 0; - } - if (result < 0) - goto error_enter_powersave; - i2400mu_notification_release(i2400mu); - d_printf(1, dev, "powersave requested\n"); -error_enter_powersave: -error_not_now: -no_firmware: - d_fnend(3, dev, "(iface %p pm_msg %u) = %d\n", - iface, pm_msg.event, result); - return result; -} - - -static -int i2400mu_resume(struct usb_interface *iface) -{ - int ret = 0; - struct device *dev = &iface->dev; - struct i2400mu *i2400mu = usb_get_intfdata(iface); - struct i2400m *i2400m = &i2400mu->i2400m; - - d_fnstart(3, dev, "(iface %p)\n", iface); - rmb(); /* see i2400m->updown's documentation */ - if (i2400m->updown == 0) { - d_printf(1, dev, "fw was down, no resume needed\n"); - goto out; - } - d_printf(1, dev, "fw was up, resuming\n"); - i2400mu_notification_setup(i2400mu); - /* USB has flow control, so we don't need to give it time to - * come back; otherwise, we'd use something like a get-state - * command... 
*/ -out: - d_fnend(3, dev, "(iface %p) = %d\n", iface, ret); - return ret; -} - - -static -int i2400mu_reset_resume(struct usb_interface *iface) -{ - int result; - struct device *dev = &iface->dev; - struct i2400mu *i2400mu = usb_get_intfdata(iface); - struct i2400m *i2400m = &i2400mu->i2400m; - - d_fnstart(3, dev, "(iface %p)\n", iface); - result = i2400m_dev_reset_handle(i2400m, "device reset on resume"); - d_fnend(3, dev, "(iface %p) = %d\n", iface, result); - return result < 0 ? result : 0; -} - - -/* - * Another driver or user space is triggering a reset on the device - * which contains the interface passed as an argument. Cease IO and - * save any device state you need to restore. - * - * If you need to allocate memory here, use GFP_NOIO or GFP_ATOMIC, if - * you are in atomic context. - */ -static -int i2400mu_pre_reset(struct usb_interface *iface) -{ - struct i2400mu *i2400mu = usb_get_intfdata(iface); - return i2400m_pre_reset(&i2400mu->i2400m); -} - - -/* - * The reset has completed. Restore any saved device state and begin - * using the device again. - * - * If you need to allocate memory here, use GFP_NOIO or GFP_ATOMIC, if - * you are in atomic context. - */ -static -int i2400mu_post_reset(struct usb_interface *iface) -{ - struct i2400mu *i2400mu = usb_get_intfdata(iface); - return i2400m_post_reset(&i2400mu->i2400m); -} - - -static -struct usb_device_id i2400mu_id_table[] = { - { USB_DEVICE(0x8086, USB_DEVICE_ID_I6050) }, - { USB_DEVICE(0x8086, USB_DEVICE_ID_I6050_2) }, - { USB_DEVICE(0x8087, USB_DEVICE_ID_I6150) }, - { USB_DEVICE(0x8087, USB_DEVICE_ID_I6150_2) }, - { USB_DEVICE(0x8087, USB_DEVICE_ID_I6150_3) }, - { USB_DEVICE(0x8086, USB_DEVICE_ID_I6250) }, - { USB_DEVICE(0x8086, 0x0181) }, - { USB_DEVICE(0x8086, 0x1403) }, - { USB_DEVICE(0x8086, 0x1405) }, - { USB_DEVICE(0x8086, 0x0180) }, - { USB_DEVICE(0x8086, 0x0182) }, - { USB_DEVICE(0x8086, 0x1406) }, - { USB_DEVICE(0x8086, 0x1403) }, - { }, -}; -MODULE_DEVICE_TABLE(usb, i2400mu_id_table); - - -static -struct usb_driver i2400mu_driver = { - .name = KBUILD_MODNAME, - .suspend = i2400mu_suspend, - .resume = i2400mu_resume, - .reset_resume = i2400mu_reset_resume, - .probe = i2400mu_probe, - .disconnect = i2400mu_disconnect, - .pre_reset = i2400mu_pre_reset, - .post_reset = i2400mu_post_reset, - .id_table = i2400mu_id_table, - .supports_autosuspend = 1, -}; - -static -int __init i2400mu_driver_init(void) -{ - d_parse_params(D_LEVEL, D_LEVEL_SIZE, i2400mu_debug_params, - "i2400m_usb.debug"); - return usb_register(&i2400mu_driver); -} -module_init(i2400mu_driver_init); - - -static -void __exit i2400mu_driver_exit(void) -{ - usb_deregister(&i2400mu_driver); -} -module_exit(i2400mu_driver_exit); - -MODULE_AUTHOR("Intel Corporation "); -MODULE_DESCRIPTION("Driver for USB based Intel Wireless WiMAX Connection 2400M " - "(5x50 & 6050)"); -MODULE_LICENSE("GPL"); -MODULE_FIRMWARE(I2400MU_FW_FILE_NAME_v1_5); -MODULE_FIRMWARE(I6050U_FW_FILE_NAME_v1_5); diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 2d0310448eba..443ca3f3cdf0 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -114,6 +114,8 @@ source "drivers/staging/kpc2000/Kconfig" source "drivers/staging/qlge/Kconfig" +source "drivers/staging/wimax/Kconfig" + source "drivers/staging/wfx/Kconfig" source "drivers/staging/hikey9xx/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 757a892ab5b9..dc45128ef525 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -47,5 +47,6 @@ 
 obj-$(CONFIG_XIL_AXIS_FIFO)	+= axis-fifo/
 obj-$(CONFIG_FIELDBUS_DEV)	+= fieldbus/
 obj-$(CONFIG_KPC2000)		+= kpc2000/
 obj-$(CONFIG_QLGE)		+= qlge/
+obj-$(CONFIG_WIMAX)		+= wimax/
 obj-$(CONFIG_WFX)		+= wfx/
 obj-y += hikey9xx/
diff --git a/drivers/staging/wimax/Documentation/i2400m.rst b/drivers/staging/wimax/Documentation/i2400m.rst
new file mode 100644
index 000000000000..194388c0c351
--- /dev/null
+++ b/drivers/staging/wimax/Documentation/i2400m.rst
@@ -0,0 +1,283 @@
+.. include:: <isonum.txt>
+
+====================================================
+Driver for the Intel Wireless WiMAX Connection 2400m
+====================================================
+
+:Copyright: |copy| 2008 Intel Corporation < linux-wimax@intel.com >
+
+   This provides a driver for the Intel Wireless WiMAX Connection 2400m
+   and a basic Linux kernel WiMAX stack.
+
+1. Requirements
+===============
+
+   * Linux installation with Linux kernel 2.6.22 or newer (if building
+     from a separate tree)
+   * Intel i2400m Echo Peak or Baxter Peak; this includes the Intel
+     Wireless WiMAX/WiFi Link 5x50 series.
+   * build tools:
+
+     + Linux kernel development package for the target kernel; to
+       build against your currently running kernel, you need to have
+       the kernel development package corresponding to the running
+       image installed (usually if your kernel is named
+       linux-VERSION, the development package is called
+       linux-dev-VERSION or linux-headers-VERSION).
+     + GNU C Compiler, make
+
+2. Compilation and installation
+===============================
+
+2.1. Compilation of the drivers included in the kernel
+------------------------------------------------------
+
+   Configure the kernel; to enable the WiMAX drivers select Drivers >
+   Networking Drivers > WiMAX device support. Enable all of them as
+   modules (easier).
+
+   If USB or SDIO are not enabled in the kernel configuration, the options
+   to build the i2400m USB or SDIO drivers will not show. Enable said
+   subsystems and go back to the WiMAX menu to enable the drivers.
+
+   Compile and install your kernel as usual.
+
+2.2. Compilation of the drivers distributed as a standalone module
+------------------------------------------------------------------
+
+   To compile::
+
+     $ cd source/directory
+     $ make
+
+   Once built you can load and unload using the provided load.sh script;
+   load.sh will load the modules, load.sh u will unload them.
+
+   To install in the default kernel directories (and enable auto loading
+   when the device is plugged)::
+
+     $ make install
+     $ depmod -a
+
+   If your kernel development files are located in a non-standard
+   directory or if you want to build for a kernel that is not the
+   currently running one, set KDIR to the right location::
+
+     $ make KDIR=/path/to/kernel/dev/tree
+
+   For more information, please contact linux-wimax@intel.com.
+
+3. Installing the firmware
+--------------------------
+
+   The firmware can be obtained from http://linuxwimax.org or might have
+   been supplied with your hardware.
+
+   It has to be installed in the target system::
+
+     $ cp FIRMWAREFILE.sbcf /lib/firmware/i2400m-fw-BUSTYPE-1.3.sbcf
+
+   * NOTE: if your firmware came in an .rpm or .deb file, just install
+     it as normal, with the rpm (rpm -i FIRMWARE.rpm) or dpkg
+     (dpkg -i FIRMWARE.deb) commands. No further action is needed.
+   * BUSTYPE will be usb or sdio, depending on the hardware you have.
+     Each hardware type comes with its own firmware and will not work
+     with other types.
+
+4. Design
+=========
+
+   This package contains two major parts: a WiMAX kernel stack and a
+   driver for the Intel i2400m.
+
+   The WiMAX stack is designed to provide for common WiMAX control
+   services to current and future WiMAX devices from any vendor; please
+   see README.wimax for details.
+
+   The i2400m kernel driver is broken up in two main parts: the bus
+   generic driver and the bus-specific drivers. The bus generic driver
+   forms the driver core and contains no knowledge of the actual method we
+   use to connect to the device. The bus specific drivers are just the
+   glue to connect the bus-generic driver and the device. Currently only
+   USB and SDIO are supported. See drivers/net/wimax/i2400m/i2400m.h for
+   more information.
+
+   The bus generic driver is logically broken up in two parts: OS-glue and
+   hardware-glue. The OS-glue interfaces with Linux. The hardware-glue
+   interfaces with the device using an interface provided by the
+   bus-specific driver. The reason for this breakup is to be able to
+   easily reuse the hardware-glue to write drivers for other OSes; note
+   the hardware glue part is written as a native Linux driver; no
+   abstraction layers are used, so to port to another OS, the Linux kernel
+   API calls should be replaced with the target OS's.
+
+5. Usage
+========
+
+   To load the driver, follow the instructions in the install section;
+   once the driver is loaded, plug in the device (unless it is permanently
+   plugged in). The driver will enumerate the device, upload the firmware
+   and output messages in the kernel log (dmesg, /var/log/messages or
+   /var/log/kern.log) such as::
+
+     ...
+     i2400m_usb 5-4:1.0: firmware interface version 8.0.0
+     i2400m_usb 5-4:1.0: WiMAX interface wmx0 (00:1d:e1:01:94:2c) ready
+
+   At this point the device is ready to work.
+
+   Current versions require the Intel WiMAX Network Service in userspace
+   to make things work. See the network service's README for instructions
+   on how to scan, connect and disconnect.
+
+5.1. Module parameters
+----------------------
+
+   Module parameters can be set at kernel or module load time or by
+   echoing values::
+
+     $ echo VALUE > /sys/module/MODULENAME/parameters/PARAMETERNAME
+
+   To make changes permanent, for example, for the i2400m module, you can
+   also create a file named /etc/modprobe.d/i2400m containing::
+
+     options i2400m idle_mode_disabled=1
+
+   To find which parameters are supported by a module, run::
+
+     $ modinfo path/to/module.ko
+
+   During kernel bootup (if the driver is linked in the kernel), specify
+   the following to the kernel command line::
+
+     i2400m.PARAMETER=VALUE
+
+5.1.1. i2400m: idle_mode_disabled
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The i2400m module supports a parameter to disable idle mode. This
+   parameter, once set, will take effect only when the device is
+   reinitialized by the driver (e.g., following a reset or a reconnect).
+
+5.2. Debug operations: debugfs entries
+--------------------------------------
+
+   The driver will register debugfs entries that allow the user to tweak
+   debug settings. There are three main container directories where
+   entries are placed, which correspond to the three blocks an i2400m
+   WiMAX driver has:
+
+   * /sys/kernel/debug/wimax:DEVNAME/ for the generic WiMAX stack
+     controls
+   * /sys/kernel/debug/wimax:DEVNAME/i2400m for the i2400m generic
+     driver controls
+   * /sys/kernel/debug/wimax:DEVNAME/i2400m-usb (or -sdio) for the
+     bus-specific i2400m-usb or i2400m-sdio controls
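+
+   For example, for a device that registered as wmx0, the three
+   containers would show up as (illustrative listing; assumes debugfs
+   is mounted in the default location)::
+
+     $ ls -d /sys/kernel/debug/wimax:wmx0 \
+             /sys/kernel/debug/wimax:wmx0/i2400m \
+             /sys/kernel/debug/wimax:wmx0/i2400m-usb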
+
+   Of course, if debugfs is mounted in a directory other than
+   /sys/kernel/debug, those paths will change.
+
+5.2.1. Increasing debug output
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The files named *dl_* indicate knobs for controlling the debug output
+   of different submodules::
+
+     # find /sys/kernel/debug/wimax\:wmx0 -name \*dl_\*
+     /sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_tx
+     /sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_rx
+     /sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_notif
+     /sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_fw
+     /sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_usb
+     /sys/kernel/debug/wimax:wmx0/i2400m/dl_tx
+     /sys/kernel/debug/wimax:wmx0/i2400m/dl_rx
+     /sys/kernel/debug/wimax:wmx0/i2400m/dl_rfkill
+     /sys/kernel/debug/wimax:wmx0/i2400m/dl_netdev
+     /sys/kernel/debug/wimax:wmx0/i2400m/dl_fw
+     /sys/kernel/debug/wimax:wmx0/i2400m/dl_debugfs
+     /sys/kernel/debug/wimax:wmx0/i2400m/dl_driver
+     /sys/kernel/debug/wimax:wmx0/i2400m/dl_control
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_stack
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_op_rfkill
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_op_reset
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_op_msg
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_id_table
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_debugfs
+
+   By reading the file you can obtain the current value of said debug
+   level; by writing to it, you can set it.
+
+   To increase the debug level of, for example, the i2400m's generic TX
+   engine, just write::
+
+     $ echo 3 > /sys/kernel/debug/wimax:wmx0/i2400m/dl_tx
+
+   Increasing numbers yield increasing debug information; for details of
+   what is printed and the available levels, check the source. The code
+   uses 0 for disabled and increasing values until 8.
+
+5.2.2. RX and TX statistics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The i2400m/rx_stats and i2400m/tx_stats provide statistics about the
+   data reception/delivery from the device::
+
+     $ cat /sys/kernel/debug/wimax:wmx0/i2400m/rx_stats
+     45 1 3 34 3104 48 480
+
+   The numbers reported are:
+
+   * packets/RX-buffer: total, min, max
+   * RX-buffers: total RX buffers received, accumulated RX buffer size
+     in bytes, min size received, max size received
+
+   Thus, to find the average buffer size received, divide the
+   accumulated RX-buffer size by the total RX-buffers.
+
+   To clear the statistics back to 0, write anything to the rx_stats file::
+
+     $ echo 1 > /sys/kernel/debug/wimax:wmx0/i2400m/rx_stats
+
+   Likewise for TX.
+
+   Note the packets this debug file refers to are not network packets,
+   but packets in the sense of the device-specific protocol for
+   communication to the host. See drivers/net/wimax/i2400m/tx.c.
+
+5.2.3. Tracing messages received from user space
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   To echo messages received from user space into the trace pipe that the
+   i2400m driver creates, set the debug file i2400m/trace_msg_from_user to
+   1::
+
+     $ echo 1 > /sys/kernel/debug/wimax:wmx0/i2400m/trace_msg_from_user
+
+5.2.4. Performing a device reset
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   By writing a 0, a 1 or a 2 to the file
+   /sys/kernel/debug/wimax:wmx0/reset, the driver performs a warm (without
+   disconnecting from the bus), cold (disconnecting from the bus) or bus
+   (bus specific) reset on the device.
+
+5.2.5. Asking the device to enter power saving mode
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   By writing any value to the /sys/kernel/debug/wimax:wmx0/suspend file,
+   the device will attempt to enter power saving mode.
+
+6. Troubleshooting
+==================
+
+6.1. Driver complains about ``i2400m-fw-usb-1.3.sbcf: request failed``
+----------------------------------------------------------------------
+
+   If upon connecting the device, the following is output in the kernel
+   log::
+
+     i2400m_usb 5-4:1.0: fw i2400m-fw-usb-1.3.sbcf: request failed: -2
+
+   This means that the driver cannot locate the firmware file named
+   /lib/firmware/i2400m-fw-usb-1.3.sbcf. Check that the file is present in
+   the right location.
diff --git a/drivers/staging/wimax/Documentation/index.rst b/drivers/staging/wimax/Documentation/index.rst
new file mode 100644
index 000000000000..fdf7c1f99ff5
--- /dev/null
+++ b/drivers/staging/wimax/Documentation/index.rst
@@ -0,0 +1,19 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+WiMAX subsystem
+===============
+
+.. toctree::
+   :maxdepth: 2
+
+   wimax
+
+   i2400m
+
+.. only:: subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/drivers/staging/wimax/Documentation/wimax.rst b/drivers/staging/wimax/Documentation/wimax.rst
new file mode 100644
index 000000000000..817ee8ba2732
--- /dev/null
+++ b/drivers/staging/wimax/Documentation/wimax.rst
@@ -0,0 +1,89 @@
+.. include:: <isonum.txt>
+
+========================
+Linux kernel WiMAX stack
+========================
+
+:Copyright: |copy| 2008 Intel Corporation < linux-wimax@intel.com >
+
+   This provides a basic Linux kernel WiMAX stack to provide a common
+   control API for WiMAX devices, usable from kernel and user space.
+
+1. Design
+=========
+
+   The WiMAX stack is designed to provide for common WiMAX control
+   services to current and future WiMAX devices from any vendor.
+
+   Because currently there is only one and we don't know what would be the
+   common services, the APIs it currently provides are very minimal.
+   However, it is done in such a way that it is easily extensible to
+   accommodate future requirements.
+
+   The stack works by embedding a struct wimax_dev in your device's
+   control structures. This provides a set of callbacks that the WiMAX
+   stack will call in order to implement control operations requested by
+   the user. As well, the stack provides API functions that the driver
+   calls to notify about changes of state in the device.
+
+   The stack exports the API calls needed to control the device to user
+   space using generic netlink as a marshalling mechanism. You can access
+   them using your own code or use the wrappers provided for your
+   convenience in libwimax (in the wimax-tools package).
+
+   For detailed information on the stack, please see
+   include/linux/wimax.h.
+
+2. Usage
+========
+
+   For usage in a driver (registration, API, etc) please refer to the
+   instructions in the header file include/linux/wimax.h.
+
+   When a device is registered with the WiMAX stack, a set of debugfs
+   files will appear in /sys/kernel/debug/wimax:wmxX that you can use
+   to tweak its controls.
+
+2.1. Obtaining debug information: debugfs entries
+-------------------------------------------------
+
+   The WiMAX stack is compiled, by default, with debug messages that can
+   be used to diagnose issues. By default, said messages are disabled.
+
+   The drivers will register debugfs entries that allow the user to tweak
+   debug settings.
+
+   Each driver, when registering with the stack, will cause a debugfs
+   directory named wimax:DEVICENAME to be created; optionally, it might
+   create more subentries below it.
+
+2.1.1. Increasing debug output
+------------------------------
+
+   The files named *dl_* indicate knobs for controlling the debug output
+   of different submodules of the WiMAX stack::
+
+     # find /sys/kernel/debug/wimax\:wmx0 -name \*dl_\*
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_stack
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_op_rfkill
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_op_reset
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_op_msg
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_id_table
+     /sys/kernel/debug/wimax:wmx0/wimax_dl_debugfs
+     /sys/kernel/debug/wimax:wmx0/....   # other driver specific files
+
+   NOTE:
+     Of course, if debugfs is mounted in a directory other than
+     /sys/kernel/debug, those paths will change.
+
+   By reading the file you can obtain the current value of said debug
+   level; by writing to it, you can set it.
+
+   To increase the debug level of, for example, the id-table submodule,
+   just write::
+
+     $ echo 3 > /sys/kernel/debug/wimax:wmx0/wimax_dl_id_table
+
+   Increasing numbers yield increasing debug information; for details of
+   what is printed and the available levels, check the source. The code
+   uses 0 for disabled and increasing values until 8.
diff --git a/drivers/staging/wimax/Kconfig b/drivers/staging/wimax/Kconfig
new file mode 100644
index 000000000000..ded8b70b25ee
--- /dev/null
+++ b/drivers/staging/wimax/Kconfig
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# WiMAX LAN device configuration
+#
+
+menuconfig WIMAX
+	tristate "WiMAX Wireless Broadband support"
+	depends on RFKILL || !RFKILL
+	help
+
+	  Select to configure support for devices that provide
+	  wireless broadband connectivity using the WiMAX protocol
+	  (IEEE 802.16).
+
+	  Please note that most of these devices require signing up
+	  for a service plan with a provider.
+
+	  The different WiMAX drivers can be enabled in the menu entry
+
+	  Device Drivers > Network device support > WiMAX Wireless
+	  Broadband devices
+
+	  If unsure, it is safe to select M (module).
+
+if WIMAX
+
+config WIMAX_DEBUG_LEVEL
+	int "WiMAX debug level"
+	depends on WIMAX
+	default 8
+	help
+
+	  Select the maximum debug verbosity level to be compiled into
+	  the WiMAX stack code.
+
+	  By default, debug messages are disabled at runtime and can
+	  be selectively enabled for different parts of the code using
+	  the sysfs debug-levels file.
+
+	  If set at zero, this will compile out all the debug code.
+
+	  It is recommended that it is left at 8.
+
+source "drivers/staging/wimax/i2400m/Kconfig"
+
+endif
diff --git a/drivers/staging/wimax/Makefile b/drivers/staging/wimax/Makefile
new file mode 100644
index 000000000000..0e3f988656aa
--- /dev/null
+++ b/drivers/staging/wimax/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_WIMAX) += wimax.o
+
+wimax-y :=		\
+	id-table.o	\
+	op-msg.o	\
+	op-reset.o	\
+	op-rfkill.o	\
+	op-state-get.o	\
+	stack.o
+
+wimax-$(CONFIG_DEBUG_FS) += debugfs.o
+
+obj-$(CONFIG_WIMAX_I2400M) += i2400m/
diff --git a/drivers/staging/wimax/TODO b/drivers/staging/wimax/TODO
new file mode 100644
index 000000000000..26e4cb9e9599
--- /dev/null
+++ b/drivers/staging/wimax/TODO
@@ -0,0 +1,18 @@
+There are no known users of this driver as of October 2020, and it will
+be removed unless someone turns out to still need it in future releases.
+
+According to https://en.wikipedia.org/wiki/List_of_WiMAX_networks, there
+have been many public WiMAX networks, but it appears that many of these
+have migrated to LTE or discontinued their service altogether. As most
+PCs and phones lack WiMAX hardware support, the remaining networks tend
+to use standalone routers. These almost certainly run Linux, but not a
+modern kernel or the mainline wimax driver stack.
+
+NetworkManager appears to have dropped userspace support in 2015
+(https://bugzilla.gnome.org/show_bug.cgi?id=747846); the
+www.linuxwimax.org site had already shut down earlier.
+
+WiMAX is apparently still being deployed on airport campus networks
+("AeroMACS"), but in a frequency band that was not supported by the old
+Intel 2400m (used in Sandy Bridge laptops and earlier), which is the
+only driver using the kernel's wimax stack.
diff --git a/drivers/staging/wimax/debug-levels.h b/drivers/staging/wimax/debug-levels.h
new file mode 100644
index 000000000000..b854802d1d00
--- /dev/null
+++ b/drivers/staging/wimax/debug-levels.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Linux WiMAX Stack
+ * Debug levels control file for the wimax module
+ *
+ * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ */
+#ifndef __debug_levels__h__
+#define __debug_levels__h__
+
+/* Maximum compile and run time debug level for all submodules */
+#define D_MODULENAME wimax
+#define D_MASTER CONFIG_WIMAX_DEBUG_LEVEL
+
+#include "linux-wimax-debug.h"
+
+/* List of all the enabled modules */
+enum d_module {
+	D_SUBMODULE_DECLARE(debugfs),
+	D_SUBMODULE_DECLARE(id_table),
+	D_SUBMODULE_DECLARE(op_msg),
+	D_SUBMODULE_DECLARE(op_reset),
+	D_SUBMODULE_DECLARE(op_rfkill),
+	D_SUBMODULE_DECLARE(op_state_get),
+	D_SUBMODULE_DECLARE(stack),
+};
+
+#endif /* #ifndef __debug_levels__h__ */
diff --git a/drivers/staging/wimax/debugfs.c b/drivers/staging/wimax/debugfs.c
new file mode 100644
index 000000000000..e11bff61ffcf
--- /dev/null
+++ b/drivers/staging/wimax/debugfs.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Linux WiMAX
+ * Debugfs support
+ *
+ * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ */
+#include <linux/debugfs.h>
+#include "linux-wimax.h"
+#include "wimax-internal.h"
+
+#define D_SUBMODULE debugfs
+#include "debug-levels.h"
+
+void wimax_debugfs_add(struct wimax_dev *wimax_dev)
+{
+	struct net_device *net_dev = wimax_dev->net_dev;
+	struct dentry *dentry;
+	char buf[128];
+
+	snprintf(buf, sizeof(buf), "wimax:%s", net_dev->name);
+	dentry = debugfs_create_dir(buf, NULL);
+	wimax_dev->debugfs_dentry = dentry;
+
+	d_level_register_debugfs("wimax_dl_", debugfs, dentry);
+	d_level_register_debugfs("wimax_dl_", id_table, dentry);
+	d_level_register_debugfs("wimax_dl_", op_msg, dentry);
+	d_level_register_debugfs("wimax_dl_", op_reset, dentry);
+	d_level_register_debugfs("wimax_dl_", op_rfkill, dentry);
+	d_level_register_debugfs("wimax_dl_", op_state_get, dentry);
+	d_level_register_debugfs("wimax_dl_", stack, dentry);
+}
+
+void wimax_debugfs_rm(struct wimax_dev *wimax_dev)
+{
+	debugfs_remove_recursive(wimax_dev->debugfs_dentry);
+}
diff --git a/drivers/staging/wimax/i2400m/Kconfig b/drivers/staging/wimax/i2400m/Kconfig
new file mode 100644
index 000000000000..843b905a26a3
--- /dev/null
+++ b/drivers/staging/wimax/i2400m/Kconfig
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config WIMAX_I2400M
+	tristate
+	depends on WIMAX
+	select FW_LOADER
+
+comment "Enable USB support to see WiMAX USB drivers"
+	depends on USB = n
+
+config WIMAX_I2400M_USB
+	tristate "Intel Wireless WiMAX Connection 2400 over USB (including 5x50)"
+	depends on WIMAX && USB
+	select WIMAX_I2400M
+	help
+	  Select if you have a device based on the Intel
WiMAX + Connection 2400 over USB (like any of the Intel Wireless + WiMAX/WiFi Link 5x50 series). + + If unsure, it is safe to select M (module). + +config WIMAX_I2400M_DEBUG_LEVEL + int "WiMAX i2400m debug level" + depends on WIMAX_I2400M + default 8 + help + + Select the maximum debug verbosity level to be compiled into + the WiMAX i2400m driver code. + + By default, this is disabled at runtime and can be + selectively enabled at runtime for different parts of the + code using the sysfs debug-levels file. + + If set at zero, this will compile out all the debug code. + + It is recommended that it is left at 8. diff --git a/drivers/staging/wimax/i2400m/Makefile b/drivers/staging/wimax/i2400m/Makefile new file mode 100644 index 000000000000..b1db1eff0648 --- /dev/null +++ b/drivers/staging/wimax/i2400m/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_WIMAX_I2400M) += i2400m.o +obj-$(CONFIG_WIMAX_I2400M_USB) += i2400m-usb.o + +i2400m-y := \ + control.o \ + driver.o \ + fw.o \ + op-rfkill.o \ + sysfs.o \ + netdev.o \ + tx.o \ + rx.o + +i2400m-$(CONFIG_DEBUG_FS) += debugfs.o + +i2400m-usb-y := \ + usb-fw.o \ + usb-notif.o \ + usb-tx.o \ + usb-rx.o \ + usb.o diff --git a/drivers/staging/wimax/i2400m/control.c b/drivers/staging/wimax/i2400m/control.c new file mode 100644 index 000000000000..fe885aa56cf3 --- /dev/null +++ b/drivers/staging/wimax/i2400m/control.c @@ -0,0 +1,1434 @@ +/* + * Intel Wireless WiMAX Connection 2400m + * Miscellaneous control functions for managing the device + * + * + * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * Intel Corporation + * Inaky Perez-Gonzalez + * - Initial implementation + * + * This is a collection of functions used to control the device (plus + * a few helpers). 
+ *
+ * There are utilities for handling TLV buffers, hooks on the device's
+ * reports to act on device changes of state [i2400m_report_hook()],
+ * on acks to commands [i2400m_msg_ack_hook()], a helper for sending
+ * commands to the device and blocking until a reply arrives
+ * [i2400m_msg_to_dev()], a few high level commands for manipulating
+ * the device state, power-saving mode and configuration plus the
+ * routines to setup the device once communication is established with
+ * it [i2400m_dev_initialize()].
+ *
+ * ROADMAP
+ *
+ * i2400m_dev_initialize()       Called by i2400m_dev_start()
+ *   i2400m_set_init_config()
+ *   i2400m_cmd_get_state()
+ * i2400m_dev_shutdown()         Called by i2400m_dev_stop()
+ *   i2400m_reset()
+ *
+ * i2400m_{cmd,get,set}_*()
+ *   i2400m_msg_to_dev()
+ *   i2400m_msg_check_status()
+ *
+ * i2400m_report_hook()          Called on reception of an event
+ *   i2400m_report_state_hook()
+ *     i2400m_tlv_buffer_walk()
+ *       i2400m_tlv_match()
+ *     i2400m_report_tlv_system_state()
+ *     i2400m_report_tlv_rf_switches_status()
+ *     i2400m_report_tlv_media_status()
+ *   i2400m_cmd_enter_powersave()
+ *
+ * i2400m_msg_ack_hook()         Called on reception of a reply to a
+ *                               command, get or set
+ */
+
+#include <stdarg.h>
+#include "i2400m.h"
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "linux-wimax-i2400m.h"
+#include <linux/export.h>
+#include <linux/moduleparam.h>
+
+
+#define D_SUBMODULE control
+#include "debug-levels.h"
+
+static int i2400m_idle_mode_disabled;/* 0 (idle mode enabled) by default */
+module_param_named(idle_mode_disabled, i2400m_idle_mode_disabled, int, 0644);
+MODULE_PARM_DESC(idle_mode_disabled,
+		 "If true, the device will not enable idle mode negotiation "
+		 "with the base station (when connected) to save power.");
+
+/* 0 (power saving enabled) by default */
+static int i2400m_power_save_disabled;
+module_param_named(power_save_disabled, i2400m_power_save_disabled, int, 0644);
+MODULE_PARM_DESC(power_save_disabled,
+		 "If true, the driver will not tell the device to enter "
+		 "power saving mode when it reports it is ready for it. "
+		 "False by default (so the device is told to do power "
+		 "saving).");
+
+static int i2400m_passive_mode;	/* 0 (passive mode disabled) by default */
+module_param_named(passive_mode, i2400m_passive_mode, int, 0644);
+MODULE_PARM_DESC(passive_mode,
+		 "If true, the driver will not do any device setup "
+		 "and leave it up to user space, which must set it "
+		 "up properly.");
+
+
+/*
+ * Return if a TLV is of a given type and size
+ *
+ * @tlv_hdr: pointer to the TLV
+ * @tlv_type: type of the TLV we are looking for
+ * @tlv_size: expected size of the TLV we are looking for (if -1,
+ *            don't check the size). This includes the header
+ * Returns: 0 if the TLV matches
+ *          < 0 if it doesn't match at all
+ *          > 0 total TLV + payload size, if the type matches, but not
+ *              the size
+ */
+static
+ssize_t i2400m_tlv_match(const struct i2400m_tlv_hdr *tlv,
+			 enum i2400m_tlv tlv_type, ssize_t tlv_size)
+{
+	if (le16_to_cpu(tlv->type) != tlv_type)	/* Not our type?
skip */ + return -1; + if (tlv_size != -1 + && le16_to_cpu(tlv->length) + sizeof(*tlv) != tlv_size) { + size_t size = le16_to_cpu(tlv->length) + sizeof(*tlv); + printk(KERN_WARNING "W: tlv type 0x%x mismatched because of " + "size (got %zu vs %zd expected)\n", + tlv_type, size, tlv_size); + return size; + } + return 0; +} + + +/* + * Given a buffer of TLVs, iterate over them + * + * @i2400m: device instance + * @tlv_buf: pointer to the beginning of the TLV buffer + * @buf_size: buffer size in bytes + * @tlv_pos: seek position; this is assumed to be a pointer returned + * by i2400m_tlv_buffer_walk() [and thus, validated]. The + * TLV returned will be the one following this one. + * + * Usage: + * + * tlv_itr = NULL; + * while (tlv_itr = i2400m_tlv_buffer_walk(i2400m, buf, size, tlv_itr)) { + * ... + * // Do stuff with tlv_itr, DON'T MODIFY IT + * ... + * } + */ +static +const struct i2400m_tlv_hdr *i2400m_tlv_buffer_walk( + struct i2400m *i2400m, + const void *tlv_buf, size_t buf_size, + const struct i2400m_tlv_hdr *tlv_pos) +{ + struct device *dev = i2400m_dev(i2400m); + const struct i2400m_tlv_hdr *tlv_top = tlv_buf + buf_size; + size_t offset, length, avail_size; + unsigned type; + + if (tlv_pos == NULL) /* Take the first one? */ + tlv_pos = tlv_buf; + else /* Nope, the next one */ + tlv_pos = (void *) tlv_pos + + le16_to_cpu(tlv_pos->length) + sizeof(*tlv_pos); + if (tlv_pos == tlv_top) { /* buffer done */ + tlv_pos = NULL; + goto error_beyond_end; + } + if (tlv_pos > tlv_top) { + tlv_pos = NULL; + WARN_ON(1); + goto error_beyond_end; + } + offset = (void *) tlv_pos - (void *) tlv_buf; + avail_size = buf_size - offset; + if (avail_size < sizeof(*tlv_pos)) { + dev_err(dev, "HW BUG? tlv_buf %p [%zu bytes], tlv @%zu: " + "short header\n", tlv_buf, buf_size, offset); + goto error_short_header; + } + type = le16_to_cpu(tlv_pos->type); + length = le16_to_cpu(tlv_pos->length); + if (avail_size < sizeof(*tlv_pos) + length) { + dev_err(dev, "HW BUG? tlv_buf %p [%zu bytes], " + "tlv type 0x%04x @%zu: " + "short data (%zu bytes vs %zu needed)\n", + tlv_buf, buf_size, type, offset, avail_size, + sizeof(*tlv_pos) + length); + goto error_short_header; + } +error_short_header: +error_beyond_end: + return tlv_pos; +} + + +/* + * Find a TLV in a buffer of sequential TLVs + * + * @i2400m: device descriptor + * @tlv_hdr: pointer to the first TLV in the sequence + * @size: size of the buffer in bytes; all TLVs are assumed to fit + * fully in the buffer (otherwise we'll complain). + * @tlv_type: type of the TLV we are looking for + * @tlv_size: expected size of the TLV we are looking for (if -1, + * don't check the size). This includes the header + * + * Returns: NULL if the TLV is not found, otherwise a pointer to + * it. If the sizes don't match, an error is printed and NULL + * returned. 
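+ *
+ * Usage (sketch; @tlv_buf and @size stand for whatever caller-provided
+ * TLV buffer is being searched):
+ *
+ *	const struct i2400m_tlv_system_state *ss;
+ *	const struct i2400m_tlv_hdr *tlv =
+ *		i2400m_tlv_find(i2400m, tlv_buf, size,
+ *				I2400M_TLV_SYSTEM_STATE, sizeof(*ss));
+ *	if (tlv != NULL)
+ *		ss = container_of(tlv, typeof(*ss), hdr);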
+ */ +static +const struct i2400m_tlv_hdr *i2400m_tlv_find( + struct i2400m *i2400m, + const struct i2400m_tlv_hdr *tlv_hdr, size_t size, + enum i2400m_tlv tlv_type, ssize_t tlv_size) +{ + ssize_t match; + struct device *dev = i2400m_dev(i2400m); + const struct i2400m_tlv_hdr *tlv = NULL; + while ((tlv = i2400m_tlv_buffer_walk(i2400m, tlv_hdr, size, tlv))) { + match = i2400m_tlv_match(tlv, tlv_type, tlv_size); + if (match == 0) /* found it :) */ + break; + if (match > 0) + dev_warn(dev, "TLV type 0x%04x found with size " + "mismatch (%zu vs %zd needed)\n", + tlv_type, match, tlv_size); + } + return tlv; +} + + +static const struct +{ + char *msg; + int errno; +} ms_to_errno[I2400M_MS_MAX] = { + [I2400M_MS_DONE_OK] = { "", 0 }, + [I2400M_MS_DONE_IN_PROGRESS] = { "", 0 }, + [I2400M_MS_INVALID_OP] = { "invalid opcode", -ENOSYS }, + [I2400M_MS_BAD_STATE] = { "invalid state", -EILSEQ }, + [I2400M_MS_ILLEGAL_VALUE] = { "illegal value", -EINVAL }, + [I2400M_MS_MISSING_PARAMS] = { "missing parameters", -ENOMSG }, + [I2400M_MS_VERSION_ERROR] = { "bad version", -EIO }, + [I2400M_MS_ACCESSIBILITY_ERROR] = { "accesibility error", -EIO }, + [I2400M_MS_BUSY] = { "busy", -EBUSY }, + [I2400M_MS_CORRUPTED_TLV] = { "corrupted TLV", -EILSEQ }, + [I2400M_MS_UNINITIALIZED] = { "uninitialized", -EILSEQ }, + [I2400M_MS_UNKNOWN_ERROR] = { "unknown error", -EIO }, + [I2400M_MS_PRODUCTION_ERROR] = { "production error", -EIO }, + [I2400M_MS_NO_RF] = { "no RF", -EIO }, + [I2400M_MS_NOT_READY_FOR_POWERSAVE] = + { "not ready for powersave", -EACCES }, + [I2400M_MS_THERMAL_CRITICAL] = { "thermal critical", -EL3HLT }, +}; + + +/* + * i2400m_msg_check_status - translate a message's status code + * + * @i2400m: device descriptor + * @l3l4_hdr: message header + * @strbuf: buffer to place a formatted error message (unless NULL). + * @strbuf_size: max amount of available space; larger messages will + * be truncated. + * + * Returns: errno code corresponding to the status code in @l3l4_hdr + * and a message in @strbuf describing the error. 
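+ *
+ * Typical usage (sketch, mirroring what i2400m_msg_ack_hook() below
+ * does):
+ *
+ *	char strerr[32];
+ *
+ *	result = i2400m_msg_check_status(l3l4_hdr, strerr, sizeof(strerr));
+ *	if (result < 0)
+ *		dev_err(dev, "command failed: %d - %s\n", result, strerr);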
+ */ +int i2400m_msg_check_status(const struct i2400m_l3l4_hdr *l3l4_hdr, + char *strbuf, size_t strbuf_size) +{ + int result; + enum i2400m_ms status = le16_to_cpu(l3l4_hdr->status); + const char *str; + + if (status == 0) + return 0; + if (status >= ARRAY_SIZE(ms_to_errno)) { + str = "unknown status code"; + result = -EBADR; + } else { + str = ms_to_errno[status].msg; + result = ms_to_errno[status].errno; + } + if (strbuf) + snprintf(strbuf, strbuf_size, "%s (%d)", str, status); + return result; +} + + +/* + * Act on a TLV System State reported by the device + * + * @i2400m: device descriptor + * @ss: validated System State TLV + */ +static +void i2400m_report_tlv_system_state(struct i2400m *i2400m, + const struct i2400m_tlv_system_state *ss) +{ + struct device *dev = i2400m_dev(i2400m); + struct wimax_dev *wimax_dev = &i2400m->wimax_dev; + enum i2400m_system_state i2400m_state = le32_to_cpu(ss->state); + + d_fnstart(3, dev, "(i2400m %p ss %p [%u])\n", i2400m, ss, i2400m_state); + + if (i2400m->state != i2400m_state) { + i2400m->state = i2400m_state; + wake_up_all(&i2400m->state_wq); + } + switch (i2400m_state) { + case I2400M_SS_UNINITIALIZED: + case I2400M_SS_INIT: + case I2400M_SS_CONFIG: + case I2400M_SS_PRODUCTION: + wimax_state_change(wimax_dev, WIMAX_ST_UNINITIALIZED); + break; + + case I2400M_SS_RF_OFF: + case I2400M_SS_RF_SHUTDOWN: + wimax_state_change(wimax_dev, WIMAX_ST_RADIO_OFF); + break; + + case I2400M_SS_READY: + case I2400M_SS_STANDBY: + case I2400M_SS_SLEEPACTIVE: + wimax_state_change(wimax_dev, WIMAX_ST_READY); + break; + + case I2400M_SS_CONNECTING: + case I2400M_SS_WIMAX_CONNECTED: + wimax_state_change(wimax_dev, WIMAX_ST_READY); + break; + + case I2400M_SS_SCAN: + case I2400M_SS_OUT_OF_ZONE: + wimax_state_change(wimax_dev, WIMAX_ST_SCANNING); + break; + + case I2400M_SS_IDLE: + d_printf(1, dev, "entering BS-negotiated idle mode\n"); + fallthrough; + case I2400M_SS_DISCONNECTING: + case I2400M_SS_DATA_PATH_CONNECTED: + wimax_state_change(wimax_dev, WIMAX_ST_CONNECTED); + break; + + default: + /* Huh? just in case, shut it down */ + dev_err(dev, "HW BUG? unknown state %u: shutting down\n", + i2400m_state); + i2400m_reset(i2400m, I2400M_RT_WARM); + break; + } + d_fnend(3, dev, "(i2400m %p ss %p [%u]) = void\n", + i2400m, ss, i2400m_state); +} + + +/* + * Parse and act on a TLV Media Status sent by the device + * + * @i2400m: device descriptor + * @ms: validated Media Status TLV + * + * This will set the carrier up on down based on the device's link + * report. This is done asides of what the WiMAX stack does based on + * the device's state as sometimes we need to do a link-renew (the BS + * wants us to renew a DHCP lease, for example). + * + * In fact, doc says that every time we get a link-up, we should do a + * DHCP negotiation... 
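+ *
+ * In short, the switch below maps:
+ *
+ *	I2400M_MEDIA_STATUS_LINK_UP    -> netif_carrier_on()
+ *	I2400M_MEDIA_STATUS_LINK_DOWN  -> netif_carrier_off()
+ *	I2400M_MEDIA_STATUS_LINK_RENEW -> netif_carrier_on() (user space
+ *	                                  renews the DHCP lease)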
+ */ +static +void i2400m_report_tlv_media_status(struct i2400m *i2400m, + const struct i2400m_tlv_media_status *ms) +{ + struct device *dev = i2400m_dev(i2400m); + struct wimax_dev *wimax_dev = &i2400m->wimax_dev; + struct net_device *net_dev = wimax_dev->net_dev; + enum i2400m_media_status status = le32_to_cpu(ms->media_status); + + d_fnstart(3, dev, "(i2400m %p ms %p [%u])\n", i2400m, ms, status); + + switch (status) { + case I2400M_MEDIA_STATUS_LINK_UP: + netif_carrier_on(net_dev); + break; + case I2400M_MEDIA_STATUS_LINK_DOWN: + netif_carrier_off(net_dev); + break; + /* + * This is the network telling us we need to retrain the DHCP + * lease -- so far, we are trusting the WiMAX Network Service + * in user space to pick this up and poke the DHCP client. + */ + case I2400M_MEDIA_STATUS_LINK_RENEW: + netif_carrier_on(net_dev); + break; + default: + dev_err(dev, "HW BUG? unknown media status %u\n", + status); + } + d_fnend(3, dev, "(i2400m %p ms %p [%u]) = void\n", + i2400m, ms, status); +} + + +/* + * Process a TLV from a 'state report' + * + * @i2400m: device descriptor + * @tlv: pointer to the TLV header; it has been already validated for + * consistent size. + * @tag: for error messages + * + * Act on the TLVs from a 'state report'. + */ +static +void i2400m_report_state_parse_tlv(struct i2400m *i2400m, + const struct i2400m_tlv_hdr *tlv, + const char *tag) +{ + struct device *dev = i2400m_dev(i2400m); + const struct i2400m_tlv_media_status *ms; + const struct i2400m_tlv_system_state *ss; + const struct i2400m_tlv_rf_switches_status *rfss; + + if (0 == i2400m_tlv_match(tlv, I2400M_TLV_SYSTEM_STATE, sizeof(*ss))) { + ss = container_of(tlv, typeof(*ss), hdr); + d_printf(2, dev, "%s: system state TLV " + "found (0x%04x), state 0x%08x\n", + tag, I2400M_TLV_SYSTEM_STATE, + le32_to_cpu(ss->state)); + i2400m_report_tlv_system_state(i2400m, ss); + } + if (0 == i2400m_tlv_match(tlv, I2400M_TLV_RF_STATUS, sizeof(*rfss))) { + rfss = container_of(tlv, typeof(*rfss), hdr); + d_printf(2, dev, "%s: RF status TLV " + "found (0x%04x), sw 0x%02x hw 0x%02x\n", + tag, I2400M_TLV_RF_STATUS, + le32_to_cpu(rfss->sw_rf_switch), + le32_to_cpu(rfss->hw_rf_switch)); + i2400m_report_tlv_rf_switches_status(i2400m, rfss); + } + if (0 == i2400m_tlv_match(tlv, I2400M_TLV_MEDIA_STATUS, sizeof(*ms))) { + ms = container_of(tlv, typeof(*ms), hdr); + d_printf(2, dev, "%s: Media Status TLV: %u\n", + tag, le32_to_cpu(ms->media_status)); + i2400m_report_tlv_media_status(i2400m, ms); + } +} + + +/* + * Parse a 'state report' and extract information + * + * @i2400m: device descriptor + * @l3l4_hdr: pointer to message; it has been already validated for + * consistent size. + * @size: size of the message (header + payload). The header length + * declaration is assumed to be congruent with @size (as in + * sizeof(*l3l4_hdr) + l3l4_hdr->length == size) + * + * Walk over the TLVs in a report state and act on them. 
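+ *
+ * Schematically, the message this walks over is laid out as:
+ *
+ *	struct i2400m_l3l4_hdr  (type, length, status)
+ *	struct i2400m_tlv_hdr + payload		(TLV #0)
+ *	struct i2400m_tlv_hdr + payload		(TLV #1)
+ *	...
+ *
+ * with l3l4_hdr->length counting only the TLV area (the payload).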
+ */
+static
+void i2400m_report_state_hook(struct i2400m *i2400m,
+			      const struct i2400m_l3l4_hdr *l3l4_hdr,
+			      size_t size, const char *tag)
+{
+	struct device *dev = i2400m_dev(i2400m);
+	const struct i2400m_tlv_hdr *tlv;
+	size_t tlv_size = le16_to_cpu(l3l4_hdr->length);
+
+	d_fnstart(4, dev, "(i2400m %p, l3l4_hdr %p, size %zu, %s)\n",
+		  i2400m, l3l4_hdr, size, tag);
+	tlv = NULL;
+
+	while ((tlv = i2400m_tlv_buffer_walk(i2400m, &l3l4_hdr->pl,
+					     tlv_size, tlv)))
+		i2400m_report_state_parse_tlv(i2400m, tlv, tag);
+	d_fnend(4, dev, "(i2400m %p, l3l4_hdr %p, size %zu, %s) = void\n",
+		i2400m, l3l4_hdr, size, tag);
+}
+
+
+/*
+ * i2400m_report_hook - (maybe) act on a report
+ *
+ * @i2400m: device descriptor
+ * @l3l4_hdr: pointer to message; it has already been validated for
+ *     consistent size.
+ * @size: size of the message (header + payload). The header length
+ *     declaration is assumed to be congruent with @size (as in
+ *     sizeof(*l3l4_hdr) + l3l4_hdr->length == size)
+ *
+ * Extract information we might need (like carrier on/off) from a
+ * device report.
+ */
+void i2400m_report_hook(struct i2400m *i2400m,
+			const struct i2400m_l3l4_hdr *l3l4_hdr, size_t size)
+{
+	struct device *dev = i2400m_dev(i2400m);
+	unsigned msg_type;
+
+	d_fnstart(3, dev, "(i2400m %p l3l4_hdr %p size %zu)\n",
+		  i2400m, l3l4_hdr, size);
+	/* Chew on the message, we might need some information from
+	 * here */
+	msg_type = le16_to_cpu(l3l4_hdr->type);
+	switch (msg_type) {
+	case I2400M_MT_REPORT_STATE:	/* carrier detection... */
+		i2400m_report_state_hook(i2400m,
+					 l3l4_hdr, size, "REPORT STATE");
+		break;
+	/* If the device is ready for power save, then ask it to do
+	 * it. */
+	case I2400M_MT_REPORT_POWERSAVE_READY:	/* zzzzz */
+		if (l3l4_hdr->status == cpu_to_le16(I2400M_MS_DONE_OK)) {
+			if (i2400m_power_save_disabled)
+				d_printf(1, dev, "ready for powersave, "
+					 "not requesting (disabled by module "
+					 "parameter)\n");
+			else {
+				d_printf(1, dev, "ready for powersave, "
+					 "requesting\n");
+				i2400m_cmd_enter_powersave(i2400m);
+			}
+		}
+		break;
+	}
+	d_fnend(3, dev, "(i2400m %p l3l4_hdr %p size %zu) = void\n",
+		i2400m, l3l4_hdr, size);
+}
+
+
+/*
+ * i2400m_msg_ack_hook - process cmd/set/get ack for internal status
+ *
+ * @i2400m: device descriptor
+ * @l3l4_hdr: pointer to message; it has already been validated for
+ *     consistent size.
+ * @size: size of the message
+ *
+ * Extract information we might need from acks to commands and act on
+ * it. This is akin to i2400m_report_hook(). Note most of this
+ * processing should be done in the function that calls the
+ * command. This is here for some cases where it can't happen...
+ */
+static void i2400m_msg_ack_hook(struct i2400m *i2400m,
+				const struct i2400m_l3l4_hdr *l3l4_hdr,
+				size_t size)
+{
+	int result;
+	struct device *dev = i2400m_dev(i2400m);
+	unsigned int ack_type;
+	char strerr[32];
+
+	/* Chew on the message, we might need some information from
+	 * here */
+	ack_type = le16_to_cpu(l3l4_hdr->type);
+	switch (ack_type) {
+	case I2400M_MT_CMD_ENTER_POWERSAVE:
+		/* This is just left here for the sake of example, as
+		 * the processing is done somewhere else. */
+		if (0) {
+			result = i2400m_msg_check_status(
+				l3l4_hdr, strerr, sizeof(strerr));
+			if (result >= 0)
+				d_printf(1, dev, "ready for power save: %zd\n",
+					 size);
+		}
+		break;
+	}
+}
+
+
+/*
+ * i2400m_msg_size_check() - verify message size and header are congruent
+ *
+ * It is ok if the total message size is larger than the expected
+ * size, as there can be padding.
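+ *
+ * For example (illustrative numbers only): a header declaring
+ * length = 8 requires msg_size >= sizeof(*l3l4_hdr) + 8; a larger
+ * (padded) buffer passes the check, a shorter one fails with -EIO.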
+ */
+int i2400m_msg_size_check(struct i2400m *i2400m,
+			  const struct i2400m_l3l4_hdr *l3l4_hdr,
+			  size_t msg_size)
+{
+	int result;
+	struct device *dev = i2400m_dev(i2400m);
+	size_t expected_size;
+	d_fnstart(4, dev, "(i2400m %p l3l4_hdr %p msg_size %zu)\n",
+		  i2400m, l3l4_hdr, msg_size);
+	if (msg_size < sizeof(*l3l4_hdr)) {
+		dev_err(dev, "bad size for message header "
+			"(expected at least %zu, got %zu)\n",
+			(size_t) sizeof(*l3l4_hdr), msg_size);
+		result = -EIO;
+		goto error_hdr_size;
+	}
+	expected_size = le16_to_cpu(l3l4_hdr->length) + sizeof(*l3l4_hdr);
+	if (msg_size < expected_size) {
+		dev_err(dev, "bad size for message code 0x%04x (expected %zu, "
+			"got %zu)\n", le16_to_cpu(l3l4_hdr->type),
+			expected_size, msg_size);
+		result = -EIO;
+	} else
+		result = 0;
+error_hdr_size:
+	d_fnend(4, dev,
+		"(i2400m %p l3l4_hdr %p msg_size %zu) = %d\n",
+		i2400m, l3l4_hdr, msg_size, result);
+	return result;
+}
+
+
+
+/*
+ * Cancel a wait for a command ACK
+ *
+ * @i2400m: device descriptor
+ * @code: [negative] errno code to cancel with (don't use
+ *     -EINPROGRESS)
+ *
+ * If there is an ack already filled out, free it.
+ */
+void i2400m_msg_to_dev_cancel_wait(struct i2400m *i2400m, int code)
+{
+	struct sk_buff *ack_skb;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i2400m->rx_lock, flags);
+	ack_skb = i2400m->ack_skb;
+	if (ack_skb && !IS_ERR(ack_skb))
+		kfree_skb(ack_skb);
+	i2400m->ack_skb = ERR_PTR(code);
+	spin_unlock_irqrestore(&i2400m->rx_lock, flags);
+}
+
+
+/**
+ * i2400m_msg_to_dev - Send a control message to the device and get a response
+ *
+ * @i2400m: device descriptor
+ *
+ * @buf: pointer to the buffer containing the message to be sent; it
+ *     has to start with a &struct i2400m_l3l4_hdr followed by the
+ *     payload. Once this function returns, the buffer can be reused.
+ *
+ * @buf_len: buffer size
+ *
+ * Returns:
+ *
+ * Pointer to skb containing the ack message. You need to check the
+ * pointer with IS_ERR(), as it might be an error code. Error codes
+ * could happen because:
+ *
+ *  - the message wasn't formatted correctly
+ *  - couldn't send the message
+ *  - failed waiting for a response
+ *  - the ack message wasn't formatted correctly
+ *
+ * The returned skb has been allocated with wimax_msg_to_user_alloc(),
+ * it contains the response in a netlink attribute and is ready to be
+ * passed up to user space with wimax_msg_to_user_send(). To access
+ * the payload and its length, use wimax_msg_{data,len}() on the skb.
+ *
+ * The skb has to be freed with kfree_skb() once done.
+ *
+ * Description:
+ *
+ * This function delivers a message/command to the device and waits
+ * for an ack to be received. The format is described in
+ * linux/wimax/i2400m.h. In summary, a command/get/set is followed by an
+ * ack.
+ *
+ * This function will not check the ack status, that's left up to the
+ * caller. Once done with the ack skb, it has to be kfree_skb()ed.
+ *
+ * The i2400m handles only one message at a time, thus we need the
+ * mutex to exclude other players.
+ *
+ * We write the message and then wait for an answer to come back. The
+ * RX path intercepts control messages and handles them in
+ * i2400m_rx_ctl(). Reports (notifications) are (maybe) processed
+ * locally and then forwarded (as needed) to user space on the WiMAX
+ * stack message pipe. Acks are saved and passed back to us through an
+ * skb in i2400m->ack_skb which is ready to be given to generic
+ * netlink if need be.
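+ *
+ * Minimal usage sketch (hypothetical caller; 'cmd' is a little
+ * endian command buffer built like in i2400m_cmd_enter_powersave()):
+ *
+ *	ack_skb = i2400m_msg_to_dev(i2400m, cmd, cmd_len);
+ *	if (IS_ERR(ack_skb))
+ *		return PTR_ERR(ack_skb);
+ *	result = i2400m_msg_check_status(wimax_msg_data(ack_skb),
+ *					 strerr, sizeof(strerr));
+ *	kfree_skb(ack_skb);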
+ */ +struct sk_buff *i2400m_msg_to_dev(struct i2400m *i2400m, + const void *buf, size_t buf_len) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + const struct i2400m_l3l4_hdr *msg_l3l4_hdr; + struct sk_buff *ack_skb; + const struct i2400m_l3l4_hdr *ack_l3l4_hdr; + size_t ack_len; + int ack_timeout; + unsigned msg_type; + unsigned long flags; + + d_fnstart(3, dev, "(i2400m %p buf %p len %zu)\n", + i2400m, buf, buf_len); + + rmb(); /* Make sure we see what i2400m_dev_reset_handle() */ + if (i2400m->boot_mode) + return ERR_PTR(-EL3RST); + + msg_l3l4_hdr = buf; + /* Check msg & payload consistency */ + result = i2400m_msg_size_check(i2400m, msg_l3l4_hdr, buf_len); + if (result < 0) + goto error_bad_msg; + msg_type = le16_to_cpu(msg_l3l4_hdr->type); + d_printf(1, dev, "CMD/GET/SET 0x%04x %zu bytes\n", + msg_type, buf_len); + d_dump(2, dev, buf, buf_len); + + /* Setup the completion, ack_skb ("we are waiting") and send + * the message to the device */ + mutex_lock(&i2400m->msg_mutex); + spin_lock_irqsave(&i2400m->rx_lock, flags); + i2400m->ack_skb = ERR_PTR(-EINPROGRESS); + spin_unlock_irqrestore(&i2400m->rx_lock, flags); + init_completion(&i2400m->msg_completion); + result = i2400m_tx(i2400m, buf, buf_len, I2400M_PT_CTRL); + if (result < 0) { + dev_err(dev, "can't send message 0x%04x: %d\n", + le16_to_cpu(msg_l3l4_hdr->type), result); + goto error_tx; + } + + /* Some commands take longer to execute because of crypto ops, + * so we give them some more leeway on timeout */ + switch (msg_type) { + case I2400M_MT_GET_TLS_OPERATION_RESULT: + case I2400M_MT_CMD_SEND_EAP_RESPONSE: + ack_timeout = 5 * HZ; + break; + default: + ack_timeout = HZ; + } + + if (unlikely(i2400m->trace_msg_from_user)) + wimax_msg(&i2400m->wimax_dev, "echo", buf, buf_len, GFP_KERNEL); + /* The RX path in rx.c will put any response for this message + * in i2400m->ack_skb and wake us up. If we cancel the wait, + * we need to change the value of i2400m->ack_skb to something + * not -EINPROGRESS so RX knows there is no one waiting. */ + result = wait_for_completion_interruptible_timeout( + &i2400m->msg_completion, ack_timeout); + if (result == 0) { + dev_err(dev, "timeout waiting for reply to message 0x%04x\n", + msg_type); + result = -ETIMEDOUT; + i2400m_msg_to_dev_cancel_wait(i2400m, result); + goto error_wait_for_completion; + } else if (result < 0) { + dev_err(dev, "error waiting for reply to message 0x%04x: %d\n", + msg_type, result); + i2400m_msg_to_dev_cancel_wait(i2400m, result); + goto error_wait_for_completion; + } + + /* Pull out the ack data from i2400m->ack_skb -- see if it is + * an error and act accordingly */ + spin_lock_irqsave(&i2400m->rx_lock, flags); + ack_skb = i2400m->ack_skb; + if (IS_ERR(ack_skb)) + result = PTR_ERR(ack_skb); + else + result = 0; + i2400m->ack_skb = NULL; + spin_unlock_irqrestore(&i2400m->rx_lock, flags); + if (result < 0) + goto error_ack_status; + ack_l3l4_hdr = wimax_msg_data_len(ack_skb, &ack_len); + + /* Check the ack and deliver it if it is ok */ + if (unlikely(i2400m->trace_msg_from_user)) + wimax_msg(&i2400m->wimax_dev, "echo", + ack_l3l4_hdr, ack_len, GFP_KERNEL); + result = i2400m_msg_size_check(i2400m, ack_l3l4_hdr, ack_len); + if (result < 0) { + dev_err(dev, "HW BUG? reply to message 0x%04x: %d\n", + msg_type, result); + goto error_bad_ack_len; + } + if (msg_type != le16_to_cpu(ack_l3l4_hdr->type)) { + dev_err(dev, "HW BUG? 
bad reply 0x%04x to message 0x%04x\n",
+			le16_to_cpu(ack_l3l4_hdr->type), msg_type);
+		result = -EIO;
+		goto error_bad_ack_type;
+	}
+	i2400m_msg_ack_hook(i2400m, ack_l3l4_hdr, ack_len);
+	mutex_unlock(&i2400m->msg_mutex);
+	d_fnend(3, dev, "(i2400m %p buf %p len %zu) = %p\n",
+		i2400m, buf, buf_len, ack_skb);
+	return ack_skb;
+
+error_bad_ack_type:
+error_bad_ack_len:
+	kfree_skb(ack_skb);
+error_ack_status:
+error_wait_for_completion:
+error_tx:
+	mutex_unlock(&i2400m->msg_mutex);
+error_bad_msg:
+	d_fnend(3, dev, "(i2400m %p buf %p len %zu) = %d\n",
+		i2400m, buf, buf_len, result);
+	return ERR_PTR(result);
+}
+
+
+/*
+ * Definitions for the Enter Power Save command
+ *
+ * The Enter Power Save command requests the device to go into power
+ * saving mode. The device will ack or nak the command depending on it
+ * being ready for it. If it acks, the bus layer can then let the
+ * device suspend.
+ *
+ * As well, the device might request to go into power saving mode by
+ * sending a report (REPORT_POWERSAVE_READY), in which case, we issue
+ * this command. The hookups in the RX code make that happen (see
+ * i2400m_report_hook()).
+ */
+enum {
+	I2400M_WAKEUP_ENABLED  = 0x01,
+	I2400M_WAKEUP_DISABLED = 0x02,
+	I2400M_TLV_TYPE_WAKEUP_MODE = 144,
+};
+
+struct i2400m_cmd_enter_power_save {
+	struct i2400m_l3l4_hdr hdr;
+	struct i2400m_tlv_hdr tlv;
+	__le32 val;
+} __packed;
+
+
+/*
+ * Request entering power save
+ *
+ * This command is (mainly) executed when the device indicates that it
+ * is ready to go into powersave mode via a REPORT_POWERSAVE_READY.
+ */
+int i2400m_cmd_enter_powersave(struct i2400m *i2400m)
+{
+	int result;
+	struct device *dev = i2400m_dev(i2400m);
+	struct sk_buff *ack_skb;
+	struct i2400m_cmd_enter_power_save *cmd;
+	char strerr[32];
+
+	result = -ENOMEM;
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (cmd == NULL)
+		goto error_alloc;
+	cmd->hdr.type = cpu_to_le16(I2400M_MT_CMD_ENTER_POWERSAVE);
+	cmd->hdr.length = cpu_to_le16(sizeof(*cmd) - sizeof(cmd->hdr));
+	cmd->hdr.version = cpu_to_le16(I2400M_L3L4_VERSION);
+	cmd->tlv.type = cpu_to_le16(I2400M_TLV_TYPE_WAKEUP_MODE);
+	cmd->tlv.length = cpu_to_le16(sizeof(cmd->val));
+	cmd->val = cpu_to_le32(I2400M_WAKEUP_ENABLED);
+
+	ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd));
+	result = PTR_ERR(ack_skb);
+	if (IS_ERR(ack_skb)) {
+		dev_err(dev, "Failed to issue 'Enter power save' command: %d\n",
+			result);
+		goto error_msg_to_dev;
+	}
+	result = i2400m_msg_check_status(wimax_msg_data(ack_skb),
+					 strerr, sizeof(strerr));
+	if (result == -EACCES)
+		d_printf(1, dev, "Cannot enter power save mode\n");
+	else if (result < 0)
+		dev_err(dev, "'Enter power save' (0x%04x) command failed: "
+			"%d - %s\n", I2400M_MT_CMD_ENTER_POWERSAVE,
+			result, strerr);
+	else
+		d_printf(1, dev, "device ready to power save\n");
+	kfree_skb(ack_skb);
+error_msg_to_dev:
+	kfree(cmd);
+error_alloc:
+	return result;
+}
+EXPORT_SYMBOL_GPL(i2400m_cmd_enter_powersave);
+
+
+/*
+ * Definitions for getting device information
+ */
+enum {
+	I2400M_TLV_DETAILED_DEVICE_INFO = 140
+};
+
+/**
+ * i2400m_get_device_info - Query the device for detailed device information
+ *
+ * @i2400m: device descriptor
+ *
+ * Returns: an skb whose skb->data points to a 'struct
+ *     i2400m_tlv_detailed_device_info'. When done, kfree_skb() it. The
+ *     skb is *guaranteed* to contain the whole TLV data structure.
+ *
+ * On error, IS_ERR(skb) is true and PTR_ERR(skb) is the error
+ * code.
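+ *
+ * Usage sketch (this mirrors i2400m_check_mac_addr() in driver.c):
+ *
+ *	skb = i2400m_get_device_info(i2400m);
+ *	if (IS_ERR(skb))
+ *		return PTR_ERR(skb);
+ *	ddi = (void *) skb->data;
+ *	(use ddi->mac_address here)
+ *	kfree_skb(skb);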
+ */
+struct sk_buff *i2400m_get_device_info(struct i2400m *i2400m)
+{
+	int result;
+	struct device *dev = i2400m_dev(i2400m);
+	struct sk_buff *ack_skb;
+	struct i2400m_l3l4_hdr *cmd;
+	const struct i2400m_l3l4_hdr *ack;
+	size_t ack_len;
+	const struct i2400m_tlv_hdr *tlv;
+	const struct i2400m_tlv_detailed_device_info *ddi;
+	char strerr[32];
+
+	ack_skb = ERR_PTR(-ENOMEM);
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (cmd == NULL)
+		goto error_alloc;
+	cmd->type = cpu_to_le16(I2400M_MT_GET_DEVICE_INFO);
+	cmd->length = 0;
+	cmd->version = cpu_to_le16(I2400M_L3L4_VERSION);
+
+	ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd));
+	if (IS_ERR(ack_skb)) {
+		dev_err(dev, "Failed to issue 'get device info' command: %ld\n",
+			PTR_ERR(ack_skb));
+		goto error_msg_to_dev;
+	}
+	ack = wimax_msg_data_len(ack_skb, &ack_len);
+	result = i2400m_msg_check_status(ack, strerr, sizeof(strerr));
+	if (result < 0) {
+		dev_err(dev, "'get device info' (0x%04x) command failed: "
+			"%d - %s\n", I2400M_MT_GET_DEVICE_INFO, result,
+			strerr);
+		goto error_cmd_failed;
+	}
+	tlv = i2400m_tlv_find(i2400m, ack->pl, ack_len - sizeof(*ack),
+			      I2400M_TLV_DETAILED_DEVICE_INFO, sizeof(*ddi));
+	if (tlv == NULL) {
+		dev_err(dev, "GET DEVICE INFO: "
+			"detailed device info TLV not found (0x%04x)\n",
+			I2400M_TLV_DETAILED_DEVICE_INFO);
+		result = -EIO;
+		goto error_no_tlv;
+	}
+	skb_pull(ack_skb, (void *) tlv - (void *) ack_skb->data);
+error_msg_to_dev:
+	kfree(cmd);
+error_alloc:
+	return ack_skb;
+
+error_no_tlv:
+error_cmd_failed:
+	kfree_skb(ack_skb);
+	kfree(cmd);
+	return ERR_PTR(result);
+}
+
+
+/* Firmware interface versions we support */
+enum {
+	I2400M_HDIv_MAJOR = 9,
+	I2400M_HDIv_MINOR = 1,
+	I2400M_HDIv_MINOR_2 = 2,
+};
+
+
+/**
+ * i2400m_firmware_check - check firmware versions are compatible with
+ *     the driver
+ *
+ * @i2400m: device descriptor
+ *
+ * Returns: 0 if ok, < 0 errno code on error, plus a message in the
+ *     kernel log.
+ *
+ * Long function, but quite simple; first chunk launches the command
+ * and double checks the reply for the right TLV. Then we process the
+ * TLV (where the meat is).
+ *
+ * Once we process the TLV that gives us the firmware's interface
+ * version, we encode it and save it in i2400m->fw_version for future
+ * reference.
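+ *
+ * The encoding is simply (major << 16 | minor): firmware 9.2.x is
+ * thus stored as 0x00090002, which is what checks such as
+ * i2400m_ge_v1_4() (see i2400m.h) compare against.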
+ */
+int i2400m_firmware_check(struct i2400m *i2400m)
+{
+	int result;
+	struct device *dev = i2400m_dev(i2400m);
+	struct sk_buff *ack_skb;
+	struct i2400m_l3l4_hdr *cmd;
+	const struct i2400m_l3l4_hdr *ack;
+	size_t ack_len;
+	const struct i2400m_tlv_hdr *tlv;
+	const struct i2400m_tlv_l4_message_versions *l4mv;
+	char strerr[32];
+	unsigned major, minor, branch;
+
+	result = -ENOMEM;
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (cmd == NULL)
+		goto error_alloc;
+	cmd->type = cpu_to_le16(I2400M_MT_GET_LM_VERSION);
+	cmd->length = 0;
+	cmd->version = cpu_to_le16(I2400M_L3L4_VERSION);
+
+	ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd));
+	if (IS_ERR(ack_skb)) {
+		result = PTR_ERR(ack_skb);
+		dev_err(dev, "Failed to issue 'get lm version' command: %d\n",
+			result);
+		goto error_msg_to_dev;
+	}
+	ack = wimax_msg_data_len(ack_skb, &ack_len);
+	result = i2400m_msg_check_status(ack, strerr, sizeof(strerr));
+	if (result < 0) {
+		dev_err(dev, "'get lm version' (0x%04x) command failed: "
+			"%d - %s\n", I2400M_MT_GET_LM_VERSION, result,
+			strerr);
+		goto error_cmd_failed;
+	}
+	tlv = i2400m_tlv_find(i2400m, ack->pl, ack_len - sizeof(*ack),
+			      I2400M_TLV_L4_MESSAGE_VERSIONS, sizeof(*l4mv));
+	if (tlv == NULL) {
+		dev_err(dev, "get lm version: TLV not found (0x%04x)\n",
+			I2400M_TLV_L4_MESSAGE_VERSIONS);
+		result = -EIO;
+		goto error_no_tlv;
+	}
+	l4mv = container_of(tlv, typeof(*l4mv), hdr);
+	major = le16_to_cpu(l4mv->major);
+	minor = le16_to_cpu(l4mv->minor);
+	branch = le16_to_cpu(l4mv->branch);
+	result = -EINVAL;
+	if (major != I2400M_HDIv_MAJOR) {
+		dev_err(dev, "unsupported major fw version "
+			"%u.%u.%u\n", major, minor, branch);
+		goto error_bad_major;
+	}
+	result = 0;
+	if (minor > I2400M_HDIv_MINOR_2 || minor < I2400M_HDIv_MINOR)
+		dev_warn(dev, "untested minor fw version %u.%u.%u\n",
+			 major, minor, branch);
+	/* Yes, we ignore the branch -- we don't have to track it */
+	i2400m->fw_version = major << 16 | minor;
+	dev_info(dev, "firmware interface version %u.%u.%u\n",
+		 major, minor, branch);
+error_bad_major:
+error_no_tlv:
+error_cmd_failed:
+	kfree_skb(ack_skb);
+error_msg_to_dev:
+	kfree(cmd);
+error_alloc:
+	return result;
+}
+
+
+/*
+ * Send a DoExitIdle command to the device to ask it to go out of
+ * basestation-idle mode.
+ *
+ * @i2400m: device descriptor
+ *
+ * This starts a renegotiation with the basestation that might involve
+ * another crypto handshake with user space.
+ *
+ * Returns: 0 if ok, < 0 errno code on error.
+ */
+int i2400m_cmd_exit_idle(struct i2400m *i2400m)
+{
+	int result;
+	struct device *dev = i2400m_dev(i2400m);
+	struct sk_buff *ack_skb;
+	struct i2400m_l3l4_hdr *cmd;
+	char strerr[32];
+
+	result = -ENOMEM;
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (cmd == NULL)
+		goto error_alloc;
+	cmd->type = cpu_to_le16(I2400M_MT_CMD_EXIT_IDLE);
+	cmd->length = 0;
+	cmd->version = cpu_to_le16(I2400M_L3L4_VERSION);
+
+	ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd));
+	result = PTR_ERR(ack_skb);
+	if (IS_ERR(ack_skb)) {
+		dev_err(dev, "Failed to issue 'exit idle' command: %d\n",
+			result);
+		goto error_msg_to_dev;
+	}
+	result = i2400m_msg_check_status(wimax_msg_data(ack_skb),
+					 strerr, sizeof(strerr));
+	kfree_skb(ack_skb);
+error_msg_to_dev:
+	kfree(cmd);
+error_alloc:
+	return result;
+
+}
+
+
+/*
+ * Query the device for its state, update the WiMAX stack's idea of it
+ *
+ * @i2400m: device descriptor
+ *
+ * Returns: 0 if ok, < 0 errno code on error.
+ *
+ * Executes a 'Get State' command and parses the returned
+ * TLVs.
+ *
+ * Because this is almost identical to a 'Report State', we use
+ * i2400m_report_state_hook() to parse the answer. This will set the
+ * carrier state, as well as the RF Kill switches state.
+ */
+static int i2400m_cmd_get_state(struct i2400m *i2400m)
+{
+	int result;
+	struct device *dev = i2400m_dev(i2400m);
+	struct sk_buff *ack_skb;
+	struct i2400m_l3l4_hdr *cmd;
+	const struct i2400m_l3l4_hdr *ack;
+	size_t ack_len;
+	char strerr[32];
+
+	result = -ENOMEM;
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (cmd == NULL)
+		goto error_alloc;
+	cmd->type = cpu_to_le16(I2400M_MT_GET_STATE);
+	cmd->length = 0;
+	cmd->version = cpu_to_le16(I2400M_L3L4_VERSION);
+
+	ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd));
+	if (IS_ERR(ack_skb)) {
+		dev_err(dev, "Failed to issue 'get state' command: %ld\n",
+			PTR_ERR(ack_skb));
+		result = PTR_ERR(ack_skb);
+		goto error_msg_to_dev;
+	}
+	ack = wimax_msg_data_len(ack_skb, &ack_len);
+	result = i2400m_msg_check_status(ack, strerr, sizeof(strerr));
+	if (result < 0) {
+		dev_err(dev, "'get state' (0x%04x) command failed: "
+			"%d - %s\n", I2400M_MT_GET_STATE, result, strerr);
+		goto error_cmd_failed;
+	}
+	i2400m_report_state_hook(i2400m, ack, ack_len - sizeof(*ack),
+				 "GET STATE");
+	result = 0;
+	kfree_skb(ack_skb);
+error_cmd_failed:
+error_msg_to_dev:
+	kfree(cmd);
+error_alloc:
+	return result;
+}
+
+/**
+ * i2400m_set_init_config - set basic configuration settings
+ *
+ * @i2400m: device descriptor
+ * @arg: array of pointers to the TLV headers to send for
+ *     configuration (each followed by its payload).
+ *     TLV headers and payloads must be properly initialized, with the
+ *     right endianness (LE).
+ * @args: number of pointers in the @arg array
+ */
+static int i2400m_set_init_config(struct i2400m *i2400m,
+				  const struct i2400m_tlv_hdr **arg,
+				  size_t args)
+{
+	int result;
+	struct device *dev = i2400m_dev(i2400m);
+	struct sk_buff *ack_skb;
+	struct i2400m_l3l4_hdr *cmd;
+	char strerr[32];
+	unsigned argc, argsize, tlv_size;
+	const struct i2400m_tlv_hdr *tlv_hdr;
+	void *buf, *itr;
+
+	d_fnstart(3, dev, "(i2400m %p arg %p args %zu)\n", i2400m, arg, args);
+	result = 0;
+	if (args == 0)
+		goto none;
+	/* Compute the size of all the TLVs, so we can alloc a
+	 * contiguous command block to copy them. */
+	argsize = 0;
+	for (argc = 0; argc < args; argc++) {
+		tlv_hdr = arg[argc];
+		argsize += sizeof(*tlv_hdr) + le16_to_cpu(tlv_hdr->length);
+	}
+	WARN_ON(argc >= 9);	/* As per hw spec */
+
+	/* Alloc the space for the command and TLVs */
+	result = -ENOMEM;
+	buf = kzalloc(sizeof(*cmd) + argsize, GFP_KERNEL);
+	if (buf == NULL)
+		goto error_alloc;
+	cmd = buf;
+	cmd->type = cpu_to_le16(I2400M_MT_SET_INIT_CONFIG);
+	cmd->length = cpu_to_le16(argsize);
+	cmd->version = cpu_to_le16(I2400M_L3L4_VERSION);
+
+	/* Copy the TLVs */
+	itr = buf + sizeof(*cmd);
+	for (argc = 0; argc < args; argc++) {
+		tlv_hdr = arg[argc];
+		tlv_size = sizeof(*tlv_hdr) + le16_to_cpu(tlv_hdr->length);
+		memcpy(itr, tlv_hdr, tlv_size);
+		itr += tlv_size;
+	}
+
+	/* Send the message!
*/ + ack_skb = i2400m_msg_to_dev(i2400m, buf, sizeof(*cmd) + argsize); + result = PTR_ERR(ack_skb); + if (IS_ERR(ack_skb)) { + dev_err(dev, "Failed to issue 'init config' command: %d\n", + result); + + goto error_msg_to_dev; + } + result = i2400m_msg_check_status(wimax_msg_data(ack_skb), + strerr, sizeof(strerr)); + if (result < 0) + dev_err(dev, "'init config' (0x%04x) command failed: %d - %s\n", + I2400M_MT_SET_INIT_CONFIG, result, strerr); + kfree_skb(ack_skb); +error_msg_to_dev: + kfree(buf); +error_alloc: +none: + d_fnend(3, dev, "(i2400m %p arg %p args %zu) = %d\n", + i2400m, arg, args, result); + return result; + +} + +/** + * i2400m_set_idle_timeout - Set the device's idle mode timeout + * + * @i2400m: i2400m device descriptor + * + * @msecs: milliseconds for the timeout to enter idle mode. Between + * 100 to 300000 (5m); 0 to disable. In increments of 100. + * + * After this @msecs of the link being idle (no data being sent or + * received), the device will negotiate with the basestation entering + * idle mode for saving power. The connection is maintained, but + * getting out of it (done in tx.c) will require some negotiation, + * possible crypto re-handshake and a possible DHCP re-lease. + * + * Only available if fw_version >= 0x00090002. + * + * Returns: 0 if ok, < 0 errno code on error. + */ +int i2400m_set_idle_timeout(struct i2400m *i2400m, unsigned msecs) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + struct sk_buff *ack_skb; + struct { + struct i2400m_l3l4_hdr hdr; + struct i2400m_tlv_config_idle_timeout cit; + } *cmd; + const struct i2400m_l3l4_hdr *ack; + size_t ack_len; + char strerr[32]; + + result = -ENOSYS; + if (i2400m_le_v1_3(i2400m)) + goto error_alloc; + result = -ENOMEM; + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (cmd == NULL) + goto error_alloc; + cmd->hdr.type = cpu_to_le16(I2400M_MT_GET_STATE); + cmd->hdr.length = cpu_to_le16(sizeof(*cmd) - sizeof(cmd->hdr)); + cmd->hdr.version = cpu_to_le16(I2400M_L3L4_VERSION); + + cmd->cit.hdr.type = + cpu_to_le16(I2400M_TLV_CONFIG_IDLE_TIMEOUT); + cmd->cit.hdr.length = cpu_to_le16(sizeof(cmd->cit.timeout)); + cmd->cit.timeout = cpu_to_le32(msecs); + + ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd)); + if (IS_ERR(ack_skb)) { + dev_err(dev, "Failed to issue 'set idle timeout' command: " + "%ld\n", PTR_ERR(ack_skb)); + result = PTR_ERR(ack_skb); + goto error_msg_to_dev; + } + ack = wimax_msg_data_len(ack_skb, &ack_len); + result = i2400m_msg_check_status(ack, strerr, sizeof(strerr)); + if (result < 0) { + dev_err(dev, "'set idle timeout' (0x%04x) command failed: " + "%d - %s\n", I2400M_MT_GET_STATE, result, strerr); + goto error_cmd_failed; + } + result = 0; + kfree_skb(ack_skb); +error_cmd_failed: +error_msg_to_dev: + kfree(cmd); +error_alloc: + return result; +} + + +/** + * i2400m_dev_initialize - Initialize the device once communications are ready + * + * @i2400m: device descriptor + * + * Returns: 0 if ok, < 0 errno code on error. + * + * Configures the device to work the way we like it. + * + * At the point of this call, the device is registered with the WiMAX + * and netdev stacks, firmware is uploaded and we can talk to the + * device normally. 
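+ *
+ * Sketch of the TLV-batch pattern used below ('some_tlv' is a
+ * hypothetical, already-initialized config TLV):
+ *
+ *	const struct i2400m_tlv_hdr *args[9];
+ *	unsigned argc = 0;
+ *
+ *	args[argc++] = &some_tlv.hdr;
+ *	result = i2400m_set_init_config(i2400m, args, argc);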
+ */ +int i2400m_dev_initialize(struct i2400m *i2400m) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + struct i2400m_tlv_config_idle_parameters idle_params; + struct i2400m_tlv_config_idle_timeout idle_timeout; + struct i2400m_tlv_config_d2h_data_format df; + struct i2400m_tlv_config_dl_host_reorder dlhr; + const struct i2400m_tlv_hdr *args[9]; + unsigned argc = 0; + + d_fnstart(3, dev, "(i2400m %p)\n", i2400m); + if (i2400m_passive_mode) + goto out_passive; + /* Disable idle mode? (enabled by default) */ + if (i2400m_idle_mode_disabled) { + if (i2400m_le_v1_3(i2400m)) { + idle_params.hdr.type = + cpu_to_le16(I2400M_TLV_CONFIG_IDLE_PARAMETERS); + idle_params.hdr.length = cpu_to_le16( + sizeof(idle_params) - sizeof(idle_params.hdr)); + idle_params.idle_timeout = 0; + idle_params.idle_paging_interval = 0; + args[argc++] = &idle_params.hdr; + } else { + idle_timeout.hdr.type = + cpu_to_le16(I2400M_TLV_CONFIG_IDLE_TIMEOUT); + idle_timeout.hdr.length = cpu_to_le16( + sizeof(idle_timeout) - sizeof(idle_timeout.hdr)); + idle_timeout.timeout = 0; + args[argc++] = &idle_timeout.hdr; + } + } + if (i2400m_ge_v1_4(i2400m)) { + /* Enable extended RX data format? */ + df.hdr.type = + cpu_to_le16(I2400M_TLV_CONFIG_D2H_DATA_FORMAT); + df.hdr.length = cpu_to_le16( + sizeof(df) - sizeof(df.hdr)); + df.format = 1; + args[argc++] = &df.hdr; + + /* Enable RX data reordering? + * (switch flipped in rx.c:i2400m_rx_setup() after fw upload) */ + if (i2400m->rx_reorder) { + dlhr.hdr.type = + cpu_to_le16(I2400M_TLV_CONFIG_DL_HOST_REORDER); + dlhr.hdr.length = cpu_to_le16( + sizeof(dlhr) - sizeof(dlhr.hdr)); + dlhr.reorder = 1; + args[argc++] = &dlhr.hdr; + } + } + result = i2400m_set_init_config(i2400m, args, argc); + if (result < 0) + goto error; +out_passive: + /* + * Update state: Here it just calls a get state; parsing the + * result (System State TLV and RF Status TLV [done in the rx + * path hooks]) will set the hardware and software RF-Kill + * status. + */ + result = i2400m_cmd_get_state(i2400m); +error: + if (result < 0) + dev_err(dev, "failed to initialize the device: %d\n", result); + d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); + return result; +} + + +/** + * i2400m_dev_shutdown - Shutdown a running device + * + * @i2400m: device descriptor + * + * Release resources acquired during the running of the device; in + * theory, should also tell the device to go to sleep, switch off the + * radio, all that, but at this point, in most cases (driver + * disconnection, reset handling) we can't even talk to the device. 
+ */
+void i2400m_dev_shutdown(struct i2400m *i2400m)
+{
+	struct device *dev = i2400m_dev(i2400m);
+
+	d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
+	d_fnend(3, dev, "(i2400m %p) = void\n", i2400m);
+}
diff --git a/drivers/staging/wimax/i2400m/debug-levels.h b/drivers/staging/wimax/i2400m/debug-levels.h
new file mode 100644
index 000000000000..a317e9fbb734
--- /dev/null
+++ b/drivers/staging/wimax/i2400m/debug-levels.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel Wireless WiMAX Connection 2400m
+ * Debug levels control file for the i2400m module
+ *
+ * Copyright (C) 2007-2008 Intel Corporation
+ * Inaky Perez-Gonzalez
+ */
+#ifndef __debug_levels__h__
+#define __debug_levels__h__
+
+/* Maximum compile and run time debug level for all submodules */
+#define D_MODULENAME i2400m
+#define D_MASTER CONFIG_WIMAX_I2400M_DEBUG_LEVEL
+
+#include "../linux-wimax-debug.h"
+
+/* List of all the enabled modules */
+enum d_module {
+	D_SUBMODULE_DECLARE(control),
+	D_SUBMODULE_DECLARE(driver),
+	D_SUBMODULE_DECLARE(debugfs),
+	D_SUBMODULE_DECLARE(fw),
+	D_SUBMODULE_DECLARE(netdev),
+	D_SUBMODULE_DECLARE(rfkill),
+	D_SUBMODULE_DECLARE(rx),
+	D_SUBMODULE_DECLARE(sysfs),
+	D_SUBMODULE_DECLARE(tx),
+};
+
+
+#endif /* #ifndef __debug_levels__h__ */
diff --git a/drivers/staging/wimax/i2400m/debugfs.c b/drivers/staging/wimax/i2400m/debugfs.c
new file mode 100644
index 000000000000..1c640b41ea4c
--- /dev/null
+++ b/drivers/staging/wimax/i2400m/debugfs.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel Wireless WiMAX Connection 2400m
+ * Debugfs interfaces to manipulate driver and device information
+ *
+ * Copyright (C) 2007 Intel Corporation
+ * Inaky Perez-Gonzalez
+ */
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include "i2400m.h"
+
+
+#define D_SUBMODULE debugfs
+#include "debug-levels.h"
+
+static
+int debugfs_netdev_queue_stopped_get(void *data, u64 *val)
+{
+	struct i2400m *i2400m = data;
+	*val = netif_queue_stopped(i2400m->wimax_dev.net_dev);
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fops_netdev_queue_stopped,
+			 debugfs_netdev_queue_stopped_get,
+			 NULL, "%llu\n");
+
+/*
+ * We don't allow partial reads of this file, as then the reader would
+ * get weirdly confused data as it is updated.
+ *
+ * Either you read it all or nothing; if you try to read with an
+ * offset != 0, we consider you are done reading.
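+ *
+ * The seven space-separated fields are, in order: rx_pl_num,
+ * rx_pl_min, rx_pl_max, rx_num, rx_size_acc, rx_size_min and
+ * rx_size_max (see the snprintf() below); any write to the file
+ * resets them.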
+ */ +static +ssize_t i2400m_rx_stats_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct i2400m *i2400m = filp->private_data; + char buf[128]; + unsigned long flags; + + if (*ppos != 0) + return 0; + if (count < sizeof(buf)) + return -ENOSPC; + spin_lock_irqsave(&i2400m->rx_lock, flags); + snprintf(buf, sizeof(buf), "%u %u %u %u %u %u %u\n", + i2400m->rx_pl_num, i2400m->rx_pl_min, + i2400m->rx_pl_max, i2400m->rx_num, + i2400m->rx_size_acc, + i2400m->rx_size_min, i2400m->rx_size_max); + spin_unlock_irqrestore(&i2400m->rx_lock, flags); + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + + +/* Any write clears the stats */ +static +ssize_t i2400m_rx_stats_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct i2400m *i2400m = filp->private_data; + unsigned long flags; + + spin_lock_irqsave(&i2400m->rx_lock, flags); + i2400m->rx_pl_num = 0; + i2400m->rx_pl_max = 0; + i2400m->rx_pl_min = UINT_MAX; + i2400m->rx_num = 0; + i2400m->rx_size_acc = 0; + i2400m->rx_size_min = UINT_MAX; + i2400m->rx_size_max = 0; + spin_unlock_irqrestore(&i2400m->rx_lock, flags); + return count; +} + +static +const struct file_operations i2400m_rx_stats_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = i2400m_rx_stats_read, + .write = i2400m_rx_stats_write, + .llseek = default_llseek, +}; + + +/* See i2400m_rx_stats_read() */ +static +ssize_t i2400m_tx_stats_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct i2400m *i2400m = filp->private_data; + char buf[128]; + unsigned long flags; + + if (*ppos != 0) + return 0; + if (count < sizeof(buf)) + return -ENOSPC; + spin_lock_irqsave(&i2400m->tx_lock, flags); + snprintf(buf, sizeof(buf), "%u %u %u %u %u %u %u\n", + i2400m->tx_pl_num, i2400m->tx_pl_min, + i2400m->tx_pl_max, i2400m->tx_num, + i2400m->tx_size_acc, + i2400m->tx_size_min, i2400m->tx_size_max); + spin_unlock_irqrestore(&i2400m->tx_lock, flags); + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +/* Any write clears the stats */ +static +ssize_t i2400m_tx_stats_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct i2400m *i2400m = filp->private_data; + unsigned long flags; + + spin_lock_irqsave(&i2400m->tx_lock, flags); + i2400m->tx_pl_num = 0; + i2400m->tx_pl_max = 0; + i2400m->tx_pl_min = UINT_MAX; + i2400m->tx_num = 0; + i2400m->tx_size_acc = 0; + i2400m->tx_size_min = UINT_MAX; + i2400m->tx_size_max = 0; + spin_unlock_irqrestore(&i2400m->tx_lock, flags); + return count; +} + +static +const struct file_operations i2400m_tx_stats_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = i2400m_tx_stats_read, + .write = i2400m_tx_stats_write, + .llseek = default_llseek, +}; + + +/* Write 1 to ask the device to go into suspend */ +static +int debugfs_i2400m_suspend_set(void *data, u64 val) +{ + int result; + struct i2400m *i2400m = data; + result = i2400m_cmd_enter_powersave(i2400m); + if (result >= 0) + result = 0; + return result; +} +DEFINE_DEBUGFS_ATTRIBUTE(fops_i2400m_suspend, + NULL, debugfs_i2400m_suspend_set, + "%llu\n"); + +/* + * Reset the device + * + * Write 0 to ask the device to soft reset, 1 to cold reset, 2 to bus + * reset (as defined by enum i2400m_reset_type). 
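+ *
+ * That is, assuming the declaration order of enum i2400m_reset_type
+ * in i2400m.h: 0 = I2400M_RT_WARM, 1 = I2400M_RT_COLD,
+ * 2 = I2400M_RT_BUS.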
+ */ +static +int debugfs_i2400m_reset_set(void *data, u64 val) +{ + int result; + struct i2400m *i2400m = data; + enum i2400m_reset_type rt = val; + switch(rt) { + case I2400M_RT_WARM: + case I2400M_RT_COLD: + case I2400M_RT_BUS: + result = i2400m_reset(i2400m, rt); + if (result >= 0) + result = 0; + break; + default: + result = -EINVAL; + } + return result; +} +DEFINE_DEBUGFS_ATTRIBUTE(fops_i2400m_reset, + NULL, debugfs_i2400m_reset_set, + "%llu\n"); + +void i2400m_debugfs_add(struct i2400m *i2400m) +{ + struct dentry *dentry = i2400m->wimax_dev.debugfs_dentry; + + dentry = debugfs_create_dir("i2400m", dentry); + i2400m->debugfs_dentry = dentry; + + d_level_register_debugfs("dl_", control, dentry); + d_level_register_debugfs("dl_", driver, dentry); + d_level_register_debugfs("dl_", debugfs, dentry); + d_level_register_debugfs("dl_", fw, dentry); + d_level_register_debugfs("dl_", netdev, dentry); + d_level_register_debugfs("dl_", rfkill, dentry); + d_level_register_debugfs("dl_", rx, dentry); + d_level_register_debugfs("dl_", tx, dentry); + + debugfs_create_size_t("tx_in", 0400, dentry, &i2400m->tx_in); + debugfs_create_size_t("tx_out", 0400, dentry, &i2400m->tx_out); + debugfs_create_u32("state", 0600, dentry, &i2400m->state); + + /* + * Trace received messages from user space + * + * In order to tap the bidirectional message stream in the + * 'msg' pipe, user space can read from the 'msg' pipe; + * however, due to limitations in libnl, we can't know what + * the different applications are sending down to the kernel. + * + * So we have this hack where the driver will echo any message + * received on the msg pipe from user space [through a call to + * wimax_dev->op_msg_from_user() into + * i2400m_op_msg_from_user()] into the 'trace' pipe that this + * driver creates. + * + * So then, reading from both the 'trace' and 'msg' pipes in + * user space will provide a full dump of the traffic. + * + * Write 1 to activate, 0 to clear. + * + * It is not really very atomic, but it is also not too + * critical. + */ + debugfs_create_u8("trace_msg_from_user", 0600, dentry, + &i2400m->trace_msg_from_user); + + debugfs_create_file("netdev_queue_stopped", 0400, dentry, i2400m, + &fops_netdev_queue_stopped); + + debugfs_create_file("rx_stats", 0600, dentry, i2400m, + &i2400m_rx_stats_fops); + + debugfs_create_file("tx_stats", 0600, dentry, i2400m, + &i2400m_tx_stats_fops); + + debugfs_create_file("suspend", 0200, dentry, i2400m, + &fops_i2400m_suspend); + + debugfs_create_file("reset", 0200, dentry, i2400m, &fops_i2400m_reset); +} + +void i2400m_debugfs_rm(struct i2400m *i2400m) +{ + debugfs_remove_recursive(i2400m->debugfs_dentry); +} diff --git a/drivers/staging/wimax/i2400m/driver.c b/drivers/staging/wimax/i2400m/driver.c new file mode 100644 index 000000000000..dc8939ff78c0 --- /dev/null +++ b/drivers/staging/wimax/i2400m/driver.c @@ -0,0 +1,1002 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Intel Wireless WiMAX Connection 2400m + * Generic probe/disconnect, reset and message passing + * + * Copyright (C) 2007-2008 Intel Corporation + * Inaky Perez-Gonzalez + * + * See i2400m.h for driver documentation. This contains helpers for + * the driver model glue [_setup()/_release()], handling device resets + * [_dev_reset_handle()], and the backends for the WiMAX stack ops + * reset [_op_reset()] and message from user [_op_msg_from_user()]. 
+ *
+ * ROADMAP:
+ *
+ * i2400m_op_msg_from_user()
+ *   i2400m_msg_to_dev()
+ *   wimax_msg_to_user_send()
+ *
+ * i2400m_op_reset()
+ *   i2400m->bus_reset()
+ *
+ * i2400m_dev_reset_handle()
+ *   __i2400m_dev_reset_handle()
+ *     __i2400m_dev_stop()
+ *     __i2400m_dev_start()
+ *
+ * i2400m_setup()
+ *   i2400m->bus_setup()
+ *   i2400m_bootrom_init()
+ *   register_netdev()
+ *   wimax_dev_add()
+ *   i2400m_dev_start()
+ *     __i2400m_dev_start()
+ *       i2400m_dev_bootstrap()
+ *       i2400m_tx_setup()
+ *       i2400m->bus_dev_start()
+ *       i2400m_firmware_check()
+ *       i2400m_check_mac_addr()
+ *
+ * i2400m_release()
+ *   i2400m_dev_stop()
+ *     __i2400m_dev_stop()
+ *       i2400m_dev_shutdown()
+ *       i2400m->bus_dev_stop()
+ *     i2400m_tx_release()
+ *   i2400m->bus_release()
+ *   wimax_dev_rm()
+ *   unregister_netdev()
+ */
+#include "i2400m.h"
+#include <linux/etherdevice.h>
+#include "linux-wimax-i2400m.h"
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/suspend.h>
+#include <linux/slab.h>
+
+#define D_SUBMODULE driver
+#include "debug-levels.h"
+
+
+static char i2400m_debug_params[128];
+module_param_string(debug, i2400m_debug_params, sizeof(i2400m_debug_params),
+		    0644);
+MODULE_PARM_DESC(debug,
+		 "String of space-separated NAME:VALUE pairs, where NAMEs "
+		 "are the different debug submodules and VALUE are the "
+		 "initial debug value to set.");
+
+static char i2400m_barkers_params[128];
+module_param_string(barkers, i2400m_barkers_params,
+		    sizeof(i2400m_barkers_params), 0644);
+MODULE_PARM_DESC(barkers,
+		 "String of comma-separated 32-bit values; each is "
+		 "recognized as the value the device sends as a reboot "
+		 "signal; values are appended to a list--setting one value "
+		 "as zero cleans the existing list and starts a new one.");
+
+/*
+ * WiMAX stack operation: relay a message from user space
+ *
+ * @wimax_dev: device descriptor
+ * @pipe_name: named pipe the message is for
+ * @msg_buf: pointer to the message bytes
+ * @msg_len: length of the buffer
+ * @genl_info: passed by the generic netlink layer
+ *
+ * The WiMAX stack will call this function when a message was received
+ * from user space.
+ *
+ * For the i2400m, this is an L3L4 message, as specified in
+ * include/linux/wimax/i2400m.h, and thus prefixed with a 'struct
+ * i2400m_l3l4_hdr'. Driver (and device) expect the messages to be
+ * coded in Little Endian.
+ *
+ * This function just verifies that the header declaration and the
+ * payload are consistent and then deals with it, either forwarding it
+ * to the device or processing it locally.
+ *
+ * In the i2400m, messages are basically commands that will carry an
+ * ack, so we use i2400m_msg_to_dev() and then deliver the ack back to
+ * user space. The rx.c code might intercept the response and use it
+ * to update the driver's state, but then it will pass it on so it can
+ * be relayed back to user space.
+ *
+ * Note that asynchronous events from the device are processed and
+ * sent to user space in rx.c.
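+ *
+ * Rough flow (sketch):
+ *
+ *	user space --('msg' pipe)--> i2400m_op_msg_from_user()
+ *	    i2400m_msg_to_dev()	- send, block until the ack arrives
+ *	    wimax_msg_send()	- relay the ack back up the 'msg' pipe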
+ */ +static +int i2400m_op_msg_from_user(struct wimax_dev *wimax_dev, + const char *pipe_name, + const void *msg_buf, size_t msg_len, + const struct genl_info *genl_info) +{ + int result; + struct i2400m *i2400m = wimax_dev_to_i2400m(wimax_dev); + struct device *dev = i2400m_dev(i2400m); + struct sk_buff *ack_skb; + + d_fnstart(4, dev, "(wimax_dev %p [i2400m %p] msg_buf %p " + "msg_len %zu genl_info %p)\n", wimax_dev, i2400m, + msg_buf, msg_len, genl_info); + ack_skb = i2400m_msg_to_dev(i2400m, msg_buf, msg_len); + result = PTR_ERR(ack_skb); + if (IS_ERR(ack_skb)) + goto error_msg_to_dev; + result = wimax_msg_send(&i2400m->wimax_dev, ack_skb); +error_msg_to_dev: + d_fnend(4, dev, "(wimax_dev %p [i2400m %p] msg_buf %p msg_len %zu " + "genl_info %p) = %d\n", wimax_dev, i2400m, msg_buf, msg_len, + genl_info, result); + return result; +} + + +/* + * Context to wait for a reset to finalize + */ +struct i2400m_reset_ctx { + struct completion completion; + int result; +}; + + +/* + * WiMAX stack operation: reset a device + * + * @wimax_dev: device descriptor + * + * See the documentation for wimax_reset() and wimax_dev->op_reset for + * the requirements of this function. The WiMAX stack guarantees + * serialization on calls to this function. + * + * Do a warm reset on the device; if it fails, resort to a cold reset + * and return -ENODEV. On successful warm reset, we need to block + * until it is complete. + * + * The bus-driver implementation of reset takes care of falling back + * to cold reset if warm fails. + */ +static +int i2400m_op_reset(struct wimax_dev *wimax_dev) +{ + int result; + struct i2400m *i2400m = wimax_dev_to_i2400m(wimax_dev); + struct device *dev = i2400m_dev(i2400m); + struct i2400m_reset_ctx ctx = { + .completion = COMPLETION_INITIALIZER_ONSTACK(ctx.completion), + .result = 0, + }; + + d_fnstart(4, dev, "(wimax_dev %p)\n", wimax_dev); + mutex_lock(&i2400m->init_mutex); + i2400m->reset_ctx = &ctx; + mutex_unlock(&i2400m->init_mutex); + result = i2400m_reset(i2400m, I2400M_RT_WARM); + if (result < 0) + goto out; + result = wait_for_completion_timeout(&ctx.completion, 4*HZ); + if (result == 0) + result = -ETIMEDOUT; + else if (result > 0) + result = ctx.result; + /* if result < 0, pass it on */ + mutex_lock(&i2400m->init_mutex); + i2400m->reset_ctx = NULL; + mutex_unlock(&i2400m->init_mutex); +out: + d_fnend(4, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); + return result; +} + + +/* + * Check the MAC address we got from boot mode is ok + * + * @i2400m: device descriptor + * + * Returns: 0 if ok, < 0 errno code on error. 
+ */ +static +int i2400m_check_mac_addr(struct i2400m *i2400m) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + struct sk_buff *skb; + const struct i2400m_tlv_detailed_device_info *ddi; + struct net_device *net_dev = i2400m->wimax_dev.net_dev; + + d_fnstart(3, dev, "(i2400m %p)\n", i2400m); + skb = i2400m_get_device_info(i2400m); + if (IS_ERR(skb)) { + result = PTR_ERR(skb); + dev_err(dev, "Cannot verify MAC address, error reading: %d\n", + result); + goto error; + } + /* Extract MAC address */ + ddi = (void *) skb->data; + BUILD_BUG_ON(ETH_ALEN != sizeof(ddi->mac_address)); + d_printf(2, dev, "GET DEVICE INFO: mac addr %pM\n", + ddi->mac_address); + if (!memcmp(net_dev->perm_addr, ddi->mac_address, + sizeof(ddi->mac_address))) + goto ok; + dev_warn(dev, "warning: device reports a different MAC address " + "to that of boot mode's\n"); + dev_warn(dev, "device reports %pM\n", ddi->mac_address); + dev_warn(dev, "boot mode reported %pM\n", net_dev->perm_addr); + if (is_zero_ether_addr(ddi->mac_address)) + dev_err(dev, "device reports an invalid MAC address, " + "not updating\n"); + else { + dev_warn(dev, "updating MAC address\n"); + net_dev->addr_len = ETH_ALEN; + memcpy(net_dev->perm_addr, ddi->mac_address, ETH_ALEN); + memcpy(net_dev->dev_addr, ddi->mac_address, ETH_ALEN); + } +ok: + result = 0; + kfree_skb(skb); +error: + d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); + return result; +} + + +/** + * __i2400m_dev_start - Bring up driver communication with the device + * + * @i2400m: device descriptor + * @flags: boot mode flags + * + * Returns: 0 if ok, < 0 errno code on error. + * + * Uploads firmware and brings up all the resources needed to be able + * to communicate with the device. + * + * The workqueue has to be setup early, at least before RX handling + * (it's only real user for now) so it can process reports as they + * arrive. We also want to destroy it if we retry, to make sure it is + * flushed...easier like this. + * + * TX needs to be setup before the bus-specific code (otherwise on + * shutdown, the bus-tx code could try to access it). + */ +static +int __i2400m_dev_start(struct i2400m *i2400m, enum i2400m_bri flags) +{ + int result; + struct wimax_dev *wimax_dev = &i2400m->wimax_dev; + struct net_device *net_dev = wimax_dev->net_dev; + struct device *dev = i2400m_dev(i2400m); + int times = i2400m->bus_bm_retries; + + d_fnstart(3, dev, "(i2400m %p)\n", i2400m); +retry: + result = i2400m_dev_bootstrap(i2400m, flags); + if (result < 0) { + dev_err(dev, "cannot bootstrap device: %d\n", result); + goto error_bootstrap; + } + result = i2400m_tx_setup(i2400m); + if (result < 0) + goto error_tx_setup; + result = i2400m_rx_setup(i2400m); + if (result < 0) + goto error_rx_setup; + i2400m->work_queue = create_singlethread_workqueue(wimax_dev->name); + if (i2400m->work_queue == NULL) { + result = -ENOMEM; + dev_err(dev, "cannot create workqueue\n"); + goto error_create_workqueue; + } + if (i2400m->bus_dev_start) { + result = i2400m->bus_dev_start(i2400m); + if (result < 0) + goto error_bus_dev_start; + } + i2400m->ready = 1; + wmb(); /* see i2400m->ready's documentation */ + /* process pending reports from the device */ + queue_work(i2400m->work_queue, &i2400m->rx_report_ws); + result = i2400m_firmware_check(i2400m); /* fw versions ok? 
*/
+	if (result < 0)
+		goto error_fw_check;
+	/* At this point it is ok to send commands to the device */
+	result = i2400m_check_mac_addr(i2400m);
+	if (result < 0)
+		goto error_check_mac_addr;
+	result = i2400m_dev_initialize(i2400m);
+	if (result < 0)
+		goto error_dev_initialize;
+
+	/* We don't want any additional unwanted error recovery triggered
+	 * from any other context so if anything went wrong before we come
+	 * here, let's keep i2400m->error_recovery untouched and leave it to
+	 * dev_reset_handle(). See dev_reset_handle(). */
+
+	atomic_dec(&i2400m->error_recovery);
+	/* Everything works so far, ok, now we are ready to
+	 * take error recovery if it's required. */
+
+	/* At this point, reports will come for the device and set it
+	 * to the right state if it is different than UNINITIALIZED */
+	d_fnend(3, dev, "(net_dev %p [i2400m %p]) = %d\n",
+		net_dev, i2400m, result);
+	return result;
+
+error_dev_initialize:
+error_check_mac_addr:
+error_fw_check:
+	i2400m->ready = 0;
+	wmb();		/* see i2400m->ready's documentation */
+	flush_workqueue(i2400m->work_queue);
+	if (i2400m->bus_dev_stop)
+		i2400m->bus_dev_stop(i2400m);
+error_bus_dev_start:
+	destroy_workqueue(i2400m->work_queue);
+error_create_workqueue:
+	i2400m_rx_release(i2400m);
+error_rx_setup:
+	i2400m_tx_release(i2400m);
+error_tx_setup:
+error_bootstrap:
+	if (result == -EL3RST && times-- > 0) {
+		flags = I2400M_BRI_SOFT|I2400M_BRI_MAC_REINIT;
+		goto retry;
+	}
+	d_fnend(3, dev, "(net_dev %p [i2400m %p]) = %d\n",
+		net_dev, i2400m, result);
+	return result;
+}
+
+
+static
+int i2400m_dev_start(struct i2400m *i2400m, enum i2400m_bri bm_flags)
+{
+	int result = 0;
+	mutex_lock(&i2400m->init_mutex);	/* Well, start the device */
+	if (i2400m->updown == 0) {
+		result = __i2400m_dev_start(i2400m, bm_flags);
+		if (result >= 0) {
+			i2400m->updown = 1;
+			i2400m->alive = 1;
+			wmb();	/* see i2400m->updown and i2400m->alive's doc */
+		}
+	}
+	mutex_unlock(&i2400m->init_mutex);
+	return result;
+}
+
+
+/**
+ * i2400m_dev_stop - Tear down driver communication with the device
+ *
+ * @i2400m: device descriptor
+ *
+ * Returns: 0 if ok, < 0 errno code on error.
+ *
+ * Releases all the resources allocated to communicate with the
+ * device. Note we cannot destroy the workqueue earlier as until RX is
+ * fully destroyed, it could still try to schedule jobs.
+ */
+static
+void __i2400m_dev_stop(struct i2400m *i2400m)
+{
+	struct wimax_dev *wimax_dev = &i2400m->wimax_dev;
+	struct device *dev = i2400m_dev(i2400m);
+
+	d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
+	wimax_state_change(wimax_dev, __WIMAX_ST_QUIESCING);
+	i2400m_msg_to_dev_cancel_wait(i2400m, -EL3RST);
+	complete(&i2400m->msg_completion);
+	i2400m_net_wake_stop(i2400m);
+	i2400m_dev_shutdown(i2400m);
+	/*
+	 * Make sure no report hooks are running *before* we stop the
+	 * communication infrastructure with the device.
+	 */
+	i2400m->ready = 0;	/* nobody can queue work anymore */
+	wmb();		/* see i2400m->ready's documentation */
+	flush_workqueue(i2400m->work_queue);
+
+	if (i2400m->bus_dev_stop)
+		i2400m->bus_dev_stop(i2400m);
+	destroy_workqueue(i2400m->work_queue);
+	i2400m_rx_release(i2400m);
+	i2400m_tx_release(i2400m);
+	wimax_state_change(wimax_dev, WIMAX_ST_DOWN);
+	d_fnend(3, dev, "(i2400m %p) = 0\n", i2400m);
+}
+
+
+/*
+ * Watch out -- we only need to stop if there is a need for it. The
+ * device could have reset itself and failed to come up again (see
+ * _i2400m_dev_reset_handle()).
+ */
+static
+void i2400m_dev_stop(struct i2400m *i2400m)
+{
+	mutex_lock(&i2400m->init_mutex);
+	if (i2400m->updown) {
+		__i2400m_dev_stop(i2400m);
+		i2400m->updown = 0;
+		i2400m->alive = 0;
+		wmb();	/* see i2400m->updown and i2400m->alive's doc */
+	}
+	mutex_unlock(&i2400m->init_mutex);
+}
+
+
+/*
+ * Listen to PM events to cache the firmware before suspend/hibernation
+ *
+ * When the device comes out of suspend, it might go into reset and
+ * firmware has to be uploaded again. At resume, most of the time, we
+ * can't load firmware images from disk, so we need to cache it.
+ *
+ * i2400m_fw_cache() will allocate a kobject and attach the firmware
+ * to it; that way we don't have to worry too much about the fw loader
+ * hitting a race condition.
+ *
+ * Note: modus operandi stolen from the Orinoco driver; thx.
+ */
+static
+int i2400m_pm_notifier(struct notifier_block *notifier,
+		       unsigned long pm_event,
+		       void *unused)
+{
+	struct i2400m *i2400m =
+		container_of(notifier, struct i2400m, pm_notifier);
+	struct device *dev = i2400m_dev(i2400m);
+
+	d_fnstart(3, dev, "(i2400m %p pm_event %lx)\n", i2400m, pm_event);
+	switch (pm_event) {
+	case PM_HIBERNATION_PREPARE:
+	case PM_SUSPEND_PREPARE:
+		i2400m_fw_cache(i2400m);
+		break;
+	case PM_POST_RESTORE:
+		/* Restore from hibernation failed. We need to clean
+		 * up in exactly the same way, so fall through. */
+	case PM_POST_HIBERNATION:
+	case PM_POST_SUSPEND:
+		i2400m_fw_uncache(i2400m);
+		break;
+
+	case PM_RESTORE_PREPARE:
+	default:
+		break;
+	}
+	d_fnend(3, dev, "(i2400m %p pm_event %lx) = void\n", i2400m, pm_event);
+	return NOTIFY_DONE;
+}
+
+
+/*
+ * pre-reset is called before a device is going on reset
+ *
+ * This has to be followed by a call to i2400m_post_reset(), otherwise
+ * bad things might happen.
+ */
+int i2400m_pre_reset(struct i2400m *i2400m)
+{
+	struct device *dev = i2400m_dev(i2400m);
+
+	d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
+	d_printf(1, dev, "pre-reset shut down\n");
+
+	mutex_lock(&i2400m->init_mutex);
+	if (i2400m->updown) {
+		netif_tx_disable(i2400m->wimax_dev.net_dev);
+		__i2400m_dev_stop(i2400m);
+		/* don't set updown to zero -- this way
+		 * post_reset can restore properly */
+	}
+	mutex_unlock(&i2400m->init_mutex);
+	if (i2400m->bus_release)
+		i2400m->bus_release(i2400m);
+	d_fnend(3, dev, "(i2400m %p) = 0\n", i2400m);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(i2400m_pre_reset);
+
+
+/*
+ * Restore device state after a reset
+ *
+ * Do the work needed after a device reset to bring it up to the same
+ * state as it was before the reset.
+ *
+ * NOTE: this requires i2400m->init_mutex taken
+ */
+int i2400m_post_reset(struct i2400m *i2400m)
+{
+	int result = 0;
+	struct device *dev = i2400m_dev(i2400m);
+
+	d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
+	d_printf(1, dev, "post-reset start\n");
+	if (i2400m->bus_setup) {
+		result = i2400m->bus_setup(i2400m);
+		if (result < 0) {
+			dev_err(dev, "bus-specific setup failed: %d\n",
+				result);
+			goto error_bus_setup;
+		}
+	}
+	mutex_lock(&i2400m->init_mutex);
+	if (i2400m->updown) {
+		result = __i2400m_dev_start(
+			i2400m, I2400M_BRI_SOFT | I2400M_BRI_MAC_REINIT);
+		if (result < 0)
+			goto error_dev_start;
+	}
+	mutex_unlock(&i2400m->init_mutex);
+	d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result);
+	return result;
+
+error_dev_start:
+	if (i2400m->bus_release)
+		i2400m->bus_release(i2400m);
+	/* even if the device was up, it could not be recovered, so we
	 * mark it as down. */
+	i2400m->updown = 0;
+	wmb();		/* see i2400m->updown's documentation */
+	mutex_unlock(&i2400m->init_mutex);
+error_bus_setup:
+	d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result);
+	return result;
+}
+EXPORT_SYMBOL_GPL(i2400m_post_reset);
+
+
+/*
+ * The device has rebooted; fix up the device and the driver
+ *
+ * Tear down the driver communication with the device, reload the
+ * firmware and reinitialize the communication with the device.
+ *
+ * If someone calls a reset when the device's firmware is down, in
+ * theory we won't see it because we are not listening. However, just
+ * in case, leave the code to handle it.
+ *
+ * If there is a reset context, use it; this means someone is waiting
+ * for us to tell them when the reset operation is complete and the
+ * device is ready to rock again.
+ *
+ * NOTE: if we are in the process of bringing up or down the
+ * communication with the device [running i2400m_dev_start() or
+ * _stop()], don't do anything, let it fail and handle it.
+ *
+ * This function always runs in a thread context.
+ *
+ * This function gets passed, as payload to i2400m_work() a 'const
+ * char *' ptr with a "reason" why the reset happened (for messages).
+ */
+static
+void __i2400m_dev_reset_handle(struct work_struct *ws)
+{
+	struct i2400m *i2400m = container_of(ws, struct i2400m, reset_ws);
+	const char *reason = i2400m->reset_reason;
+	struct device *dev = i2400m_dev(i2400m);
+	struct i2400m_reset_ctx *ctx = i2400m->reset_ctx;
+	int result;
+
+	d_fnstart(3, dev, "(ws %p i2400m %p reason %s)\n", ws, i2400m, reason);
+
+	i2400m->boot_mode = 1;
+	wmb();		/* Make sure i2400m_msg_to_dev() sees boot_mode */
+
+	result = 0;
+	if (mutex_trylock(&i2400m->init_mutex) == 0) {
+		/* We are still in i2400m_dev_start() [let it fail] or
+		 * i2400m_dev_stop() [we are shutting down anyway, so
+		 * ignore it] or we are resetting somewhere else. */
+		dev_err(dev, "device rebooted somewhere else?\n");
+		i2400m_msg_to_dev_cancel_wait(i2400m, -EL3RST);
+		complete(&i2400m->msg_completion);
+		goto out;
+	}
+
+	dev_err(dev, "%s: reinitializing driver\n", reason);
+	rmb();
+	if (i2400m->updown) {
+		__i2400m_dev_stop(i2400m);
+		i2400m->updown = 0;
+		wmb();	/* see i2400m->updown's documentation */
+	}
+
+	if (i2400m->alive) {
+		result = __i2400m_dev_start(i2400m,
+				    I2400M_BRI_SOFT | I2400M_BRI_MAC_REINIT);
+		if (result < 0) {
+			dev_err(dev, "%s: cannot start the device: %d\n",
+				reason, result);
+			result = -EUCLEAN;
+			if (atomic_read(&i2400m->bus_reset_retries)
+					>= I2400M_BUS_RESET_RETRIES) {
+				result = -ENODEV;
+				dev_err(dev, "tried too many times to "
+					"reset the device, giving up\n");
+			}
+		}
+	}
+
+	if (i2400m->reset_ctx) {
+		ctx->result = result;
+		complete(&ctx->completion);
+	}
+	mutex_unlock(&i2400m->init_mutex);
+	if (result == -EUCLEAN) {
+		/*
+		 * We come here because the reset during operational mode
+		 * wasn't successfully done and need to proceed to a bus
+		 * reset. For the dev_reset_handle() to be able to handle
+		 * the reset event later properly, we restore boot_mode back
		 * to the state before the previous reset, i.e., just like
+		 * we are issuing the bus reset for the first time
+		 */
+		i2400m->boot_mode = 0;
+		wmb();
+
+		atomic_inc(&i2400m->bus_reset_retries);
+		/* oops, need to clean up [w/ init_mutex not held] */
+		result = i2400m_reset(i2400m, I2400M_RT_BUS);
+		if (result >= 0)
+			result = -ENODEV;
+	} else {
+		rmb();
+		if (i2400m->alive) {
+			/* great, we expect the device state up and
+			 * dev_start() actually brings the device state up */
+			i2400m->updown = 1;
+			wmb();
+			atomic_set(&i2400m->bus_reset_retries, 0);
+		}
+	}
+out:
+	d_fnend(3, dev, "(ws %p i2400m %p reason %s) = void\n",
+		ws, i2400m, reason);
+}
+
+
+/**
+ * i2400m_dev_reset_handle - Handle a device's reset in a thread context
+ *
+ * Schedule a device reset handling out on a thread context, so it
+ * is safe to call from atomic context. We can't use the i2400m's
+ * queue as we are going to destroy it and reinitialize it as part of
+ * the driver bringup/bringup process.
+ *
+ * See __i2400m_dev_reset_handle() for details; that takes care of
+ * reinitializing the driver to handle the reset, calling into the
+ * bus-specific functions ops as needed.
+ */
+int i2400m_dev_reset_handle(struct i2400m *i2400m, const char *reason)
+{
+	i2400m->reset_reason = reason;
+	return schedule_work(&i2400m->reset_ws);
+}
+EXPORT_SYMBOL_GPL(i2400m_dev_reset_handle);
+
+
+/*
+ * The actual work of error recovery.
+ *
+ * The current implementation of error recovery is to trigger a bus reset.
+ */
+static
+void __i2400m_error_recovery(struct work_struct *ws)
+{
+	struct i2400m *i2400m = container_of(ws, struct i2400m, recovery_ws);
+
+	i2400m_reset(i2400m, I2400M_RT_BUS);
+}
+
+/*
+ * Schedule a work struct for error recovery.
+ *
+ * The intention of error recovery is to bring back the device to some
+ * known state whenever TX sees -110 (-ETIMEDOUT) on copying the data to
+ * the device. The TX failure could mean a device bus stuck, so the current
+ * error recovery implementation is to trigger a bus reset to the device
+ * and hopefully it can bring back the device.
+ *
+ * The actual work of error recovery has to be in a thread context because
+ * it is kicked off in the TX thread (i2400ms->tx_workqueue) which is to be
+ * destroyed by the error recovery mechanism (currently a bus reset).
+ *
+ * Also, there may be already a queue of TX works that all hit
+ * the -ETIMEDOUT error condition because the device is stuck already.
+ * Since bus reset is used as the error recovery mechanism and we don't
+ * want consecutive bus resets simply because the multiple TX works
+ * in the queue all hit the same device erratum, the flag "error_recovery"
+ * is introduced for preventing unwanted consecutive bus resets.
+ *
+ * Error recovery shall only be invoked again if the previous one was
+ * completed. The flag error_recovery is set when error recovery is
+ * scheduled, and is checked when we need to schedule another error
+ * recovery. If it is in place already, then we shouldn't schedule
+ * another one.
+ */
+void i2400m_error_recovery(struct i2400m *i2400m)
+{
+	if (atomic_add_return(1, &i2400m->error_recovery) == 1)
+		schedule_work(&i2400m->recovery_ws);
+	else
+		atomic_dec(&i2400m->error_recovery);
+}
+EXPORT_SYMBOL_GPL(i2400m_error_recovery);
+
+/*
+ * Alloc the command and ack buffers for boot mode
+ *
+ * Get the buffers needed to deal with boot mode messages.
+ */
+static
+int i2400m_bm_buf_alloc(struct i2400m *i2400m)
+{
+ i2400m->bm_cmd_buf = kzalloc(I2400M_BM_CMD_BUF_SIZE, GFP_KERNEL);
+ if (i2400m->bm_cmd_buf == NULL)
+ goto error_bm_cmd_kzalloc;
+ i2400m->bm_ack_buf = kzalloc(I2400M_BM_ACK_BUF_SIZE, GFP_KERNEL);
+ if (i2400m->bm_ack_buf == NULL)
+ goto error_bm_ack_buf_kzalloc;
+ return 0;
+
+error_bm_ack_buf_kzalloc:
+ kfree(i2400m->bm_cmd_buf);
+error_bm_cmd_kzalloc:
+ return -ENOMEM;
+}
+
+
+/*
+ * Free boot mode command and ack buffers.
+ */
+static
+void i2400m_bm_buf_free(struct i2400m *i2400m)
+{
+ kfree(i2400m->bm_ack_buf);
+ kfree(i2400m->bm_cmd_buf);
+}
+
+
+/**
+ * i2400m_init - Initialize a 'struct i2400m' from all zeroes
+ *
+ * This is a bus-generic API call.
+ */
+void i2400m_init(struct i2400m *i2400m)
+{
+ wimax_dev_init(&i2400m->wimax_dev);
+
+ i2400m->boot_mode = 1;
+ i2400m->rx_reorder = 1;
+ init_waitqueue_head(&i2400m->state_wq);
+
+ spin_lock_init(&i2400m->tx_lock);
+ i2400m->tx_pl_min = UINT_MAX;
+ i2400m->tx_size_min = UINT_MAX;
+
+ spin_lock_init(&i2400m->rx_lock);
+ i2400m->rx_pl_min = UINT_MAX;
+ i2400m->rx_size_min = UINT_MAX;
+ INIT_LIST_HEAD(&i2400m->rx_reports);
+ INIT_WORK(&i2400m->rx_report_ws, i2400m_report_hook_work);
+
+ mutex_init(&i2400m->msg_mutex);
+ init_completion(&i2400m->msg_completion);
+
+ mutex_init(&i2400m->init_mutex);
+ /* wake_tx_ws is initialized in i2400m_tx_setup() */
+
+ INIT_WORK(&i2400m->reset_ws, __i2400m_dev_reset_handle);
+ INIT_WORK(&i2400m->recovery_ws, __i2400m_error_recovery);
+
+ atomic_set(&i2400m->bus_reset_retries, 0);
+
+ i2400m->alive = 0;
+
+ /* initialize error_recovery to 1, denoting that we are not yet
+ * ready to take any error recovery action */
+ atomic_set(&i2400m->error_recovery, 1);
+}
+EXPORT_SYMBOL_GPL(i2400m_init);
+
+
+int i2400m_reset(struct i2400m *i2400m, enum i2400m_reset_type rt)
+{
+ struct net_device *net_dev = i2400m->wimax_dev.net_dev;
+
+ /*
+ * Make sure we stop TXs and down the carrier before
+ * resetting; this is needed to avoid things like
+ * i2400m_wake_tx() scheduling stuff in parallel.
+ */
+ if (net_dev->reg_state == NETREG_REGISTERED) {
+ netif_tx_disable(net_dev);
+ netif_carrier_off(net_dev);
+ }
+ return i2400m->bus_reset(i2400m, rt);
+}
+EXPORT_SYMBOL_GPL(i2400m_reset);
+
+
+/**
+ * i2400m_setup - bus-generic setup function for the i2400m device
+ *
+ * @i2400m: device descriptor (bus-specific parts have been initialized)
+ *
+ * Returns: 0 if ok, < 0 errno code on error.
+ *
+ * Sets up basic device communication infrastructure, boots the ROM to
+ * read the MAC address, registers with the WiMAX and network stacks
+ * and then brings up the device.
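+ *
+ * A sketch of how a bus driver's probe path is expected to use this
+ * (the bus_setup hook is optional and the flags shown are only an
+ * example):
+ *
+ *   i2400m_init(i2400m);                 [zero out the generic state]
+ *   i2400m->bus_setup = ...;             [optional, may be left NULL]
+ *   result = i2400m_setup(i2400m, I2400M_BRI_MAC_REINIT);
+ *   if (result < 0)
+ *           goto error;    [i2400m_setup() already undid its own work]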
+ */ +int i2400m_setup(struct i2400m *i2400m, enum i2400m_bri bm_flags) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + struct wimax_dev *wimax_dev = &i2400m->wimax_dev; + struct net_device *net_dev = i2400m->wimax_dev.net_dev; + + d_fnstart(3, dev, "(i2400m %p)\n", i2400m); + + snprintf(wimax_dev->name, sizeof(wimax_dev->name), + "i2400m-%s:%s", dev->bus->name, dev_name(dev)); + + result = i2400m_bm_buf_alloc(i2400m); + if (result < 0) { + dev_err(dev, "cannot allocate bootmode scratch buffers\n"); + goto error_bm_buf_alloc; + } + + if (i2400m->bus_setup) { + result = i2400m->bus_setup(i2400m); + if (result < 0) { + dev_err(dev, "bus-specific setup failed: %d\n", + result); + goto error_bus_setup; + } + } + + result = i2400m_bootrom_init(i2400m, bm_flags); + if (result < 0) { + dev_err(dev, "read mac addr: bootrom init " + "failed: %d\n", result); + goto error_bootrom_init; + } + result = i2400m_read_mac_addr(i2400m); + if (result < 0) + goto error_read_mac_addr; + eth_random_addr(i2400m->src_mac_addr); + + i2400m->pm_notifier.notifier_call = i2400m_pm_notifier; + register_pm_notifier(&i2400m->pm_notifier); + + result = register_netdev(net_dev); /* Okey dokey, bring it up */ + if (result < 0) { + dev_err(dev, "cannot register i2400m network device: %d\n", + result); + goto error_register_netdev; + } + netif_carrier_off(net_dev); + + i2400m->wimax_dev.op_msg_from_user = i2400m_op_msg_from_user; + i2400m->wimax_dev.op_rfkill_sw_toggle = i2400m_op_rfkill_sw_toggle; + i2400m->wimax_dev.op_reset = i2400m_op_reset; + + result = wimax_dev_add(&i2400m->wimax_dev, net_dev); + if (result < 0) + goto error_wimax_dev_add; + + /* Now setup all that requires a registered net and wimax device. */ + result = sysfs_create_group(&net_dev->dev.kobj, &i2400m_dev_attr_group); + if (result < 0) { + dev_err(dev, "cannot setup i2400m's sysfs: %d\n", result); + goto error_sysfs_setup; + } + + i2400m_debugfs_add(i2400m); + + result = i2400m_dev_start(i2400m, bm_flags); + if (result < 0) + goto error_dev_start; + d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); + return result; + +error_dev_start: + i2400m_debugfs_rm(i2400m); + sysfs_remove_group(&i2400m->wimax_dev.net_dev->dev.kobj, + &i2400m_dev_attr_group); +error_sysfs_setup: + wimax_dev_rm(&i2400m->wimax_dev); +error_wimax_dev_add: + unregister_netdev(net_dev); +error_register_netdev: + unregister_pm_notifier(&i2400m->pm_notifier); +error_read_mac_addr: +error_bootrom_init: + if (i2400m->bus_release) + i2400m->bus_release(i2400m); +error_bus_setup: + i2400m_bm_buf_free(i2400m); +error_bm_buf_alloc: + d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); + return result; +} +EXPORT_SYMBOL_GPL(i2400m_setup); + + +/** + * i2400m_release - release the bus-generic driver resources + * + * Sends a disconnect message and undoes any setup done by i2400m_setup() + */ +void i2400m_release(struct i2400m *i2400m) +{ + struct device *dev = i2400m_dev(i2400m); + + d_fnstart(3, dev, "(i2400m %p)\n", i2400m); + netif_stop_queue(i2400m->wimax_dev.net_dev); + + i2400m_dev_stop(i2400m); + + cancel_work_sync(&i2400m->reset_ws); + cancel_work_sync(&i2400m->recovery_ws); + + i2400m_debugfs_rm(i2400m); + sysfs_remove_group(&i2400m->wimax_dev.net_dev->dev.kobj, + &i2400m_dev_attr_group); + wimax_dev_rm(&i2400m->wimax_dev); + unregister_netdev(i2400m->wimax_dev.net_dev); + unregister_pm_notifier(&i2400m->pm_notifier); + if (i2400m->bus_release) + i2400m->bus_release(i2400m); + i2400m_bm_buf_free(i2400m); + d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); +} 
+EXPORT_SYMBOL_GPL(i2400m_release); + + +/* + * Debug levels control; see debug.h + */ +struct d_level D_LEVEL[] = { + D_SUBMODULE_DEFINE(control), + D_SUBMODULE_DEFINE(driver), + D_SUBMODULE_DEFINE(debugfs), + D_SUBMODULE_DEFINE(fw), + D_SUBMODULE_DEFINE(netdev), + D_SUBMODULE_DEFINE(rfkill), + D_SUBMODULE_DEFINE(rx), + D_SUBMODULE_DEFINE(sysfs), + D_SUBMODULE_DEFINE(tx), +}; +size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); + + +static +int __init i2400m_driver_init(void) +{ + d_parse_params(D_LEVEL, D_LEVEL_SIZE, i2400m_debug_params, + "i2400m.debug"); + return i2400m_barker_db_init(i2400m_barkers_params); +} +module_init(i2400m_driver_init); + +static +void __exit i2400m_driver_exit(void) +{ + i2400m_barker_db_exit(); +} +module_exit(i2400m_driver_exit); + +MODULE_AUTHOR("Intel Corporation "); +MODULE_DESCRIPTION("Intel 2400M WiMAX networking bus-generic driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/staging/wimax/i2400m/fw.c b/drivers/staging/wimax/i2400m/fw.c new file mode 100644 index 000000000000..6c9a41bff2e0 --- /dev/null +++ b/drivers/staging/wimax/i2400m/fw.c @@ -0,0 +1,1653 @@ +/* + * Intel Wireless WiMAX Connection 2400m + * Firmware uploader + * + * + * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * Intel Corporation + * Yanir Lubetkin + * Inaky Perez-Gonzalez + * - Initial implementation + * + * + * THE PROCEDURE + * + * The 2400m and derived devices work in two modes: boot-mode or + * normal mode. In boot mode we can execute only a handful of commands + * targeted at uploading the firmware and launching it. + * + * The 2400m enters boot mode when it is first connected to the + * system, when it crashes and when you ask it to reboot. There are + * two submodes of the boot mode: signed and non-signed. Signed takes + * firmwares signed with a certain private key, non-signed takes any + * firmware. Normal hardware takes only signed firmware. 
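+ *
+ * (For reference: a barker on the wire is simply four identical le32
+ * words; the ack barker, for instance, is built below as
+ * i2400m_ACK_BARKER[4], four copies of cpu_to_le32(I2400M_ACK_BARKER).)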
+ *
+ * On boot mode, in USB, we write to the device using the bulk out
+ * endpoint and read from it in the notification endpoint.
+ *
+ * Upon entrance to boot mode, the device sends (preceded with a few
+ * zero length packets (ZLPs) on the notification endpoint in USB) a
+ * reboot barker (4 le32 words with the same value). We ack it by
+ * sending the same barker to the device. The device acks with a
+ * reboot ack barker (4 le32 words with value I2400M_ACK_BARKER) and
+ * then is fully booted. At this point we can upload the firmware.
+ *
+ * Note that different iterations of the device and EEPROM
+ * configurations will send different [re]boot barkers; these are
+ * collected in i2400m_barker_db along with the firmware
+ * characteristics they require.
+ *
+ * This process is accomplished by the i2400m_bootrom_init()
+ * function. All the device interaction happens through the
+ * i2400m_bm_cmd() [boot mode command]. Special return values will
+ * indicate if the device did reset during the process.
+ *
+ * After this, we read the MAC address and then (if needed)
+ * reinitialize the device. We need to read it ahead of time because
+ * in the future, we might not upload the firmware until userspace
+ * 'ifconfig up's the device.
+ *
+ * We can then upload the firmware file. The file is composed of a BCF
+ * header (basic data, keys and signatures) and a list of write
+ * commands and payloads. Optionally more BCF headers might follow the
+ * main payload. We first upload the header [i2400m_dnload_init()] and
+ * then pass the commands and payloads verbatim to the i2400m_bm_cmd()
+ * function [i2400m_dnload_bcf()]. Then we tell the device to jump to
+ * the new firmware [i2400m_dnload_finalize()].
+ *
+ * Once firmware is uploaded, we are good to go :)
+ *
+ * When we don't know in which mode we are, we first try by sending a
+ * warm reset request that will take us to boot-mode. If we time out
+ * waiting for a reboot barker, that means maybe we are already in
+ * boot mode, so we send a reboot barker.
+ *
+ * COMMAND EXECUTION
+ *
+ * This code (and process) is single threaded; for executing commands,
+ * we post a URB to the notification endpoint, post the command, and
+ * wait for data on the notification buffer. We don't need to worry
+ * about others as we know we are the only ones in there.
+ *
+ * BACKEND IMPLEMENTATION
+ *
+ * This code is bus-generic; the bus-specific driver provides back end
+ * implementations to send a boot mode command to the device and to
+ * read an acknowledgement (or an asynchronous notification) from it.
+ *
+ * FIRMWARE LOADING
+ *
+ * Note that in some cases, we can't just load a firmware file (for
+ * example, when resuming). For that, we might cache the firmware
+ * file. Thus, when doing the bootstrap, if there is a cached firmware
+ * file, it is used; if not, loading from disk is attempted.
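+ *
+ * A sketch of that flow -- the exact power management events are
+ * handled by i2400m_pm_notifier() in the generic driver, so take the
+ * pairing below as an illustration rather than the literal call sites:
+ *
+ *   i2400m_fw_cache(i2400m);     [before suspending: pin the fw in RAM]
+ *   ...suspend / resume...
+ *   i2400m_fw_uncache(i2400m);   [after resuming: drop the reference]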
+ *
+ * ROADMAP
+ *
+ * i2400m_barker_db_init              Called by i2400m_driver_init()
+ *   i2400m_barker_db_add
+ *
+ * i2400m_barker_db_exit              Called by i2400m_driver_exit()
+ *
+ * i2400m_dev_bootstrap               Called by __i2400m_dev_start()
+ *   request_firmware
+ *   i2400m_fw_bootstrap
+ *     i2400m_fw_check
+ *       i2400m_fw_hdr_check
+ *     i2400m_fw_dnload
+ *   release_firmware
+ *
+ * i2400m_fw_dnload
+ *   i2400m_bootrom_init
+ *     i2400m_bm_cmd
+ *     i2400m_reset
+ *   i2400m_dnload_init
+ *     i2400m_dnload_init_signed
+ *     i2400m_dnload_init_nonsigned
+ *       i2400m_download_chunk
+ *         i2400m_bm_cmd
+ *   i2400m_dnload_bcf
+ *     i2400m_bm_cmd
+ *   i2400m_dnload_finalize
+ *     i2400m_bm_cmd
+ *
+ * i2400m_bm_cmd
+ *   i2400m->bus_bm_cmd_send()
+ *   i2400m->bus_bm_wait_for_ack
+ *     __i2400m_bm_ack_verify
+ *       i2400m_is_boot_barker
+ *
+ * i2400m_bm_cmd_prepare              Used by bus-drivers to prep
+ *                                    commands before sending
+ *
+ * i2400m_pm_notifier                 Called on Power Management events
+ *   i2400m_fw_cache
+ *   i2400m_fw_uncache
+ */
+#include <linux/firmware.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/usb.h>
+#include <linux/export.h>
+#include "i2400m.h"
+
+
+#define D_SUBMODULE fw
+#include "debug-levels.h"
+
+
+static const __le32 i2400m_ACK_BARKER[4] = {
+ cpu_to_le32(I2400M_ACK_BARKER),
+ cpu_to_le32(I2400M_ACK_BARKER),
+ cpu_to_le32(I2400M_ACK_BARKER),
+ cpu_to_le32(I2400M_ACK_BARKER)
+};
+
+
+/**
+ * Prepare a boot-mode command for delivery
+ *
+ * @cmd: pointer to bootrom header to prepare
+ *
+ * Computes checksum if so needed. After calling this function, DO NOT
+ * modify the command or header as the checksum won't work anymore.
+ *
+ * We do it from here because sometimes we cannot do it in the
+ * original context the command was sent from (it is a const), so when
+ * we copy it to our staging buffer, we add the checksum there.
+ */
+void i2400m_bm_cmd_prepare(struct i2400m_bootrom_header *cmd)
+{
+ if (i2400m_brh_get_use_checksum(cmd)) {
+ int i;
+ u32 checksum = 0;
+ const u32 *checksum_ptr = (void *) cmd->payload;
+ for (i = 0; i < cmd->data_size / 4; i++)
+ checksum += cpu_to_le32(*checksum_ptr++);
+ checksum += cmd->command + cmd->target_addr + cmd->data_size;
+ cmd->block_checksum = cpu_to_le32(checksum);
+ }
+}
+EXPORT_SYMBOL_GPL(i2400m_bm_cmd_prepare);
+
+
+/*
+ * Database of known barkers.
+ *
+ * A barker is what the device sends indicating it is ready to be
+ * bootloaded. Different versions of the device will send different
+ * barkers. Depending on the barker, the device might want one kind
+ * of firmware or another.
+ */
+static struct i2400m_barker_db {
+ __le32 data[4];
+} *i2400m_barker_db;
+static size_t i2400m_barker_db_used, i2400m_barker_db_size;
+
+
+static
+int i2400m_zrealloc_2x(void **ptr, size_t *_count, size_t el_size,
+ gfp_t gfp_flags)
+{
+ size_t old_count = *_count,
+ new_count = old_count ? 2 * old_count : 2,
+ old_size = el_size * old_count,
+ new_size = el_size * new_count;
+ void *nptr = krealloc(*ptr, new_size, gfp_flags);
+ if (nptr) {
+ /* zero the other half or the whole thing if old_count
+ * was zero */
+ if (old_size == 0)
+ memset(nptr, 0, new_size);
+ else
+ memset(nptr + old_size, 0, old_size);
+ *_count = new_count;
+ *ptr = nptr;
+ return 0;
+ } else
+ return -ENOMEM;
+}
+
+
+/*
+ * Add a barker to the database
+ *
+ * This cannot be used outside of this module, and only at
+ * module_init time. This is to avoid the need to do locking.
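+ *
+ * For illustration, registering one extra (hypothetical, made-up
+ * value) barker at init time looks like:
+ *
+ *   result = i2400m_barker_db_add(0x4d454948);   [hypothetical id]
+ *   if (result < 0)
+ *           goto error_add;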
+ */ +static +int i2400m_barker_db_add(u32 barker_id) +{ + int result; + + struct i2400m_barker_db *barker; + if (i2400m_barker_db_used >= i2400m_barker_db_size) { + result = i2400m_zrealloc_2x( + (void **) &i2400m_barker_db, &i2400m_barker_db_size, + sizeof(i2400m_barker_db[0]), GFP_KERNEL); + if (result < 0) + return result; + } + barker = i2400m_barker_db + i2400m_barker_db_used++; + barker->data[0] = le32_to_cpu(barker_id); + barker->data[1] = le32_to_cpu(barker_id); + barker->data[2] = le32_to_cpu(barker_id); + barker->data[3] = le32_to_cpu(barker_id); + return 0; +} + + +void i2400m_barker_db_exit(void) +{ + kfree(i2400m_barker_db); + i2400m_barker_db = NULL; + i2400m_barker_db_size = 0; + i2400m_barker_db_used = 0; +} + + +/* + * Helper function to add all the known stable barkers to the barker + * database. + */ +static +int i2400m_barker_db_known_barkers(void) +{ + int result; + + result = i2400m_barker_db_add(I2400M_NBOOT_BARKER); + if (result < 0) + goto error_add; + result = i2400m_barker_db_add(I2400M_SBOOT_BARKER); + if (result < 0) + goto error_add; + result = i2400m_barker_db_add(I2400M_SBOOT_BARKER_6050); + if (result < 0) + goto error_add; +error_add: + return result; +} + + +/* + * Initialize the barker database + * + * This can only be used from the module_init function for this + * module; this is to avoid the need to do locking. + * + * @options: command line argument with extra barkers to + * recognize. This is a comma-separated list of 32-bit hex + * numbers. They are appended to the existing list. Setting 0 + * cleans the existing list and starts a new one. + */ +int i2400m_barker_db_init(const char *_options) +{ + int result; + char *options = NULL, *options_orig, *token; + + i2400m_barker_db = NULL; + i2400m_barker_db_size = 0; + i2400m_barker_db_used = 0; + + result = i2400m_barker_db_known_barkers(); + if (result < 0) + goto error_add; + /* parse command line options from i2400m.barkers */ + if (_options != NULL) { + unsigned barker; + + options_orig = kstrdup(_options, GFP_KERNEL); + if (options_orig == NULL) { + result = -ENOMEM; + goto error_parse; + } + options = options_orig; + + while ((token = strsep(&options, ",")) != NULL) { + if (*token == '\0') /* eat joint commas */ + continue; + if (sscanf(token, "%x", &barker) != 1 + || barker > 0xffffffff) { + printk(KERN_ERR "%s: can't recognize " + "i2400m.barkers value '%s' as " + "a 32-bit number\n", + __func__, token); + result = -EINVAL; + goto error_parse; + } + if (barker == 0) { + /* clean list and start new */ + i2400m_barker_db_exit(); + continue; + } + result = i2400m_barker_db_add(barker); + if (result < 0) + goto error_parse_add; + } + kfree(options_orig); + } + return 0; + +error_parse_add: +error_parse: + kfree(options_orig); +error_add: + kfree(i2400m_barker_db); + return result; +} + + +/* + * Recognize a boot barker + * + * @buf: buffer where the boot barker. + * @buf_size: size of the buffer (has to be 16 bytes). It is passed + * here so the function can check it for the caller. + * + * Note that as a side effect, upon identifying the obtained boot + * barker, this function will set i2400m->barker to point to the right + * barker database entry. Subsequent calls to the function will result + * in verifying that the same type of boot barker is returned when the + * device [re]boots (as long as the same device instance is used). + * + * Return: 0 if @buf matches a known boot barker. 
-ENOENT if the + * buffer in @buf doesn't match any boot barker in the database or + * -EILSEQ if the buffer doesn't have the right size. + */ +int i2400m_is_boot_barker(struct i2400m *i2400m, + const void *buf, size_t buf_size) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + struct i2400m_barker_db *barker; + int i; + + result = -ENOENT; + if (buf_size != sizeof(i2400m_barker_db[i].data)) + return result; + + /* Short circuit if we have already discovered the barker + * associated with the device. */ + if (i2400m->barker && + !memcmp(buf, i2400m->barker, sizeof(i2400m->barker->data))) + return 0; + + for (i = 0; i < i2400m_barker_db_used; i++) { + barker = &i2400m_barker_db[i]; + BUILD_BUG_ON(sizeof(barker->data) != 16); + if (memcmp(buf, barker->data, sizeof(barker->data))) + continue; + + if (i2400m->barker == NULL) { + i2400m->barker = barker; + d_printf(1, dev, "boot barker set to #%u/%08x\n", + i, le32_to_cpu(barker->data[0])); + if (barker->data[0] == le32_to_cpu(I2400M_NBOOT_BARKER)) + i2400m->sboot = 0; + else + i2400m->sboot = 1; + } else if (i2400m->barker != barker) { + dev_err(dev, "HW inconsistency: device " + "reports a different boot barker " + "than set (from %08x to %08x)\n", + le32_to_cpu(i2400m->barker->data[0]), + le32_to_cpu(barker->data[0])); + result = -EIO; + } else + d_printf(2, dev, "boot barker confirmed #%u/%08x\n", + i, le32_to_cpu(barker->data[0])); + result = 0; + break; + } + return result; +} +EXPORT_SYMBOL_GPL(i2400m_is_boot_barker); + + +/* + * Verify the ack data received + * + * Given a reply to a boot mode command, chew it and verify everything + * is ok. + * + * @opcode: opcode which generated this ack. For error messages. + * @ack: pointer to ack data we received + * @ack_size: size of that data buffer + * @flags: I2400M_BM_CMD_* flags we called the command with. + * + * Way too long function -- maybe it should be further split + */ +static +ssize_t __i2400m_bm_ack_verify(struct i2400m *i2400m, int opcode, + struct i2400m_bootrom_header *ack, + size_t ack_size, int flags) +{ + ssize_t result = -ENOMEM; + struct device *dev = i2400m_dev(i2400m); + + d_fnstart(8, dev, "(i2400m %p opcode %d ack %p size %zu)\n", + i2400m, opcode, ack, ack_size); + if (ack_size < sizeof(*ack)) { + result = -EIO; + dev_err(dev, "boot-mode cmd %d: HW BUG? notification didn't " + "return enough data (%zu bytes vs %zu expected)\n", + opcode, ack_size, sizeof(*ack)); + goto error_ack_short; + } + result = i2400m_is_boot_barker(i2400m, ack, ack_size); + if (result >= 0) { + result = -ERESTARTSYS; + d_printf(6, dev, "boot-mode cmd %d: HW boot barker\n", opcode); + goto error_reboot; + } + if (ack_size == sizeof(i2400m_ACK_BARKER) + && memcmp(ack, i2400m_ACK_BARKER, sizeof(*ack)) == 0) { + result = -EISCONN; + d_printf(3, dev, "boot-mode cmd %d: HW reboot ack barker\n", + opcode); + goto error_reboot_ack; + } + result = 0; + if (flags & I2400M_BM_CMD_RAW) + goto out_raw; + ack->data_size = le32_to_cpu(ack->data_size); + ack->target_addr = le32_to_cpu(ack->target_addr); + ack->block_checksum = le32_to_cpu(ack->block_checksum); + d_printf(5, dev, "boot-mode cmd %d: notification for opcode %u " + "response %u csum %u rr %u da %u\n", + opcode, i2400m_brh_get_opcode(ack), + i2400m_brh_get_response(ack), + i2400m_brh_get_use_checksum(ack), + i2400m_brh_get_response_required(ack), + i2400m_brh_get_direct_access(ack)); + result = -EIO; + if (i2400m_brh_get_signature(ack) != 0xcbbc) { + dev_err(dev, "boot-mode cmd %d: HW BUG? 
wrong signature "
+ "0x%04x\n", opcode, i2400m_brh_get_signature(ack));
+ goto error_ack_signature;
+ }
+ if (opcode != -1 && opcode != i2400m_brh_get_opcode(ack)) {
+ dev_err(dev, "boot-mode cmd %d: HW BUG? "
+ "received response for opcode %u, expected %u\n",
+ opcode, i2400m_brh_get_opcode(ack), opcode);
+ goto error_ack_opcode;
+ }
+ if (i2400m_brh_get_response(ack) != 0) { /* failed? */
+ dev_err(dev, "boot-mode cmd %d: error; hw response %u\n",
+ opcode, i2400m_brh_get_response(ack));
+ goto error_ack_failed;
+ }
+ if (ack_size < ack->data_size + sizeof(*ack)) {
+ dev_err(dev, "boot-mode cmd %d: SW BUG "
+ "driver provided only %zu bytes for %zu bytes "
+ "of data\n", opcode, ack_size,
+ (size_t) le32_to_cpu(ack->data_size) + sizeof(*ack));
+ goto error_ack_short_buffer;
+ }
+ result = ack_size;
+ /* Don't you love this stack of empty targets? Well, I don't
+ * either, but it helps track exactly who comes in here and
+ * why :) */
+error_ack_short_buffer:
+error_ack_failed:
+error_ack_opcode:
+error_ack_signature:
+out_raw:
+error_reboot_ack:
+error_reboot:
+error_ack_short:
+ d_fnend(8, dev, "(i2400m %p opcode %d ack %p size %zu) = %d\n",
+ i2400m, opcode, ack, ack_size, (int) result);
+ return result;
+}
+
+
+/**
+ * i2400m_bm_cmd - Execute a boot mode command
+ *
+ * @cmd: buffer containing the command data (pointing at the header).
+ * This data can be ANYWHERE (for USB, we will copy it to a
+ * specific buffer). Make sure everything is in proper little
+ * endian.
+ *
+ * A raw buffer can be also sent, just cast it and set flags to
+ * I2400M_BM_CMD_RAW.
+ *
+ * This function will generate a checksum for you if the
+ * checksum bit in the command is set (unless I2400M_BM_CMD_RAW
+ * is set).
+ *
+ * You can use the i2400m->bm_cmd_buf to stage your commands and
+ * send them.
+ *
+ * If NULL, no command is sent (we just wait for an ack).
+ *
+ * @cmd_size: size of the command. Will be auto padded to the
+ * bus-specific driver's padding requirements.
+ *
+ * @ack: buffer where to place the acknowledgement. If it is a regular
+ * command response, all fields will be returned with the right,
+ * native endianness.
+ *
+ * You *cannot* use i2400m->bm_ack_buf for this buffer.
+ *
+ * @ack_size: size of @ack, 16 aligned; you need to provide at least
+ * sizeof(*ack) bytes and then enough to contain the return data
+ * from the command
+ *
+ * @flags: see I2400M_BM_CMD_* above.
+ *
+ * @returns: bytes received by the notification; if < 0, an errno code
+ * denoting an error or:
+ *
+ * -ERESTARTSYS The device has rebooted
+ *
+ * Executes a boot-mode command and waits for a response, doing basic
+ * validation on it; if a zero length response is received, it retries
+ * waiting for a response until a non-zero one is received (timing out
+ * after %I2400M_BOOT_RETRIES retries).
+ */
+static
+ssize_t i2400m_bm_cmd(struct i2400m *i2400m,
+ const struct i2400m_bootrom_header *cmd, size_t cmd_size,
+ struct i2400m_bootrom_header *ack, size_t ack_size,
+ int flags)
+{
+ ssize_t result = -ENOMEM, rx_bytes;
+ struct device *dev = i2400m_dev(i2400m);
+ int opcode = cmd == NULL ?
-1 : i2400m_brh_get_opcode(cmd); + + d_fnstart(6, dev, "(i2400m %p cmd %p size %zu ack %p size %zu)\n", + i2400m, cmd, cmd_size, ack, ack_size); + BUG_ON(ack_size < sizeof(*ack)); + BUG_ON(i2400m->boot_mode == 0); + + if (cmd != NULL) { /* send the command */ + result = i2400m->bus_bm_cmd_send(i2400m, cmd, cmd_size, flags); + if (result < 0) + goto error_cmd_send; + if ((flags & I2400M_BM_CMD_RAW) == 0) + d_printf(5, dev, + "boot-mode cmd %d csum %u rr %u da %u: " + "addr 0x%04x size %u block csum 0x%04x\n", + opcode, i2400m_brh_get_use_checksum(cmd), + i2400m_brh_get_response_required(cmd), + i2400m_brh_get_direct_access(cmd), + cmd->target_addr, cmd->data_size, + cmd->block_checksum); + } + result = i2400m->bus_bm_wait_for_ack(i2400m, ack, ack_size); + if (result < 0) { + dev_err(dev, "boot-mode cmd %d: error waiting for an ack: %d\n", + opcode, (int) result); /* bah, %zd doesn't work */ + goto error_wait_for_ack; + } + rx_bytes = result; + /* verify the ack and read more if necessary [result is the + * final amount of bytes we get in the ack] */ + result = __i2400m_bm_ack_verify(i2400m, opcode, ack, ack_size, flags); + if (result < 0) + goto error_bad_ack; + /* Don't you love this stack of empty targets? Well, I don't + * either, but it helps track exactly who comes in here and + * why :) */ + result = rx_bytes; +error_bad_ack: +error_wait_for_ack: +error_cmd_send: + d_fnend(6, dev, "(i2400m %p cmd %p size %zu ack %p size %zu) = %d\n", + i2400m, cmd, cmd_size, ack, ack_size, (int) result); + return result; +} + + +/** + * i2400m_download_chunk - write a single chunk of data to the device's memory + * + * @i2400m: device descriptor + * @buf: the buffer to write + * @buf_len: length of the buffer to write + * @addr: address in the device memory space + * @direct: bootrom write mode + * @do_csum: should a checksum validation be performed + */ +static int i2400m_download_chunk(struct i2400m *i2400m, const void *chunk, + size_t __chunk_len, unsigned long addr, + unsigned int direct, unsigned int do_csum) +{ + int ret; + size_t chunk_len = ALIGN(__chunk_len, I2400M_PL_ALIGN); + struct device *dev = i2400m_dev(i2400m); + struct { + struct i2400m_bootrom_header cmd; + u8 cmd_payload[]; + } __packed *buf; + struct i2400m_bootrom_header ack; + + d_fnstart(5, dev, "(i2400m %p chunk %p __chunk_len %zu addr 0x%08lx " + "direct %u do_csum %u)\n", i2400m, chunk, __chunk_len, + addr, direct, do_csum); + buf = i2400m->bm_cmd_buf; + memcpy(buf->cmd_payload, chunk, __chunk_len); + memset(buf->cmd_payload + __chunk_len, 0xad, chunk_len - __chunk_len); + + buf->cmd.command = i2400m_brh_command(I2400M_BRH_WRITE, + __chunk_len & 0x3 ? 0 : do_csum, + __chunk_len & 0xf ? 0 : direct); + buf->cmd.target_addr = cpu_to_le32(addr); + buf->cmd.data_size = cpu_to_le32(__chunk_len); + ret = i2400m_bm_cmd(i2400m, &buf->cmd, sizeof(buf->cmd) + chunk_len, + &ack, sizeof(ack), 0); + if (ret >= 0) + ret = 0; + d_fnend(5, dev, "(i2400m %p chunk %p __chunk_len %zu addr 0x%08lx " + "direct %u do_csum %u) = %d\n", i2400m, chunk, __chunk_len, + addr, direct, do_csum, ret); + return ret; +} + + +/* + * Download a BCF file's sections to the device + * + * @i2400m: device descriptor + * @bcf: pointer to firmware data (first header followed by the + * payloads). Assumed verified and consistent. + * @bcf_len: length (in bytes) of the @bcf buffer. + * + * Returns: < 0 errno code on error or the offset to the jump instruction. + * + * Given a BCF file, downloads each section (a command and a payload) + * to the device's address space. 
Actually, it just executes each
+ * command in the BCF file.
+ *
+ * The section size has to be aligned to 4 bytes AND the padding has
+ * to be taken from the firmware file, as the signature takes it into
+ * account.
+ */
+static
+ssize_t i2400m_dnload_bcf(struct i2400m *i2400m,
+ const struct i2400m_bcf_hdr *bcf, size_t bcf_len)
+{
+ ssize_t ret;
+ struct device *dev = i2400m_dev(i2400m);
+ size_t offset, /* iterator offset */
+ data_size, /* Size of the data payload */
+ section_size, /* Size of the whole section (cmd + payload) */
+ section = 1;
+ const struct i2400m_bootrom_header *bh;
+ struct i2400m_bootrom_header ack;
+
+ d_fnstart(3, dev, "(i2400m %p bcf %p bcf_len %zu)\n",
+ i2400m, bcf, bcf_len);
+ /* Iterate over the command blocks in the BCF file that start
+ * after the header */
+ offset = le32_to_cpu(bcf->header_len) * sizeof(u32);
+ while (1) { /* start sending the file */
+ bh = (void *) bcf + offset;
+ data_size = le32_to_cpu(bh->data_size);
+ section_size = ALIGN(sizeof(*bh) + data_size, 4);
+ d_printf(7, dev,
+ "downloading section #%zu (@%zu %zu B) to 0x%08x\n",
+ section, offset, sizeof(*bh) + data_size,
+ le32_to_cpu(bh->target_addr));
+ /*
+ * We look for the JUMP cmd from the bootmode header;
+ * either I2400M_BRH_SIGNED_JUMP for secure boot
+ * or I2400M_BRH_JUMP for non-secure boot. The last chunk
+ * should be the bootmode header with the JUMP cmd.
+ */
+ if (i2400m_brh_get_opcode(bh) == I2400M_BRH_SIGNED_JUMP ||
+ i2400m_brh_get_opcode(bh) == I2400M_BRH_JUMP) {
+ d_printf(5, dev, "jump found @%zu\n", offset);
+ break;
+ }
+ if (offset + section_size > bcf_len) {
+ dev_err(dev, "fw %s: bad section #%zu, "
+ "end (@%zu) beyond EOF (@%zu)\n",
+ i2400m->fw_name, section,
+ offset + section_size, bcf_len);
+ ret = -EINVAL;
+ goto error_section_beyond_eof;
+ }
+ __i2400m_msleep(20);
+ ret = i2400m_bm_cmd(i2400m, bh, section_size,
+ &ack, sizeof(ack), I2400M_BM_CMD_RAW);
+ if (ret < 0) {
+ dev_err(dev, "fw %s: section #%zu (@%zu %zu B) "
+ "failed %d\n", i2400m->fw_name, section,
+ offset, sizeof(*bh) + data_size, (int) ret);
+ goto error_send;
+ }
+ offset += section_size;
+ section++;
+ }
+ ret = offset;
+error_section_beyond_eof:
+error_send:
+ d_fnend(3, dev, "(i2400m %p bcf %p bcf_len %zu) = %d\n",
+ i2400m, bcf, bcf_len, (int) ret);
+ return ret;
+}
+
+
+/*
+ * Indicate if the device emitted a reboot barker that indicates
+ * "signed boot"
+ */
+static
+unsigned i2400m_boot_is_signed(struct i2400m *i2400m)
+{
+ return likely(i2400m->sboot);
+}
+
+
+/*
+ * Do the final steps of uploading firmware
+ *
+ * @bcf_hdr: BCF header we are actually using
+ * @bcf: pointer to the firmware image (which matches the first header
+ * that is followed by the actual payloads).
+ * @offset: [byte] offset into @bcf for the command we need to send.
+ *
+ * Depending on the boot mode (signed vs non-signed), different
+ * actions need to be taken.
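+ *
+ * Roughly, as a sketch of the two branches implemented below:
+ *
+ *   if (!i2400m_boot_is_signed(i2400m))
+ *           send the jump command with its opcode rewritten to
+ *           I2400M_BRH_JUMP and data_size forced to zero;
+ *   else
+ *           send the BCF's signed-jump command followed by the
+ *           signature block (the modulus) copied from @bcf_hdr.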
+ */ +static +int i2400m_dnload_finalize(struct i2400m *i2400m, + const struct i2400m_bcf_hdr *bcf_hdr, + const struct i2400m_bcf_hdr *bcf, size_t offset) +{ + int ret = 0; + struct device *dev = i2400m_dev(i2400m); + struct i2400m_bootrom_header *cmd, ack; + struct { + struct i2400m_bootrom_header cmd; + u8 cmd_pl[0]; + } __packed *cmd_buf; + size_t signature_block_offset, signature_block_size; + + d_fnstart(3, dev, "offset %zu\n", offset); + cmd = (void *) bcf + offset; + if (i2400m_boot_is_signed(i2400m) == 0) { + struct i2400m_bootrom_header jump_ack; + d_printf(1, dev, "unsecure boot, jumping to 0x%08x\n", + le32_to_cpu(cmd->target_addr)); + cmd_buf = i2400m->bm_cmd_buf; + memcpy(&cmd_buf->cmd, cmd, sizeof(*cmd)); + cmd = &cmd_buf->cmd; + /* now cmd points to the actual bootrom_header in cmd_buf */ + i2400m_brh_set_opcode(cmd, I2400M_BRH_JUMP); + cmd->data_size = 0; + ret = i2400m_bm_cmd(i2400m, cmd, sizeof(*cmd), + &jump_ack, sizeof(jump_ack), 0); + } else { + d_printf(1, dev, "secure boot, jumping to 0x%08x\n", + le32_to_cpu(cmd->target_addr)); + cmd_buf = i2400m->bm_cmd_buf; + memcpy(&cmd_buf->cmd, cmd, sizeof(*cmd)); + signature_block_offset = + sizeof(*bcf_hdr) + + le32_to_cpu(bcf_hdr->key_size) * sizeof(u32) + + le32_to_cpu(bcf_hdr->exponent_size) * sizeof(u32); + signature_block_size = + le32_to_cpu(bcf_hdr->modulus_size) * sizeof(u32); + memcpy(cmd_buf->cmd_pl, + (void *) bcf_hdr + signature_block_offset, + signature_block_size); + ret = i2400m_bm_cmd(i2400m, &cmd_buf->cmd, + sizeof(cmd_buf->cmd) + signature_block_size, + &ack, sizeof(ack), I2400M_BM_CMD_RAW); + } + d_fnend(3, dev, "returning %d\n", ret); + return ret; +} + + +/** + * i2400m_bootrom_init - Reboots a powered device into boot mode + * + * @i2400m: device descriptor + * @flags: + * I2400M_BRI_SOFT: a reboot barker has been seen + * already, so don't wait for it. + * + * I2400M_BRI_NO_REBOOT: Don't send a reboot command, but wait + * for a reboot barker notification. This is a one shot; if + * the state machine needs to send a reboot command it will. + * + * Returns: + * + * < 0 errno code on error, 0 if ok. + * + * Description: + * + * Tries hard enough to put the device in boot-mode. There are two + * main phases to this: + * + * a. (1) send a reboot command and (2) get a reboot barker + * + * b. (1) echo/ack the reboot sending the reboot barker back and (2) + * getting an ack barker in return + * + * We want to skip (a) in some cases [soft]. The state machine is + * horrible, but it is basically: on each phase, send what has to be + * sent (if any), wait for the answer and act on the answer. We might + * have to backtrack and retry, so we keep a max tries counter for + * that. + * + * It sucks because we don't know ahead of time which is going to be + * the reboot barker (the device might send different ones depending + * on its EEPROM config) and once the device reboots and waits for the + * echo/ack reboot barker being sent back, it doesn't understand + * anything else. So we can be left at the point where we don't know + * what to send to it -- cold reset and bus reset seem to have little + * effect. So the function iterates (in this case) through all the + * known barkers and tries them all until an ACK is + * received. Otherwise, it gives up. + * + * If we get a timeout after sending a warm reset, we do it again. 
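+ *
+ * Sketching the handshake described above (a simplification of the
+ * switch machinery in the function body):
+ *
+ *   send a warm reset  [skipped with I2400M_BRI_SOFT / _NO_REBOOT]
+ *   wait for an answer:
+ *     -ERESTARTSYS  reboot barker seen, go echo/ack it
+ *     -EISCONN      ack barker seen, reboot and start over
+ *     -ETIMEDOUT    try every known barker from the database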
+ */ +int i2400m_bootrom_init(struct i2400m *i2400m, enum i2400m_bri flags) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + struct i2400m_bootrom_header *cmd; + struct i2400m_bootrom_header ack; + int count = i2400m->bus_bm_retries; + int ack_timeout_cnt = 1; + unsigned i; + + BUILD_BUG_ON(sizeof(*cmd) != sizeof(i2400m_barker_db[0].data)); + BUILD_BUG_ON(sizeof(ack) != sizeof(i2400m_ACK_BARKER)); + + d_fnstart(4, dev, "(i2400m %p flags 0x%08x)\n", i2400m, flags); + result = -ENOMEM; + cmd = i2400m->bm_cmd_buf; + if (flags & I2400M_BRI_SOFT) + goto do_reboot_ack; +do_reboot: + ack_timeout_cnt = 1; + if (--count < 0) + goto error_timeout; + d_printf(4, dev, "device reboot: reboot command [%d # left]\n", + count); + if ((flags & I2400M_BRI_NO_REBOOT) == 0) + i2400m_reset(i2400m, I2400M_RT_WARM); + result = i2400m_bm_cmd(i2400m, NULL, 0, &ack, sizeof(ack), + I2400M_BM_CMD_RAW); + flags &= ~I2400M_BRI_NO_REBOOT; + switch (result) { + case -ERESTARTSYS: + /* + * at this point, i2400m_bm_cmd(), through + * __i2400m_bm_ack_process(), has updated + * i2400m->barker and we are good to go. + */ + d_printf(4, dev, "device reboot: got reboot barker\n"); + break; + case -EISCONN: /* we don't know how it got here...but we follow it */ + d_printf(4, dev, "device reboot: got ack barker - whatever\n"); + goto do_reboot; + case -ETIMEDOUT: + /* + * Device has timed out, we might be in boot mode + * already and expecting an ack; if we don't know what + * the barker is, we just send them all. Cold reset + * and bus reset don't work. Beats me. + */ + if (i2400m->barker != NULL) { + dev_err(dev, "device boot: reboot barker timed out, " + "trying (set) %08x echo/ack\n", + le32_to_cpu(i2400m->barker->data[0])); + goto do_reboot_ack; + } + for (i = 0; i < i2400m_barker_db_used; i++) { + struct i2400m_barker_db *barker = &i2400m_barker_db[i]; + memcpy(cmd, barker->data, sizeof(barker->data)); + result = i2400m_bm_cmd(i2400m, cmd, sizeof(*cmd), + &ack, sizeof(ack), + I2400M_BM_CMD_RAW); + if (result == -EISCONN) { + dev_warn(dev, "device boot: got ack barker " + "after sending echo/ack barker " + "#%d/%08x; rebooting j.i.c.\n", + i, le32_to_cpu(barker->data[0])); + flags &= ~I2400M_BRI_NO_REBOOT; + goto do_reboot; + } + } + dev_err(dev, "device boot: tried all the echo/acks, could " + "not get device to respond; giving up"); + result = -ESHUTDOWN; + case -EPROTO: + case -ESHUTDOWN: /* dev is gone */ + case -EINTR: /* user cancelled */ + goto error_dev_gone; + default: + dev_err(dev, "device reboot: error %d while waiting " + "for reboot barker - rebooting\n", result); + d_dump(1, dev, &ack, result); + goto do_reboot; + } + /* At this point we ack back with 4 REBOOT barkers and expect + * 4 ACK barkers. This is ugly, as we send a raw command -- + * hence the cast. _bm_cmd() will catch the reboot ack + * notification and report it as -EISCONN. */ +do_reboot_ack: + d_printf(4, dev, "device reboot ack: sending ack [%d # left]\n", count); + memcpy(cmd, i2400m->barker->data, sizeof(i2400m->barker->data)); + result = i2400m_bm_cmd(i2400m, cmd, sizeof(*cmd), + &ack, sizeof(ack), I2400M_BM_CMD_RAW); + switch (result) { + case -ERESTARTSYS: + d_printf(4, dev, "reboot ack: got reboot barker - retrying\n"); + if (--count < 0) + goto error_timeout; + goto do_reboot_ack; + case -EISCONN: + d_printf(4, dev, "reboot ack: got ack barker - good\n"); + break; + case -ETIMEDOUT: /* no response, maybe it is the other type? 
*/ + if (ack_timeout_cnt-- < 0) { + d_printf(4, dev, "reboot ack timedout: retrying\n"); + goto do_reboot_ack; + } else { + dev_err(dev, "reboot ack timedout too long: " + "trying reboot\n"); + goto do_reboot; + } + break; + case -EPROTO: + case -ESHUTDOWN: /* dev is gone */ + goto error_dev_gone; + default: + dev_err(dev, "device reboot ack: error %d while waiting for " + "reboot ack barker - rebooting\n", result); + goto do_reboot; + } + d_printf(2, dev, "device reboot ack: got ack barker - boot done\n"); + result = 0; +exit_timeout: +error_dev_gone: + d_fnend(4, dev, "(i2400m %p flags 0x%08x) = %d\n", + i2400m, flags, result); + return result; + +error_timeout: + dev_err(dev, "Timed out waiting for reboot ack\n"); + result = -ETIMEDOUT; + goto exit_timeout; +} + + +/* + * Read the MAC addr + * + * The position this function reads is fixed in device memory and + * always available, even without firmware. + * + * Note we specify we want to read only six bytes, but provide space + * for 16, as we always get it rounded up. + */ +int i2400m_read_mac_addr(struct i2400m *i2400m) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + struct net_device *net_dev = i2400m->wimax_dev.net_dev; + struct i2400m_bootrom_header *cmd; + struct { + struct i2400m_bootrom_header ack; + u8 ack_pl[16]; + } __packed ack_buf; + + d_fnstart(5, dev, "(i2400m %p)\n", i2400m); + cmd = i2400m->bm_cmd_buf; + cmd->command = i2400m_brh_command(I2400M_BRH_READ, 0, 1); + cmd->target_addr = cpu_to_le32(0x00203fe8); + cmd->data_size = cpu_to_le32(6); + result = i2400m_bm_cmd(i2400m, cmd, sizeof(*cmd), + &ack_buf.ack, sizeof(ack_buf), 0); + if (result < 0) { + dev_err(dev, "BM: read mac addr failed: %d\n", result); + goto error_read_mac; + } + d_printf(2, dev, "mac addr is %pM\n", ack_buf.ack_pl); + if (i2400m->bus_bm_mac_addr_impaired == 1) { + ack_buf.ack_pl[0] = 0x00; + ack_buf.ack_pl[1] = 0x16; + ack_buf.ack_pl[2] = 0xd3; + get_random_bytes(&ack_buf.ack_pl[3], 3); + dev_err(dev, "BM is MAC addr impaired, faking MAC addr to " + "mac addr is %pM\n", ack_buf.ack_pl); + result = 0; + } + net_dev->addr_len = ETH_ALEN; + memcpy(net_dev->dev_addr, ack_buf.ack_pl, ETH_ALEN); +error_read_mac: + d_fnend(5, dev, "(i2400m %p) = %d\n", i2400m, result); + return result; +} + + +/* + * Initialize a non signed boot + * + * This implies sending some magic values to the device's memory. Note + * we convert the values to little endian in the same array + * declaration. + */ +static +int i2400m_dnload_init_nonsigned(struct i2400m *i2400m) +{ + unsigned i = 0; + int ret = 0; + struct device *dev = i2400m_dev(i2400m); + d_fnstart(5, dev, "(i2400m %p)\n", i2400m); + if (i2400m->bus_bm_pokes_table) { + while (i2400m->bus_bm_pokes_table[i].address) { + ret = i2400m_download_chunk( + i2400m, + &i2400m->bus_bm_pokes_table[i].data, + sizeof(i2400m->bus_bm_pokes_table[i].data), + i2400m->bus_bm_pokes_table[i].address, 1, 1); + if (ret < 0) + break; + i++; + } + } + d_fnend(5, dev, "(i2400m %p) = %d\n", i2400m, ret); + return ret; +} + + +/* + * Initialize the signed boot process + * + * @i2400m: device descriptor + * + * @bcf_hdr: pointer to the firmware header; assumes it is fully in + * memory (it has gone through basic validation). + * + * Returns: 0 if ok, < 0 errno code on error, -ERESTARTSYS if the hw + * rebooted. + * + * This writes the firmware BCF header to the device using the + * HASH_PAYLOAD_ONLY command. 
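+ *
+ * In other words -- a sketch of the command built in the body below --
+ * the BCF header travels as the payload of a single boot-mode command:
+ *
+ *   cmd.command   = i2400m_brh_command(I2400M_BRH_HASH_PAYLOAD_ONLY, 0, 0);
+ *   cmd.data_size = cpu_to_le32(sizeof(struct i2400m_bcf_hdr));
+ *   payload       = the BCF header itself, copied verbatim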
+ */
+static
+int i2400m_dnload_init_signed(struct i2400m *i2400m,
+ const struct i2400m_bcf_hdr *bcf_hdr)
+{
+ int ret;
+ struct device *dev = i2400m_dev(i2400m);
+ struct {
+ struct i2400m_bootrom_header cmd;
+ struct i2400m_bcf_hdr cmd_pl;
+ } __packed *cmd_buf;
+ struct i2400m_bootrom_header ack;
+
+ d_fnstart(5, dev, "(i2400m %p bcf_hdr %p)\n", i2400m, bcf_hdr);
+ cmd_buf = i2400m->bm_cmd_buf;
+ cmd_buf->cmd.command =
+ i2400m_brh_command(I2400M_BRH_HASH_PAYLOAD_ONLY, 0, 0);
+ cmd_buf->cmd.target_addr = 0;
+ cmd_buf->cmd.data_size = cpu_to_le32(sizeof(cmd_buf->cmd_pl));
+ memcpy(&cmd_buf->cmd_pl, bcf_hdr, sizeof(*bcf_hdr));
+ ret = i2400m_bm_cmd(i2400m, &cmd_buf->cmd, sizeof(*cmd_buf),
+ &ack, sizeof(ack), 0);
+ if (ret >= 0)
+ ret = 0;
+ d_fnend(5, dev, "(i2400m %p bcf_hdr %p) = %d\n", i2400m, bcf_hdr, ret);
+ return ret;
+}
+
+
+/*
+ * Initialize the firmware download at the device side
+ *
+ * Multiplex to the one that matters based on the device's mode
+ * (signed or non-signed).
+ */
+static
+int i2400m_dnload_init(struct i2400m *i2400m,
+ const struct i2400m_bcf_hdr *bcf_hdr)
+{
+ int result;
+ struct device *dev = i2400m_dev(i2400m);
+
+ if (i2400m_boot_is_signed(i2400m)) {
+ d_printf(1, dev, "signed boot\n");
+ result = i2400m_dnload_init_signed(i2400m, bcf_hdr);
+ if (result == -ERESTARTSYS)
+ return result;
+ if (result < 0)
+ dev_err(dev, "firmware %s: signed boot download "
+ "initialization failed: %d\n",
+ i2400m->fw_name, result);
+ } else {
+ /* non-signed boot process without pokes */
+ d_printf(1, dev, "non-signed boot\n");
+ result = i2400m_dnload_init_nonsigned(i2400m);
+ if (result == -ERESTARTSYS)
+ return result;
+ if (result < 0)
+ dev_err(dev, "firmware %s: non-signed download "
+ "initialization failed: %d\n",
+ i2400m->fw_name, result);
+ }
+ return result;
+}
+
+
+/*
+ * Run consistency tests on the firmware file and load up headers
+ *
+ * Check that the firmware is made for the i2400m device,
+ * etc... These checks are mostly informative, as the device will make
+ * them too; but the driver's response is more informative on what
+ * went wrong.
+ *
+ * This will also look at all the headers present in the firmware
+ * file, and update i2400m->fw_hdrs to point to them.
+ */
+static
+int i2400m_fw_hdr_check(struct i2400m *i2400m,
+ const struct i2400m_bcf_hdr *bcf_hdr,
+ size_t index, size_t offset)
+{
+ struct device *dev = i2400m_dev(i2400m);
+
+ unsigned module_type, header_len, major_version, minor_version,
+ module_id, module_vendor, date, size;
+
+ module_type = le32_to_cpu(bcf_hdr->module_type);
+ header_len = sizeof(u32) * le32_to_cpu(bcf_hdr->header_len);
+ major_version = (le32_to_cpu(bcf_hdr->header_version) & 0xffff0000)
+ >> 16;
+ minor_version = le32_to_cpu(bcf_hdr->header_version) & 0x0000ffff;
+ module_id = le32_to_cpu(bcf_hdr->module_id);
+ module_vendor = le32_to_cpu(bcf_hdr->module_vendor);
+ date = le32_to_cpu(bcf_hdr->date);
+ size = sizeof(u32) * le32_to_cpu(bcf_hdr->size);
+
+ d_printf(1, dev, "firmware %s #%zd@%08zx: BCF header "
+ "type:vendor:id 0x%x:%x:%x v%u.%u (%u/%u B) built %08x\n",
+ i2400m->fw_name, index, offset,
+ module_type, module_vendor, module_id,
+ major_version, minor_version, header_len, size, date);
+
+ /* Hard errors */
+ if (major_version != 1) {
+ dev_err(dev, "firmware %s #%zd@%08zx: major header version "
+ "v%u.%u not supported\n",
+ i2400m->fw_name, index, offset,
+ major_version, minor_version);
+ return -EBADF;
+ }
+
+ if (module_type != 6) { /* built for the right hardware?
*/ + dev_err(dev, "firmware %s #%zd@%08zx: unexpected module " + "type 0x%x; aborting\n", + i2400m->fw_name, index, offset, + module_type); + return -EBADF; + } + + if (module_vendor != 0x8086) { + dev_err(dev, "firmware %s #%zd@%08zx: unexpected module " + "vendor 0x%x; aborting\n", + i2400m->fw_name, index, offset, module_vendor); + return -EBADF; + } + + if (date < 0x20080300) + dev_warn(dev, "firmware %s #%zd@%08zx: build date %08x " + "too old; unsupported\n", + i2400m->fw_name, index, offset, date); + return 0; +} + + +/* + * Run consistency tests on the firmware file and load up headers + * + * Check for the firmware being made for the i2400m device, + * etc...These checks are mostly informative, as the device will make + * them too; but the driver's response is more informative on what + * went wrong. + * + * This will also look at all the headers present on the firmware + * file, and update i2400m->fw_hdrs to point to them. + */ +static +int i2400m_fw_check(struct i2400m *i2400m, const void *bcf, size_t bcf_size) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + size_t headers = 0; + const struct i2400m_bcf_hdr *bcf_hdr; + const void *itr, *next, *top; + size_t slots = 0, used_slots = 0; + + for (itr = bcf, top = itr + bcf_size; + itr < top; + headers++, itr = next) { + size_t leftover, offset, header_len, size; + + leftover = top - itr; + offset = itr - bcf; + if (leftover <= sizeof(*bcf_hdr)) { + dev_err(dev, "firmware %s: %zu B left at @%zx, " + "not enough for BCF header\n", + i2400m->fw_name, leftover, offset); + break; + } + bcf_hdr = itr; + /* Only the first header is supposed to be followed by + * payload */ + header_len = sizeof(u32) * le32_to_cpu(bcf_hdr->header_len); + size = sizeof(u32) * le32_to_cpu(bcf_hdr->size); + if (headers == 0) + next = itr + size; + else + next = itr + header_len; + + result = i2400m_fw_hdr_check(i2400m, bcf_hdr, headers, offset); + if (result < 0) + continue; + if (used_slots + 1 >= slots) { + /* +1 -> we need to account for the one we'll + * occupy and at least an extra one for + * always being NULL */ + result = i2400m_zrealloc_2x( + (void **) &i2400m->fw_hdrs, &slots, + sizeof(i2400m->fw_hdrs[0]), + GFP_KERNEL); + if (result < 0) + goto error_zrealloc; + } + i2400m->fw_hdrs[used_slots] = bcf_hdr; + used_slots++; + } + if (headers == 0) { + dev_err(dev, "firmware %s: no usable headers found\n", + i2400m->fw_name); + result = -EBADF; + } else + result = 0; +error_zrealloc: + return result; +} + + +/* + * Match a barker to a BCF header module ID + * + * The device sends a barker which tells the firmware loader which + * header in the BCF file has to be used. This does the matching. 
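+ *
+ * The rule implemented below, in short (both values are masked with
+ * 0x7fffffff, as the high bit is used for something else):
+ *
+ *   match if module_id == barker
+ *   match also if barker == I2400M_SBOOT_BARKER and module_id == 0
+ *         [special case for the 5x50 series]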
+ */
+static
+unsigned i2400m_bcf_hdr_match(struct i2400m *i2400m,
+ const struct i2400m_bcf_hdr *bcf_hdr)
+{
+ u32 barker = le32_to_cpu(i2400m->barker->data[0])
+ & 0x7fffffff;
+ u32 module_id = le32_to_cpu(bcf_hdr->module_id)
+ & 0x7fffffff; /* high bit used for something else */
+
+ /* special case for 5x50 */
+ if (barker == I2400M_SBOOT_BARKER && module_id == 0)
+ return 1;
+ if (module_id == barker)
+ return 1;
+ return 0;
+}
+
+static
+const struct i2400m_bcf_hdr *i2400m_bcf_hdr_find(struct i2400m *i2400m)
+{
+ struct device *dev = i2400m_dev(i2400m);
+ const struct i2400m_bcf_hdr **bcf_itr, *bcf_hdr;
+ unsigned i = 0;
+ u32 barker = le32_to_cpu(i2400m->barker->data[0]);
+
+ d_printf(2, dev, "finding BCF header for barker %08x\n", barker);
+ if (barker == I2400M_NBOOT_BARKER) {
+ bcf_hdr = i2400m->fw_hdrs[0];
+ d_printf(1, dev, "using BCF header #%u/%08x for non-signed "
+ "barker\n", 0, le32_to_cpu(bcf_hdr->module_id));
+ return bcf_hdr;
+ }
+ for (bcf_itr = i2400m->fw_hdrs; *bcf_itr != NULL; bcf_itr++, i++) {
+ bcf_hdr = *bcf_itr;
+ if (i2400m_bcf_hdr_match(i2400m, bcf_hdr)) {
+ d_printf(1, dev, "hit on BCF hdr #%u/%08x\n",
+ i, le32_to_cpu(bcf_hdr->module_id));
+ return bcf_hdr;
+ } else
+ d_printf(1, dev, "miss on BCF hdr #%u/%08x\n",
+ i, le32_to_cpu(bcf_hdr->module_id));
+ }
+ dev_err(dev, "cannot find a matching BCF header for barker %08x\n",
+ barker);
+ return NULL;
+}
+
+
+/*
+ * Download the firmware to the device
+ *
+ * @i2400m: device descriptor
+ * @bcf: pointer to loaded (and minimally verified for consistency)
+ * firmware
+ * @fw_size: size of the @bcf buffer (header plus payloads)
+ *
+ * The process for doing this is described in this file's header.
+ *
+ * Note we only reinitialize boot-mode if the flags say so. Some hw
+ * iterations need it, some don't. In any case, if we loop, we always
+ * need to reinitialize the bootrom, hence the flags modification.
+ */
+static
+int i2400m_fw_dnload(struct i2400m *i2400m, const struct i2400m_bcf_hdr *bcf,
+ size_t fw_size, enum i2400m_bri flags)
+{
+ int ret = 0;
+ struct device *dev = i2400m_dev(i2400m);
+ int count = i2400m->bus_bm_retries;
+ const struct i2400m_bcf_hdr *bcf_hdr;
+ size_t bcf_size;
+
+ d_fnstart(5, dev, "(i2400m %p bcf %p fw size %zu)\n",
+ i2400m, bcf, fw_size);
+ i2400m->boot_mode = 1;
+ wmb(); /* Make sure other readers see it */
+hw_reboot:
+ if (count-- == 0) {
+ ret = -ERESTARTSYS;
+ dev_err(dev, "device rebooted too many times, aborting\n");
+ goto error_too_many_reboots;
+ }
+ if (flags & I2400M_BRI_MAC_REINIT) {
+ ret = i2400m_bootrom_init(i2400m, flags);
+ if (ret < 0) {
+ dev_err(dev, "bootrom init failed: %d\n", ret);
+ goto error_bootrom_init;
+ }
+ }
+ flags |= I2400M_BRI_MAC_REINIT;
+
+ /*
+ * Initialize the download, push the bytes to the device and
+ * then jump to the new firmware. Note @ret is passed with the
+ * offset of the jump instruction to _dnload_finalize()
+ *
+ * Note we need to use the BCF header in the firmware image
+ * that matches the barker that the device sent when it
+ * rebooted, so it has to be passed along.
+ */
+ ret = -EBADF;
+ bcf_hdr = i2400m_bcf_hdr_find(i2400m);
+ if (bcf_hdr == NULL)
+ goto error_bcf_hdr_find;
+
+ ret = i2400m_dnload_init(i2400m, bcf_hdr);
+ if (ret == -ERESTARTSYS)
+ goto error_dev_rebooted;
+ if (ret < 0)
+ goto error_dnload_init;
+
+ /*
+ * bcf_size refers to one header size plus the fw sections size
+ * indicated by the header, i.e.
if there are other extended headers
+ * at the tail, they are not counted.
+ */
+ bcf_size = sizeof(u32) * le32_to_cpu(bcf_hdr->size);
+ ret = i2400m_dnload_bcf(i2400m, bcf, bcf_size);
+ if (ret == -ERESTARTSYS)
+ goto error_dev_rebooted;
+ if (ret < 0) {
+ dev_err(dev, "fw %s: download failed: %d\n",
+ i2400m->fw_name, ret);
+ goto error_dnload_bcf;
+ }
+
+ ret = i2400m_dnload_finalize(i2400m, bcf_hdr, bcf, ret);
+ if (ret == -ERESTARTSYS)
+ goto error_dev_rebooted;
+ if (ret < 0) {
+ dev_err(dev, "fw %s: "
+ "download finalization failed: %d\n",
+ i2400m->fw_name, ret);
+ goto error_dnload_finalize;
+ }
+
+ d_printf(2, dev, "fw %s successfully uploaded\n",
+ i2400m->fw_name);
+ i2400m->boot_mode = 0;
+ wmb(); /* Make sure i2400m_msg_to_dev() sees boot_mode */
+error_dnload_finalize:
+error_dnload_bcf:
+error_dnload_init:
+error_bcf_hdr_find:
+error_bootrom_init:
+error_too_many_reboots:
+ d_fnend(5, dev, "(i2400m %p bcf %p size %zu) = %d\n",
+ i2400m, bcf, fw_size, ret);
+ return ret;
+
+error_dev_rebooted:
+ dev_err(dev, "device rebooted, %d tries left\n", count);
+ /* we got the notification already, no need to wait for it again */
+ flags |= I2400M_BRI_SOFT;
+ goto hw_reboot;
+}
+
+static
+int i2400m_fw_bootstrap(struct i2400m *i2400m, const struct firmware *fw,
+ enum i2400m_bri flags)
+{
+ int ret;
+ struct device *dev = i2400m_dev(i2400m);
+ const struct i2400m_bcf_hdr *bcf; /* Firmware data */
+
+ d_fnstart(5, dev, "(i2400m %p)\n", i2400m);
+ bcf = (void *) fw->data;
+ ret = i2400m_fw_check(i2400m, bcf, fw->size);
+ if (ret >= 0)
+ ret = i2400m_fw_dnload(i2400m, bcf, fw->size, flags);
+ if (ret < 0)
+ dev_err(dev, "%s: cannot use: %d, skipping\n",
+ i2400m->fw_name, ret);
+ kfree(i2400m->fw_hdrs);
+ i2400m->fw_hdrs = NULL;
+ d_fnend(5, dev, "(i2400m %p) = %d\n", i2400m, ret);
+ return ret;
+}
+
+
+/* Refcounted container for firmware data */
+struct i2400m_fw {
+ struct kref kref;
+ const struct firmware *fw;
+};
+
+
+static
+void i2400m_fw_destroy(struct kref *kref)
+{
+ struct i2400m_fw *i2400m_fw =
+ container_of(kref, struct i2400m_fw, kref);
+ release_firmware(i2400m_fw->fw);
+ kfree(i2400m_fw);
+}
+
+
+static
+struct i2400m_fw *i2400m_fw_get(struct i2400m_fw *i2400m_fw)
+{
+ if (i2400m_fw != NULL && i2400m_fw != (void *) ~0)
+ kref_get(&i2400m_fw->kref);
+ return i2400m_fw;
+}
+
+
+static
+void i2400m_fw_put(struct i2400m_fw *i2400m_fw)
+{
+ kref_put(&i2400m_fw->kref, i2400m_fw_destroy);
+}
+
+
+/**
+ * i2400m_dev_bootstrap - Bring the device to a known state and upload firmware
+ *
+ * @i2400m: device descriptor
+ *
+ * Returns: >= 0 if ok, < 0 errno code on error.
+ *
+ * This sets up the firmware upload environment, loads the firmware
+ * file from disk, verifies it and then calls the firmware upload
+ * process per se.
+ *
+ * Can be called either from probe, or after a warm reset. Cannot be
+ * called from within an interrupt. All the flow in this code is
+ * single-threaded; all I/Os are synchronous.
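+ *
+ * The lookup order implemented below, sketched:
+ *
+ *   1. the firmware pinned by i2400m_fw_cache(), if any is present
+ *   2. otherwise, each name in i2400m->bus_fw_names[], in order,
+ *      loaded via request_firmware() until one bootstraps the device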
+ */ +int i2400m_dev_bootstrap(struct i2400m *i2400m, enum i2400m_bri flags) +{ + int ret, itr; + struct device *dev = i2400m_dev(i2400m); + struct i2400m_fw *i2400m_fw; + const struct firmware *fw; + const char *fw_name; + + d_fnstart(5, dev, "(i2400m %p)\n", i2400m); + + ret = -ENODEV; + spin_lock(&i2400m->rx_lock); + i2400m_fw = i2400m_fw_get(i2400m->fw_cached); + spin_unlock(&i2400m->rx_lock); + if (i2400m_fw == (void *) ~0) { + dev_err(dev, "can't load firmware now!"); + goto out; + } else if (i2400m_fw != NULL) { + dev_info(dev, "firmware %s: loading from cache\n", + i2400m->fw_name); + ret = i2400m_fw_bootstrap(i2400m, i2400m_fw->fw, flags); + i2400m_fw_put(i2400m_fw); + goto out; + } + + /* Load firmware files to memory. */ + for (itr = 0, ret = -ENOENT; ; itr++) { + fw_name = i2400m->bus_fw_names[itr]; + if (fw_name == NULL) { + dev_err(dev, "Could not find a usable firmware image\n"); + break; + } + d_printf(1, dev, "trying firmware %s (%d)\n", fw_name, itr); + ret = request_firmware(&fw, fw_name, dev); + if (ret < 0) { + dev_err(dev, "fw %s: cannot load file: %d\n", + fw_name, ret); + continue; + } + i2400m->fw_name = fw_name; + ret = i2400m_fw_bootstrap(i2400m, fw, flags); + release_firmware(fw); + if (ret >= 0) /* firmware loaded successfully */ + break; + i2400m->fw_name = NULL; + } +out: + d_fnend(5, dev, "(i2400m %p) = %d\n", i2400m, ret); + return ret; +} +EXPORT_SYMBOL_GPL(i2400m_dev_bootstrap); + + +void i2400m_fw_cache(struct i2400m *i2400m) +{ + int result; + struct i2400m_fw *i2400m_fw; + struct device *dev = i2400m_dev(i2400m); + + /* if there is anything there, free it -- now, this'd be weird */ + spin_lock(&i2400m->rx_lock); + i2400m_fw = i2400m->fw_cached; + spin_unlock(&i2400m->rx_lock); + if (i2400m_fw != NULL && i2400m_fw != (void *) ~0) { + i2400m_fw_put(i2400m_fw); + WARN(1, "%s:%u: still cached fw still present?\n", + __func__, __LINE__); + } + + if (i2400m->fw_name == NULL) { + dev_err(dev, "firmware n/a: can't cache\n"); + i2400m_fw = (void *) ~0; + goto out; + } + + i2400m_fw = kzalloc(sizeof(*i2400m_fw), GFP_ATOMIC); + if (i2400m_fw == NULL) + goto out; + kref_init(&i2400m_fw->kref); + result = request_firmware(&i2400m_fw->fw, i2400m->fw_name, dev); + if (result < 0) { + dev_err(dev, "firmware %s: failed to cache: %d\n", + i2400m->fw_name, result); + kfree(i2400m_fw); + i2400m_fw = (void *) ~0; + } else + dev_info(dev, "firmware %s: cached\n", i2400m->fw_name); +out: + spin_lock(&i2400m->rx_lock); + i2400m->fw_cached = i2400m_fw; + spin_unlock(&i2400m->rx_lock); +} + + +void i2400m_fw_uncache(struct i2400m *i2400m) +{ + struct i2400m_fw *i2400m_fw; + + spin_lock(&i2400m->rx_lock); + i2400m_fw = i2400m->fw_cached; + i2400m->fw_cached = NULL; + spin_unlock(&i2400m->rx_lock); + + if (i2400m_fw != NULL && i2400m_fw != (void *) ~0) + i2400m_fw_put(i2400m_fw); +} + diff --git a/drivers/staging/wimax/i2400m/i2400m-usb.h b/drivers/staging/wimax/i2400m/i2400m-usb.h new file mode 100644 index 000000000000..eff4f464a23e --- /dev/null +++ b/drivers/staging/wimax/i2400m/i2400m-usb.h @@ -0,0 +1,275 @@ +/* + * Intel Wireless WiMAX Connection 2400m + * USB-specific i2400m driver definitions + * + * + * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
diff --git a/drivers/staging/wimax/i2400m/i2400m-usb.h b/drivers/staging/wimax/i2400m/i2400m-usb.h
new file mode 100644
index 000000000000..eff4f464a23e
--- /dev/null
+++ b/drivers/staging/wimax/i2400m/i2400m-usb.h
@@ -0,0 +1,275 @@
+/*
+ * Intel Wireless WiMAX Connection 2400m
+ * USB-specific i2400m driver definitions
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * Intel Corporation
+ * Inaky Perez-Gonzalez
+ * Yanir Lubetkin
+ *  - Initial implementation
+ *
+ *
+ * This driver implements the bus-specific part of the i2400m for
+ * USB. Check i2400m.h for a generic driver description.
+ *
+ * ARCHITECTURE
+ *
+ * This driver listens to notifications sent from the notification
+ * endpoint (in usb-notif.c); when data is ready to read, the code in
+ * there schedules a read from the device (usb-rx.c) and then passes
+ * the data to the generic RX code (rx.c).
+ *
+ * When the generic driver needs to send data (network or control), it
+ * queues up in the TX FIFO (tx.c) and that will notify the driver
+ * through the i2400m->bus_tx_kick() callback
+ * (usb-tx.c:i2400mu_bus_tx_kick) which will send the items in the
+ * FIFO queue.
+ *
+ * This driver, as well, implements the USB-specific ops for the generic
+ * driver to be able to setup/teardown communication with the device
+ * [i2400m_bus_dev_start() and i2400m_bus_dev_stop()], resetting the
+ * device [i2400m_bus_reset()] and performing firmware upload
+ * [i2400m_bus_bm_cmd() and i2400m_bus_bm_wait_for_ack()].
+ */
+
+#ifndef __I2400M_USB_H__
+#define __I2400M_USB_H__
+
+#include "i2400m.h"
+#include <linux/usb.h>
+
+
+/*
+ * Error Density Count: cheapo error density (over time) counter
+ *
+ * Originally by Reinette Chatre
+ *
+ * Embed a 'struct edc' somewhere. Each time there is a soft or
+ * retryable error, call edc_inc() and check if the error top
+ * watermark has been reached.
+ */
+enum {
+	EDC_MAX_ERRORS = 10,
+	EDC_ERROR_TIMEFRAME = HZ,
+};
+
+/* error density counter */
+struct edc {
+	unsigned long timestart;
+	u16 errorcount;
+};
+
+struct i2400m_endpoint_cfg {
+	unsigned char bulk_out;
+	unsigned char notification;
+	unsigned char reset_cold;
+	unsigned char bulk_in;
+};
+
+static inline void edc_init(struct edc *edc)
+{
+	edc->timestart = jiffies;
+}
+
+/**
+ * edc_inc - report a soft error and check if we are over the watermark
+ *
+ * @edc: pointer to error density counter.
+ * @max_err: maximum number of errors we can accept over the timeframe
+ * @timeframe: length of the timeframe (in jiffies).
+ *
+ * Returns: 1 if the maximum acceptable errors per timeframe has been
+ *     exceeded, 0 otherwise.
+ *
+ * This is a way to determine if the number of acceptable errors per time
+ * period has been exceeded. It is not accurate as there are cases in which
+ * this scheme will not work, for example if there are periodic occurrences
+ * of errors that straddle updates to the start time. This scheme is
+ * sufficient for our usage.
+ *
+ * To use, embed a 'struct edc' somewhere, initialize it with
+ * edc_init() and when an error hits:
+ *
+ * if (do_something_fails_with_a_soft_error) {
+ *	if (edc_inc(&my->edc, MAX_ERRORS, MAX_TIMEFRAME))
+ *		Oops, hard error, do something about it
+ *	else
+ *		Retry or ignore, depending on whatever
+ * }
+ */
+static inline int edc_inc(struct edc *edc, u16 max_err, u16 timeframe)
+{
+	unsigned long now;
+
+	now = jiffies;
+	if (time_after(now, edc->timestart + timeframe)) {
+		edc->errorcount = 1;
+		edc->timestart = now;
+	} else if (++edc->errorcount > max_err) {
+		edc->errorcount = 0;
+		edc->timestart = now;
+		return 1;
+	}
+	return 0;
+}
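+
+/*
+ * For illustration only: a USB completion handler in this driver
+ * would account a soft error roughly like this, resetting the device
+ * once the error density in urb_edc goes over the watermark:
+ *
+ *	if (edc_inc(&i2400mu->urb_edc,
+ *		    EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) {
+ *		dev_err(dev, "too many soft errors, resetting\n");
+ *		[queue a device reset]
+ *	} else
+ *		[retry or ignore the error]
+ */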
+
+/* Host-Device interface for USB */
+enum {
+	I2400M_USB_BOOT_RETRIES = 3,
+	I2400MU_MAX_NOTIFICATION_LEN = 256,
+	I2400MU_BLK_SIZE = 16,
+	I2400MU_PL_SIZE_MAX = 0x3EFF,
+
+	/* Device IDs */
+	USB_DEVICE_ID_I6050 = 0x0186,
+	USB_DEVICE_ID_I6050_2 = 0x0188,
+	USB_DEVICE_ID_I6150 = 0x07d6,
+	USB_DEVICE_ID_I6150_2 = 0x07d7,
+	USB_DEVICE_ID_I6150_3 = 0x07d9,
+	USB_DEVICE_ID_I6250 = 0x0187,
+};
+
+
+/**
+ * struct i2400mu - descriptor for a USB connected i2400m
+ *
+ * @i2400m: bus-generic i2400m implementation; has to be first (see
+ *     its documentation in i2400m.h).
+ *
+ * @usb_dev: pointer to our USB device
+ *
+ * @usb_iface: pointer to our USB interface
+ *
+ * @urb_edc: error density counter; used to keep a density-on-time tab
+ *     on how many soft (retryable or ignorable) errors we get. If we
+ *     go over the threshold, we consider the bus transport is failing
+ *     too much and reset.
+ *
+ * @notif_urb: URB for receiving notifications from the device.
+ *
+ * @tx_kthread: thread we use for data TX. We use a thread because in
+ *     order to do deep power saving and put the device to sleep, we
+ *     need to call usb_autopm_*() [blocking functions].
+ *
+ * @tx_wq: waitqueue for the TX kthread to sleep when there is no data
+ *     to be sent; when more data is available, it is woken up by
+ *     i2400mu_bus_tx_kick().
+ *
+ * @rx_kthread: thread we use for data RX. We use a thread because in
+ *     order to do deep power saving and put the device to sleep, we
+ *     need to call usb_autopm_*() [blocking functions].
+ *
+ * @rx_wq: waitqueue for the RX kthread to sleep when there is no data
+ *     to receive. When data is available, it is woken up by
+ *     usb-notif.c:i2400mu_notification_grok().
+ *
+ * @rx_pending_count: number of rx-data-ready notifications that were
+ *     still not handled by the RX kthread.
+ *
+ * @rx_size: current RX buffer size that is being used.
+ *
+ * @rx_size_acc: accumulator of the sizes of the previous read
+ *     transactions.
+ *
+ * @rx_size_cnt: number of read transactions accumulated in
+ *     @rx_size_acc.
+ *
+ * @do_autopm: disable(0)/enable(>0) calling the
+ *     usb_autopm_get/put_interface() barriers when executing
+ *     commands. See doc in i2400mu_suspend() for more information.
+ *
+ * @rx_size_auto_shrink: if true, the rx_size is shrunk
+ *     automatically based on the average size of the received
+ *     transactions. This allows the receive code to allocate smaller
+ *     chunks of memory and thus reduce pressure on the memory
+ *     allocator by not wasting so much space. By default it is
+ *     enabled.
+ *
+ * @debugfs_dentry: hookup for debugfs files.
+ *     These have to be in a separate directory, a child of
+ *     (wimax_dev->debugfs_dentry) so they can be removed when the
+ *     module unloads, as we don't keep each dentry.
+ */
+struct i2400mu {
+	struct i2400m i2400m;		/* FIRST! See doc */
+
+	struct usb_device *usb_dev;
+	struct usb_interface *usb_iface;
+	struct edc urb_edc;		/* Error density counter */
+	struct i2400m_endpoint_cfg endpoint_cfg;
+
+	struct urb *notif_urb;
+	struct task_struct *tx_kthread;
+	wait_queue_head_t tx_wq;
+
+	struct task_struct *rx_kthread;
+	wait_queue_head_t rx_wq;
+	atomic_t rx_pending_count;
+	size_t rx_size, rx_size_acc, rx_size_cnt;
+	atomic_t do_autopm;
+	u8 rx_size_auto_shrink;
+
+	struct dentry *debugfs_dentry;
+	unsigned i6050:1;	/* 1 if this is a 6050 based SKU */
+};
+
+
+static inline
+void i2400mu_init(struct i2400mu *i2400mu)
+{
+	i2400m_init(&i2400mu->i2400m);
+	edc_init(&i2400mu->urb_edc);
+	init_waitqueue_head(&i2400mu->tx_wq);
+	atomic_set(&i2400mu->rx_pending_count, 0);
+	init_waitqueue_head(&i2400mu->rx_wq);
+	i2400mu->rx_size = PAGE_SIZE - sizeof(struct skb_shared_info);
+	atomic_set(&i2400mu->do_autopm, 1);
+	i2400mu->rx_size_auto_shrink = 1;
+}
+
+int i2400mu_notification_setup(struct i2400mu *);
+void i2400mu_notification_release(struct i2400mu *);
+
+int i2400mu_rx_setup(struct i2400mu *);
+void i2400mu_rx_release(struct i2400mu *);
+void i2400mu_rx_kick(struct i2400mu *);
+
+int i2400mu_tx_setup(struct i2400mu *);
+void i2400mu_tx_release(struct i2400mu *);
+void i2400mu_bus_tx_kick(struct i2400m *);
+
+ssize_t i2400mu_bus_bm_cmd_send(struct i2400m *,
+				const struct i2400m_bootrom_header *, size_t,
+				int);
+ssize_t i2400mu_bus_bm_wait_for_ack(struct i2400m *,
+				    struct i2400m_bootrom_header *, size_t);
+#endif /* #ifndef __I2400M_USB_H__ */
diff --git a/drivers/staging/wimax/i2400m/i2400m.h b/drivers/staging/wimax/i2400m/i2400m.h
new file mode 100644
index 000000000000..de22cc6f2c5c
--- /dev/null
+++ b/drivers/staging/wimax/i2400m/i2400m.h
@@ -0,0 +1,970 @@
+/*
+ * Intel Wireless WiMAX Connection 2400m
+ * Declarations for bus-generic internal APIs
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * Intel Corporation
+ * Inaky Perez-Gonzalez
+ * Yanir Lubetkin
+ *  - Initial implementation
+ *
+ *
+ * GENERAL DRIVER ARCHITECTURE
+ *
+ * The i2400m driver is split in the following two major parts:
+ *
+ *  - bus specific driver
+ *  - bus generic driver (this part)
+ *
+ * The bus specific driver sets up stuff specific to the bus the
+ * device is connected to (USB, PCI, tam-tam...non-authoritative
+ * nor binding list) which is basically the device-model management
+ * (probe/disconnect, etc), moving data from device to kernel and
+ * back, doing the power saving details and resetting the device.
+ *
+ * For details on each bus-specific driver, see its include file,
+ * i2400m-BUSNAME.h
+ *
+ * The bus-generic functionality break-up is:
+ *
+ *  - Firmware upload: fw.c - takes care of uploading firmware to the
+ *    device. bus-specific driver just needs to provide a way to
+ *    execute boot-mode commands and to reset the device.
+ *
+ *  - RX handling: rx.c - receives data from the bus-specific code and
+ *    feeds it to the network or WiMAX stack or uses it to modify
+ *    the driver state. bus-specific driver only has to receive
+ *    frames and pass them to this module.
+ *
+ *  - TX handling: tx.c - manages the TX FIFO queue and provides means
+ *    for the bus-specific TX code to pull data from the FIFO
+ *    queue. bus-specific code just pulls frames from this module
+ *    to send them to the device.
+ *
+ *  - netdev glue: netdev.c - interface with Linux networking
+ *    stack. Pass around data frames, and configure when the
+ *    device is up and running or shutdown (through ifconfig up /
+ *    down). Bus-generic only.
+ *
+ *  - control ops: control.c - implements various commands for
+ *    controlling the device. bus-generic only.
+ *
+ *  - device model glue: driver.c - implements helpers for the
+ *    device-model glue done by the bus-specific layer
+ *    (setup/release the driver resources), turning the device on
+ *    and off, handling the device reboots/resets and a few simple
+ *    WiMAX stack ops.
+ *
+ * Code is also broken up in linux-glue / device-glue.
+ *
+ * Linux glue contains functions that deal mostly with gluing with the
+ * rest of the Linux kernel.
+ *
+ * Device-glue are functions that deal mostly with the way the device
+ * does things and talk the device's language.
+ *
+ * device-glue code is licensed BSD so other open source OSes can take
+ * it to implement their drivers.
+ *
+ *
+ * APIs AND HEADER FILES
+ *
+ * This bus generic code exports three APIs:
+ *
+ *  - HDI (host-device interface) definitions common to all busses
+ *    (include/linux/wimax/i2400m.h); these can also be used by user
+ *    space code.
+ *  - internal API for the bus-generic code
+ *  - external API for the bus-specific drivers
+ *
+ *
+ * LIFE CYCLE:
+ *
+ * When the bus-specific driver probes, it allocates a network device
+ * with enough space for its data structure, which must contain a
+ * &struct i2400m at the top.
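+ *
+ * For illustration only (the concrete names here, i2400mu and the
+ * "wmx%d" template, are from the USB implementation): a bus driver's
+ * probe would allocate it roughly like
+ *
+ *	net_dev = alloc_netdev(sizeof(struct i2400mu), "wmx%d",
+ *			       NET_NAME_UNKNOWN, i2400m_netdev_setup);
+ *	i2400mu = netdev_priv(net_dev);
+ *	i2400mu->i2400m.wimax_dev.net_dev = net_dev;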
+ *
+ * On probe, it needs to fill the i2400m members marked as [fill], as
+ * well as i2400m->wimax_dev.net_dev and call i2400m_setup(). The
+ * i2400m driver will only register with the WiMAX and network stacks;
+ * the only access done to the device is to read the MAC address so we
+ * can register a network device.
+ *
+ * The high-level call flow is:
+ *
+ * bus_probe()
+ *   i2400m_setup()
+ *     i2400m->bus_setup()
+ *     boot rom initialization / read mac addr
+ *     network / WiMAX stacks registration
+ *     i2400m_dev_start()
+ *       i2400m->bus_dev_start()
+ *       i2400m_dev_initialize()
+ *
+ * At this point, control and data communications are possible.
+ *
+ * The reverse applies for a disconnect() call:
+ *
+ * bus_disconnect()
+ *   i2400m_release()
+ *     i2400m_dev_stop()
+ *       i2400m_dev_shutdown()
+ *       i2400m->bus_dev_stop()
+ *     network / WiMAX stack unregistration
+ *     i2400m->bus_release()
+ *
+ * While the device is up, it might reset. The bus-specific driver has
+ * to catch that situation and call i2400m_dev_reset_handle() to deal
+ * with it (reset the internal driver structures and go back to square
+ * one).
+ */
+
+#ifndef __I2400M_H__
+#define __I2400M_H__
+
+#include <linux/usb.h>
+#include <linux/netdevice.h>
+#include <linux/completion.h>
+#include <linux/rwsem.h>
+#include <linux/atomic.h>
+#include "../net-wimax.h"
+#include "linux-wimax-i2400m.h"
+#include <asm/byteorder.h>
+
+enum {
+/* netdev interface */
+	/*
+	 * Out of NWG spec (R1_v1.2.2), 3.3.3 ASN Bearer Plane MTU Size
+	 *
+	 * The MTU is 1400 or less
+	 */
+	I2400M_MAX_MTU = 1400,
+};
+
+/* Misc constants */
+enum {
+	/* Size of the Boot Mode Command buffer */
+	I2400M_BM_CMD_BUF_SIZE = 16 * 1024,
+	I2400M_BM_ACK_BUF_SIZE = 256,
+};
+
+enum {
+	/* Maximum number of times a bus reset can be retried */
+	I2400M_BUS_RESET_RETRIES = 3,
+};
+
+/**
+ * struct i2400m_poke_table - Hardware poke table for the Intel 2400m
+ *
+ * This structure will be used to create a device specific poke table
+ * to put the device in a consistent state at boot time.
+ *
+ * @address: The device address to poke
+ *
+ * @data: The data value to poke to the device address
+ *
+ */
+struct i2400m_poke_table {
+	__le32 address;
+	__le32 data;
+};
+
+#define I2400M_FW_POKE(a, d) {		\
+	.address = cpu_to_le32(a),	\
+	.data = cpu_to_le32(d)		\
+}
+
+
+/**
+ * i2400m_reset_type - methods to reset a device
+ *
+ * @I2400M_RT_WARM: Reset without device disconnection, device handles
+ *     are kept valid but state is back to power on, with firmware
+ *     re-uploaded.
+ * @I2400M_RT_COLD: Tell the device to disconnect itself from the bus
+ *     and reconnect. Renders all device handles invalid.
+ * @I2400M_RT_BUS: Tells the bus to reset the device; last measure
+ *     used when both types above don't work.
+ */
+enum i2400m_reset_type {
+	I2400M_RT_WARM,	/* first measure */
+	I2400M_RT_COLD,	/* second measure */
+	I2400M_RT_BUS,	/* call in artillery */
+};
+
+struct i2400m_reset_ctx;
+struct i2400m_roq;
+struct i2400m_barker_db;
+
+/**
+ * struct i2400m - descriptor for an Intel 2400m
+ *
+ * Members marked with [fill] must be filled out/initialized before
+ * calling i2400m_setup().
+ *
+ * Note the @bus_setup/@bus_release, @bus_dev_start/@bus_dev_stop
+ * call pairs are very much doing almost the same, and depending on
+ * the underlying bus, some stuff has to be put in one or the
+ * other. The idea of setup/release is that they set up the minimal
+ * amount needed for loading firmware, whereas dev_start/stop set up
+ * the rest needed to do full data/control traffic.
+ *
+ * @bus_tx_block_size: [fill] USB imposes a 16-byte block size, but other
+ *     busses will differ.
+ *     So we have a tx_blk_size variable that the bus layer sets to
+ *     tell the engine how much of that we need.
+ *
+ * @bus_tx_room_min: [fill] Minimum room required while allocating
+ *     TX queue's buffer space for message header. USB requires
+ *     16 bytes. Refer to bus specific driver code for details.
+ *
+ * @bus_pl_size_max: [fill] Maximum payload size.
+ *
+ * @bus_setup: [optional fill] Function called by the bus-generic code
+ *     [i2400m_setup()] to setup the basic bus-specific communications
+ *     to the device needed to load firmware. See LIFE CYCLE above.
+ *
+ *     NOTE: Doesn't need to upload the firmware, as that is taken
+ *     care of by the bus-generic code.
+ *
+ * @bus_release: [optional fill] Function called by the bus-generic
+ *     code [i2400m_release()] to shutdown the basic bus-specific
+ *     communications to the device needed to load firmware. See
+ *     LIFE CYCLE above.
+ *
+ *     This function does not need to reset the device, just tear down
+ *     all the host resources created to handle communication with
+ *     the device.
+ *
+ * @bus_dev_start: [optional fill] Function called by the bus-generic
+ *     code [i2400m_dev_start()] to do things needed to start the
+ *     device. See LIFE CYCLE above.
+ *
+ *     NOTE: Doesn't need to upload the firmware, as that is taken
+ *     care of by the bus-generic code.
+ *
+ * @bus_dev_stop: [optional fill] Function called by the bus-generic
+ *     code [i2400m_dev_stop()] to do things needed for stopping the
+ *     device. See LIFE CYCLE above.
+ *
+ *     This function does not need to reset the device, just tear down
+ *     all the host resources created to handle communication with
+ *     the device.
+ *
+ * @bus_tx_kick: [fill] Function called by the bus-generic code to let
+ *     the bus-specific code know that there is data available in the
+ *     TX FIFO for transmission to the device.
+ *
+ *     This function cannot sleep.
+ *
+ * @bus_reset: [fill] Function called by the bus-generic code to reset
+ *     the device in various ways. Doesn't need to wait for the
+ *     reset to finish.
+ *
+ *     If warm or cold reset fail, this function is expected to do a
+ *     bus-specific reset (eg: USB reset) to get the device to a
+ *     working state (even if it implies device disconnection).
+ *
+ *     Note the warm reset is used by the firmware uploader to
+ *     reinitialize the device.
+ *
+ *     IMPORTANT: this is called very early in the device setup
+ *     process, so it cannot rely on common infrastructure being laid
+ *     out.
+ *
+ *     IMPORTANT: don't call reset on RT_BUS with i2400m->init_mutex
+ *     held, as the .pre/.post reset handlers will deadlock.
+ *
+ * @bus_bm_retries: [fill] How many times shall a firmware upload /
+ *     device initialization be retried? Different models of the same
+ *     device might need different values, hence it is set by the
+ *     bus-specific driver. Note this value is used in two places,
+ *     i2400m_fw_dnload() and __i2400m_dev_start(); they won't become
+ *     multiplicative (__i2400m_dev_start() calling N times
+ *     i2400m_fw_dnload() and this trying N times to download the
+ *     firmware), as __i2400m_dev_start() only retries if the
+ *     firmware crashed while initializing the device (not in the
+ *     general case).
+ *
+ * @bus_bm_cmd_send: [fill] Function called to send a boot-mode
+ *     command. Flags are defined in 'enum i2400m_bm_cmd_flags'. This
+ *     is synchronous and has to return 0 if ok or < 0 errno code in
+ *     any error condition.
+ *
+ * @bus_bm_wait_for_ack: [fill] Function called to wait for a
+ *     boot-mode notification (that can be a response to a previously
+ *     issued command or an asynchronous one). Will read until all the
+ *     indicated size is read or timeout. Reading more or less data
+ *     than asked for is an error condition. Return 0 if ok, < 0 errno
+ *     code on error.
+ *
+ *     The caller to this function will check if the response is a
+ *     barker that indicates the device going into reset mode.
+ *
+ * @bus_fw_names: [fill] a NULL-terminated array with the names of the
+ *     firmware images to try loading. This is made a list so we can
+ *     support backward compatibility of firmware releases (eg: if we
+ *     can't find the default v1.4, we try v1.3). In general, the name
+ *     should be i2400m-fw-X-VERSION.sbcf, where X is the bus name.
+ *     The list is tried in order and the first one that loads is
+ *     used. The fw loader will set i2400m->fw_name to point to the
+ *     active firmware image.
+ *
+ * @bus_bm_mac_addr_impaired: [fill] Set to true if the device's MAC
+ *     address provided in boot mode is kind of broken and needs to
+ *     be re-read later on.
+ *
+ * @bus_bm_pokes_table: [fill/optional] A table of device addresses
+ *     and values that will be poked at device init time to move the
+ *     device to the correct state for the type of boot/firmware being
+ *     used. This table MUST be terminated with (0x000000,
+ *     0x00000000) or bad things will happen.
+ *
+ *
+ * @wimax_dev: WiMAX generic device for linkage into the kernel WiMAX
+ *     stack. Due to the way a net_device is allocated, we need to
+ *     force this to be the first field so that we can get from
+ *     netdev_priv() the right pointer.
+ *
+ * @updown: the device is up and ready for transmitting control and
+ *     data packets. This implies @ready (communication infrastructure
+ *     with the device is ready) and the device's firmware has been
+ *     loaded and the device initialized.
+ *
+ *     Write to it only inside a i2400m->init_mutex protected area
+ *     followed with a wmb(); rmb() before accessing (unless locked
+ *     inside i2400m->init_mutex). Read access can be loose like that
+ *     [just using rmb()] because the paths that use this also do
+ *     other error checks later on.
+ *
+ * @ready: Communication infrastructure with the device is ready, data
+ *     frames can start to be passed around (this is lighter than
+ *     using the WiMAX state for certain hot paths).
+ *
+ *     Write to it only inside a i2400m->init_mutex protected area
+ *     followed with a wmb(); rmb() before accessing (unless locked
+ *     inside i2400m->init_mutex). Read access can be loose like that
+ *     [just using rmb()] because the paths that use this also do
+ *     other error checks later on.
+ *
+ * @rx_reorder: 1 if RX reordering is enabled; this can only be
+ *     set at probe time.
+ *
+ * @state: device's state (as reported by it)
+ *
+ * @state_wq: waitqueue that is woken up whenever the state changes
+ *
+ * @tx_lock: spinlock to protect TX members
+ *
+ * @tx_buf: FIFO buffer for TX; we queue data here
+ *
+ * @tx_in: FIFO index for incoming data. Note this doesn't wrap around
+ *     and it is always greater than @tx_out.
+ *
+ * @tx_out: FIFO index for outgoing data
+ *
+ * @tx_msg: current TX message that is active in the FIFO for
+ *     appending payloads.
+ *
+ * @tx_sequence: current sequence number for TX messages from the
+ *     host to the device.
+ *
+ * @tx_msg_size: size of the current message being transmitted by the
+ *     bus-specific code.
+ *
+ * @tx_pl_num: total number of payloads sent
+ *
+ * @tx_pl_max: maximum number of payloads sent in a TX message
+ *
+ * @tx_pl_min: minimum number of payloads sent in a TX message
+ *
+ * @tx_num: number of TX messages sent
+ *
+ * @tx_size_acc: number of bytes in all TX messages sent
+ *     (this is different to net_dev's statistics as it also counts
+ *     control messages).
+ *
+ * @tx_size_min: smallest TX message sent.
+ *
+ * @tx_size_max: biggest TX message sent.
+ *
+ * @rx_lock: spinlock to protect RX members and rx_roq_refcount.
+ *
+ * @rx_pl_num: total number of payloads received
+ *
+ * @rx_pl_max: maximum number of payloads received in a RX message
+ *
+ * @rx_pl_min: minimum number of payloads received in a RX message
+ *
+ * @rx_num: number of RX messages received
+ *
+ * @rx_size_acc: number of bytes in all RX messages received
+ *     (this is different to net_dev's statistics as it also counts
+ *     control messages).
+ *
+ * @rx_size_min: smallest RX message received.
+ *
+ * @rx_size_max: biggest RX message received.
+ *
+ * @rx_roq: RX ReOrder queues. (fw >= v1.4) When packets are received
+ *     out of order, the device will ask the driver to hold certain
+ *     packets until the ones that are received out of order can be
+ *     delivered. Then the driver can release them to the host. See
+ *     drivers/net/i2400m/rx.c for details.
+ *
+ * @rx_roq_refcount: refcount rx_roq. This refcounts any access to
+ *     rx_roq thus preventing rx_roq being destroyed when rx_roq
+ *     is being accessed. rx_roq_refcount is protected by rx_lock.
+ *
+ * @rx_reports: reports received from the device that couldn't be
+ *     processed because the driver wasn't ready yet; when ready,
+ *     they are pulled from here and chewed.
+ *
+ * @rx_reports_ws: Work struct used to kick a scan of the RX reports
+ *     list and to process each.
+ *
+ * @src_mac_addr: MAC address used to make ethernet packets appear to
+ *     be coming from. This is generated at i2400m_setup() time and
+ *     used during the life cycle of the instance. See
+ *     i2400m_fake_eth_header().
+ *
+ * @init_mutex: Mutex used for serializing the device bringup
+ *     sequence; this way if the device reboots in the middle, we
+ *     don't try to do a bringup again while we are tearing down the
+ *     one that failed.
+ *
+ *     Can't reuse @msg_mutex because from within the bringup sequence
+ *     we need to send messages to the device and thus use @msg_mutex.
+ *
+ * @msg_mutex: mutex used to send control commands to the device (we
+ *     only allow one at a time, per host-device interface design).
+ *
+ * @msg_completion: used to wait for an ack to a control command sent
+ *     to the device.
+ *
+ * @ack_skb: used to store the actual ack to a control command if the
+ *     reception of the command was successful. Otherwise, an ERR_PTR()
+ *     errno code that indicates what failed with the ack reception.
+ *
+ *     Only valid after @msg_completion is woken up. Only updateable
+ *     if @msg_completion is armed. Only touched by
+ *     i2400m_msg_to_dev().
+ *
+ *     Protected by @rx_lock. In theory the command execution flow is
+ *     sequential, but in case the device sends an out-of-phase or
+ *     very delayed response, we need to avoid it trampling current
+ *     execution.
+ *
+ * @bm_cmd_buf: boot mode command buffer for composing firmware upload
+ *     commands.
+ *
+ *     USB can't r/w to stack, vmalloc, etc...as well, we end up
+ *     having to alloc/free a lot to compose commands, so we use these
+ *     for staging and not having to realloc all the time.
+ *
+ *     This assumes the code always runs serialized.
+ *     Only one thread can call i2400m_bm_cmd() at the same time.
+ *
+ * @bm_ack_buf: boot mode acknowledge buffer for staging reception of
+ *     responses to commands.
+ *
+ *     See @bm_cmd_buf.
+ *
+ * @work_queue: work queue for processing device reports. This
+ *     workqueue cannot be used for processing TX or RX to the device,
+ *     as from it we'll process device reports, which might require
+ *     further communication with the device.
+ *
+ * @debugfs_dentry: hookup for debugfs files.
+ *     These have to be in a separate directory, a child of
+ *     (wimax_dev->debugfs_dentry) so they can be removed when the
+ *     module unloads, as we don't keep each dentry.
+ *
+ * @fw_name: name of the firmware image that is currently being used.
+ *
+ * @fw_version: version of the firmware interface, Major.minor,
+ *     encoded in the high word and low word (major << 16 | minor).
+ *
+ * @fw_hdrs: NULL-terminated array of pointers to the firmware
+ *     headers. This is only available during firmware load time.
+ *
+ * @fw_cached: Used to cache firmware when the system goes to
+ *     suspend/standby/hibernation (as on resume we can't read it). If
+ *     NULL, no firmware was cached, read it. If ~0, you can't read
+ *     any firmware files (the system still didn't come out of suspend
+ *     and failed to cache one), so abort; otherwise, a valid cached
+ *     firmware to be used. Access to this variable is protected by
+ *     the spinlock i2400m->rx_lock.
+ *
+ * @barker: barker type that the device uses; this is initialized by
+ *     i2400m_is_boot_barker() the first time it is called. Then it
+ *     won't change during the life cycle of the device and every time
+ *     a boot barker is received, it is just verified for it being the
+ *     same.
+ *
+ * @pm_notifier: used to register for PM events
+ *
+ * @bus_reset_retries: counter for the number of bus resets attempted for
+ *     this boot. It's not for tracking the number of bus resets during
+ *     the whole driver life cycle (from insmod to rmmod) but for the
+ *     number of dev_start() executed until dev_start() returns a success
+ *     (ie: a good boot means a dev_stop() followed by a successful
+ *     dev_start()). dev_reset_handler() increments this counter whenever
+ *     it is triggering a bus reset. It checks this counter to decide if a
+ *     subsequent bus reset should be retried. dev_reset_handler() retries
+ *     the bus reset until dev_start() succeeds or the counter reaches
+ *     I2400M_BUS_RESET_RETRIES. The counter is cleared to 0 in
+ *     dev_reset_handle() when dev_start() returns a success,
+ *     ie: a successful boot is completed.
+ *
+ * @alive: flag to denote if the device *should* be alive. This flag is
+ *     everything like @updown (see doc for @updown) except reflecting
+ *     the device state *we expect* rather than the actual state as denoted
+ *     by @updown. It is set 1 whenever @updown is set 1 in dev_start().
+ *     Then the device is expected to be alive all the time
+ *     (i2400m->alive remains 1) until the driver is removed. Therefore
+ *     all the device reboot events detected can be still handled properly
+ *     by either dev_reset_handle() or .pre_reset/.post_reset as long as
+ *     the driver is present. It is set 0 along with @updown in dev_stop().
+ *
+ * @error_recovery: flag to denote if we are ready to take an error recovery.
+ *     0 for ready to take an error recovery; 1 for not ready. It is
+ *     initialized to 1 in probe() since we don't tend to take any error
+ *     recovery during probe(). It is decremented by 1 whenever dev_start()
+ *     succeeds to indicate we are ready to take error recovery from now on.
+ *     It is checked every time we want to schedule an error recovery. If an
+ *     error recovery is already in place (error_recovery was set 1), we
+ *     should not schedule another one until the last one is done.
+ */
+struct i2400m {
+	struct wimax_dev wimax_dev;	/* FIRST! See doc */
+
+	unsigned updown:1;		/* Network device is up or down */
+	unsigned boot_mode:1;		/* is the device in boot mode? */
+	unsigned sboot:1;		/* signed or unsigned fw boot */
+	unsigned ready:1;		/* Device comm infrastructure ready */
+	unsigned rx_reorder:1;		/* RX reorder is enabled */
+	u8 trace_msg_from_user;		/* echo rx msgs to 'trace' pipe */
+					/* typed u8 so /sys/kernel/debug/u8 can tweak */
+	enum i2400m_system_state state;
+	wait_queue_head_t state_wq;	/* Woken up when on state updates */
+
+	size_t bus_tx_block_size;
+	size_t bus_tx_room_min;
+	size_t bus_pl_size_max;
+	unsigned bus_bm_retries;
+
+	int (*bus_setup)(struct i2400m *);
+	int (*bus_dev_start)(struct i2400m *);
+	void (*bus_dev_stop)(struct i2400m *);
+	void (*bus_release)(struct i2400m *);
+	void (*bus_tx_kick)(struct i2400m *);
+	int (*bus_reset)(struct i2400m *, enum i2400m_reset_type);
+	ssize_t (*bus_bm_cmd_send)(struct i2400m *,
+				   const struct i2400m_bootrom_header *,
+				   size_t, int flags);
+	ssize_t (*bus_bm_wait_for_ack)(struct i2400m *,
+				       struct i2400m_bootrom_header *, size_t);
+	const char **bus_fw_names;
+	unsigned bus_bm_mac_addr_impaired:1;
+	const struct i2400m_poke_table *bus_bm_pokes_table;
+
+	spinlock_t tx_lock;		/* protect TX state */
+	void *tx_buf;
+	size_t tx_in, tx_out;
+	struct i2400m_msg_hdr *tx_msg;
+	size_t tx_sequence, tx_msg_size;
+	/* TX stats */
+	unsigned tx_pl_num, tx_pl_max, tx_pl_min,
+		tx_num, tx_size_acc, tx_size_min, tx_size_max;
+
+	/* RX stuff */
+	/* protect RX state and rx_roq_refcount */
+	spinlock_t rx_lock;
+	unsigned rx_pl_num, rx_pl_max, rx_pl_min,
+		rx_num, rx_size_acc, rx_size_min, rx_size_max;
+	struct i2400m_roq *rx_roq;	/* access is refcounted */
+	struct kref rx_roq_refcount;	/* refcount access to rx_roq */
+	u8 src_mac_addr[ETH_HLEN];
+	struct list_head rx_reports;	/* under rx_lock! */
+	struct work_struct rx_report_ws;
+
+	struct mutex msg_mutex;		/* serialize command execution */
+	struct completion msg_completion;
+	struct sk_buff *ack_skb;	/* protected by rx_lock */
+
+	void *bm_ack_buf;		/* for receiving acks over USB */
+	void *bm_cmd_buf;		/* for issuing commands over USB */
+
+	struct workqueue_struct *work_queue;
+
+	struct mutex init_mutex;	/* protect bringup seq */
+	struct i2400m_reset_ctx *reset_ctx;	/* protected by init_mutex */
+
+	struct work_struct wake_tx_ws;
+	struct sk_buff *wake_tx_skb;
+
+	struct work_struct reset_ws;
+	const char *reset_reason;
+
+	struct work_struct recovery_ws;
+
+	struct dentry *debugfs_dentry;
+	const char *fw_name;		/* name of the current firmware image */
+	unsigned long fw_version;	/* version of the firmware interface */
+	const struct i2400m_bcf_hdr **fw_hdrs;
+	struct i2400m_fw *fw_cached;	/* protected by rx_lock */
+	struct i2400m_barker_db *barker;
+
+	struct notifier_block pm_notifier;
+
+	/* counting bus reset retries in this boot */
+	atomic_t bus_reset_retries;
+
+	/* if the device is expected to be alive */
+	unsigned alive;
+
+	/* 0 if we are ready for error recovery; 1 if not ready */
+	atomic_t error_recovery;
+
+};
+
+
+/*
+ * Bus-generic internal APIs
+ * -------------------------
+ */
+
+static inline
+struct i2400m *wimax_dev_to_i2400m(struct wimax_dev *wimax_dev)
+{
+	return container_of(wimax_dev, struct i2400m, wimax_dev);
+}
+
+static inline
+struct i2400m *net_dev_to_i2400m(struct net_device *net_dev)
+{
+	return wimax_dev_to_i2400m(netdev_priv(net_dev));
+}
+
+/*
+ * Boot mode support
+ */
+
+/**
+ * i2400m_bm_cmd_flags - flags to i2400m_bm_cmd()
+ *
+ * @I2400M_BM_CMD_RAW: send the command block as-is, without doing any
+ *     extra processing for adding CRC.
+ */
+enum i2400m_bm_cmd_flags {
+	I2400M_BM_CMD_RAW = 1 << 2,
+};
+
+/**
+ * i2400m_bri - Boot-ROM indicators
+ *
+ * Flags for i2400m_bootrom_init() and i2400m_dev_bootstrap() [which
+ * are passed from things like i2400m_setup()]. Can be combined with
+ * |.
+ *
+ * @I2400M_BRI_SOFT: The device rebooted already and a reboot
+ *     barker was received, proceed directly to ack the boot sequence.
+ * @I2400M_BRI_NO_REBOOT: Do not reboot the device and proceed
+ *     directly to wait for a reboot barker from the device.
+ * @I2400M_BRI_MAC_REINIT: We need to reinitialize the boot
+ *     rom after reading the MAC address. This is quite a dirty hack,
+ *     if you ask me -- the device requires the bootrom to be
+ *     initialized after reading the MAC address.
+ */
+enum i2400m_bri {
+	I2400M_BRI_SOFT       = 1 << 1,
+	I2400M_BRI_NO_REBOOT  = 1 << 2,
+	I2400M_BRI_MAC_REINIT = 1 << 3,
+};
+
+void i2400m_bm_cmd_prepare(struct i2400m_bootrom_header *);
+int i2400m_dev_bootstrap(struct i2400m *, enum i2400m_bri);
+int i2400m_read_mac_addr(struct i2400m *);
+int i2400m_bootrom_init(struct i2400m *, enum i2400m_bri);
+int i2400m_is_boot_barker(struct i2400m *, const void *, size_t);
+static inline
+int i2400m_is_d2h_barker(const void *buf)
+{
+	const __le32 *barker = buf;
+	return le32_to_cpu(*barker) == I2400M_D2H_MSG_BARKER;
+}
+void i2400m_unknown_barker(struct i2400m *, const void *, size_t);
+
+/* Make/grok boot-rom header commands */
+
+static inline
+__le32 i2400m_brh_command(enum i2400m_brh_opcode opcode, unsigned use_checksum,
+			  unsigned direct_access)
+{
+	return cpu_to_le32(
+		I2400M_BRH_SIGNATURE
+		| (direct_access ? I2400M_BRH_DIRECT_ACCESS : 0)
+		| I2400M_BRH_RESPONSE_REQUIRED /* response always required */
+		| (use_checksum ? I2400M_BRH_USE_CHECKSUM : 0)
+		| (opcode & I2400M_BRH_OPCODE_MASK));
+}
+
+static inline
+void i2400m_brh_set_opcode(struct i2400m_bootrom_header *hdr,
+			   enum i2400m_brh_opcode opcode)
+{
+	hdr->command = cpu_to_le32(
+		(le32_to_cpu(hdr->command) & ~I2400M_BRH_OPCODE_MASK)
+		| (opcode & I2400M_BRH_OPCODE_MASK));
+}
+
+static inline
+unsigned i2400m_brh_get_opcode(const struct i2400m_bootrom_header *hdr)
+{
+	return le32_to_cpu(hdr->command) & I2400M_BRH_OPCODE_MASK;
+}
+
+static inline
+unsigned i2400m_brh_get_response(const struct i2400m_bootrom_header *hdr)
+{
+	return (le32_to_cpu(hdr->command) & I2400M_BRH_RESPONSE_MASK)
+		>> I2400M_BRH_RESPONSE_SHIFT;
+}
+
+static inline
+unsigned i2400m_brh_get_use_checksum(const struct i2400m_bootrom_header *hdr)
+{
+	return le32_to_cpu(hdr->command) & I2400M_BRH_USE_CHECKSUM;
+}
+
+static inline
+unsigned i2400m_brh_get_response_required(
+	const struct i2400m_bootrom_header *hdr)
+{
+	return le32_to_cpu(hdr->command) & I2400M_BRH_RESPONSE_REQUIRED;
+}
+
+static inline
+unsigned i2400m_brh_get_direct_access(const struct i2400m_bootrom_header *hdr)
+{
+	return le32_to_cpu(hdr->command) & I2400M_BRH_DIRECT_ACCESS;
+}
+
+static inline
+unsigned i2400m_brh_get_signature(const struct i2400m_bootrom_header *hdr)
+{
+	return (le32_to_cpu(hdr->command) & I2400M_BRH_SIGNATURE_MASK)
+		>> I2400M_BRH_SIGNATURE_SHIFT;
+}
+
+
+/*
+ * Driver / device setup and internal functions
+ */
+void i2400m_init(struct i2400m *);
+int i2400m_reset(struct i2400m *, enum i2400m_reset_type);
+void i2400m_netdev_setup(struct net_device *net_dev);
+int i2400m_sysfs_setup(struct device_driver *);
+void i2400m_sysfs_release(struct device_driver *);
+int i2400m_tx_setup(struct i2400m *);
+void i2400m_wake_tx_work(struct work_struct *);
+void i2400m_tx_release(struct i2400m *);
+
+int i2400m_rx_setup(struct i2400m *);
+void i2400m_rx_release(struct i2400m *);
+
+void i2400m_fw_cache(struct i2400m *);
+void i2400m_fw_uncache(struct i2400m *);
+
+void i2400m_net_rx(struct i2400m *, struct sk_buff *, unsigned, const void *,
+		   int);
+void i2400m_net_erx(struct i2400m *, struct sk_buff *, enum i2400m_cs);
+void i2400m_net_wake_stop(struct i2400m *);
+enum i2400m_pt;
+int i2400m_tx(struct i2400m *, const void *, size_t, enum i2400m_pt);
+
+#ifdef CONFIG_DEBUG_FS
+void i2400m_debugfs_add(struct i2400m *);
+void i2400m_debugfs_rm(struct i2400m *);
+#else
+static inline void i2400m_debugfs_add(struct i2400m *i2400m) {}
+static inline void i2400m_debugfs_rm(struct i2400m *i2400m) {}
+#endif
+
+/* Initialize/shutdown the device */
+int i2400m_dev_initialize(struct i2400m *);
+void i2400m_dev_shutdown(struct i2400m *);
+
+extern struct attribute_group i2400m_dev_attr_group;
+
+
+/* HDI message's payload description handling */
+
+static inline
+size_t i2400m_pld_size(const struct i2400m_pld *pld)
+{
+	return I2400M_PLD_SIZE_MASK & le32_to_cpu(pld->val);
+}
+
+static inline
+enum i2400m_pt i2400m_pld_type(const struct i2400m_pld *pld)
+{
+	return (I2400M_PLD_TYPE_MASK & le32_to_cpu(pld->val))
+		>> I2400M_PLD_TYPE_SHIFT;
+}
+
+static inline
+void i2400m_pld_set(struct i2400m_pld *pld, size_t size,
+		    enum i2400m_pt type)
+{
+	pld->val = cpu_to_le32(
+		((type << I2400M_PLD_TYPE_SHIFT) & I2400M_PLD_TYPE_MASK)
+		| (size & I2400M_PLD_SIZE_MASK));
+}
+
+
+/*
+ * API for the bus-specific drivers
+ * --------------------------------
+ */
+
+static inline
+struct i2400m *i2400m_get(struct i2400m *i2400m)
+{
+	dev_hold(i2400m->wimax_dev.net_dev);
+	return i2400m;
+}
+
+static inline
+void i2400m_put(struct i2400m *i2400m)
+{
+	dev_put(i2400m->wimax_dev.net_dev);
+}
+
+int i2400m_dev_reset_handle(struct i2400m *, const char *);
+int i2400m_pre_reset(struct i2400m *);
+int i2400m_post_reset(struct i2400m *);
+void i2400m_error_recovery(struct i2400m *);
+
+/*
+ * _setup()/_release() are called by the probe/disconnect functions of
+ * the bus-specific drivers.
+ */
+int i2400m_setup(struct i2400m *, enum i2400m_bri bm_flags);
+void i2400m_release(struct i2400m *);
+
+int i2400m_rx(struct i2400m *, struct sk_buff *);
+struct i2400m_msg_hdr *i2400m_tx_msg_get(struct i2400m *, size_t *);
+void i2400m_tx_msg_sent(struct i2400m *);
+
+
+/*
+ * Utility functions
+ */
+
+static inline
+struct device *i2400m_dev(struct i2400m *i2400m)
+{
+	return i2400m->wimax_dev.net_dev->dev.parent;
+}
+
+int i2400m_msg_check_status(const struct i2400m_l3l4_hdr *, char *, size_t);
+int i2400m_msg_size_check(struct i2400m *, const struct i2400m_l3l4_hdr *,
+			  size_t);
+struct sk_buff *i2400m_msg_to_dev(struct i2400m *, const void *, size_t);
+void i2400m_msg_to_dev_cancel_wait(struct i2400m *, int);
+void i2400m_report_hook(struct i2400m *, const struct i2400m_l3l4_hdr *,
+			size_t);
+void i2400m_report_hook_work(struct work_struct *);
+int i2400m_cmd_enter_powersave(struct i2400m *);
+int i2400m_cmd_exit_idle(struct i2400m *);
+struct sk_buff *i2400m_get_device_info(struct i2400m *);
+int i2400m_firmware_check(struct i2400m *);
+int i2400m_set_idle_timeout(struct i2400m *, unsigned);
+
+static inline
+struct usb_endpoint_descriptor *usb_get_epd(struct usb_interface *iface, int ep)
+{
+	return &iface->cur_altsetting->endpoint[ep].desc;
+}
+
+int i2400m_op_rfkill_sw_toggle(struct wimax_dev *, enum wimax_rf_state);
+void i2400m_report_tlv_rf_switches_status(struct i2400m *,
+					  const struct i2400m_tlv_rf_switches_status *);
+
+/*
+ * Helpers for firmware backwards compatibility
+ *
+ * As we aim to support at least the firmware version that was
+ * released with the previous kernel/driver release, some code will be
+ * conditionally executed depending on the firmware version. On each
+ * release, the code to support fw releases past the last two ones
+ * will be purged.
+ *
+ * By making it depend on these macros, it is easier to keep a tab
+ * on what has to go and what does not.
+ */
+static inline
+unsigned i2400m_le_v1_3(struct i2400m *i2400m)
+{
+	/* running fw is lower or v1.3 */
+	return i2400m->fw_version <= 0x00090001;
+}
+
+static inline
+unsigned i2400m_ge_v1_4(struct i2400m *i2400m)
+{
+	/* running fw is higher or v1.4 */
+	return i2400m->fw_version >= 0x00090002;
+}
+
+
+/*
+ * Do a millisecond-sleep for allowing wireshark to dump all the data
+ * packets. Used only for debugging.
+ */
+static inline
+void __i2400m_msleep(unsigned ms)
+{
+#if 1
+#else
+	msleep(ms);
+#endif
+}
+
+
+/* module initialization helpers */
+int i2400m_barker_db_init(const char *);
+void i2400m_barker_db_exit(void);
+
+
+
+#endif /* #ifndef __I2400M_H__ */
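As a quick illustration of the payload descriptor helpers declared above (not part of the patch; the sizes are made up), encoding and decoding round-trip through the same 32-bit little-endian value:

	struct i2400m_pld pld;

	/* encode a 256-byte data payload into the descriptor */
	i2400m_pld_set(&pld, 256, I2400M_PT_DATA);

	/* decoding recovers the same fields */
	WARN_ON(i2400m_pld_size(&pld) != 256);
	WARN_ON(i2400m_pld_type(&pld) != I2400M_PT_DATA);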
diff --git a/drivers/staging/wimax/i2400m/linux-wimax-i2400m.h b/drivers/staging/wimax/i2400m/linux-wimax-i2400m.h
new file mode 100644
index 000000000000..fd198bc24a3c
--- /dev/null
+++ b/drivers/staging/wimax/i2400m/linux-wimax-i2400m.h
@@ -0,0 +1,572 @@
+/*
+ * Intel Wireless WiMax Connection 2400m
+ * Host-Device protocol interface definitions
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * Intel Corporation
+ * Inaky Perez-Gonzalez
+ *  - Initial implementation
+ *
+ *
+ * This header defines the data structures and constants used to
+ * communicate with the device.
+ *
+ * BOOTMODE/BOOTROM/FIRMWARE UPLOAD PROTOCOL
+ *
+ * The firmware upload protocol is quite simple and only requires a
+ * handful of commands. See drivers/net/wimax/i2400m/fw.c for more
+ * details.
+ *
+ * The BCF data structure is for the firmware file header.
+ *
+ *
+ * THE DATA / CONTROL PROTOCOL
+ *
+ * This is the normal protocol spoken with the device once the
+ * firmware is uploaded. It transports data payloads and control
+ * messages back and forth.
+ *
+ * It consists of 'messages' that pack one or more payloads each. The
+ * format is described in detail in drivers/net/wimax/i2400m/rx.c and
+ * tx.c.
+ *
+ *
+ * THE L3L4 PROTOCOL
+ *
+ * The term L3L4 refers to Layer 3 (the device), Layer 4 (the
+ * driver/host software).
+ *
+ * This is the control protocol used by the host to control the i2400m
+ * device (scan, connect, disconnect...). This is sent to / received
+ * as control frames. These frames consist of a header and zero or
+ * more TLVs with information. We call each control frame a "message".
+ *
+ * Each message is composed of:
+ *
+ * HEADER
+ * [TLV0 + PAYLOAD0]
+ * [TLV1 + PAYLOAD1]
+ * [...]
+ * [TLVN + PAYLOADN]
+ *
+ * The HEADER is defined by 'struct i2400m_l3l4_hdr'. The payloads are
+ * defined by a TLV structure (Type Length Value) which is a 'header'
+ * (struct i2400m_tlv_hdr) and then the payload.
+ *
+ * All integers are represented as Little Endian.
+ *
+ * - REQUESTS AND EVENTS
+ *
+ * The requests can be classified as follows:
+ *
+ *   COMMAND:  implies a request from the host to the device requesting
+ *             an action being performed. The device will reply with a
+ *             message (with the same type as the command), status and
+ *             no (TLV) payload. Execution of a command might cause
+ *             events (of different type) to be sent later on as the
+ *             device's state changes.
+ *
+ *   GET/SET:  similar to COMMAND, but will not cause other
+ *             EVENTs. The reply, in the case of GET, will contain
+ *             TLVs with the requested information.
+ *
+ *   EVENT:    asynchronous messages sent from the device, maybe as a
+ *             consequence of previous COMMANDs but disassociated from
+ *             them.
+ *
+ * Only one request might be pending at the same time (ie: don't
+ * parallelize nor post another GET request before the previous
+ * COMMAND has been acknowledged with its corresponding reply by the
+ * device).
+ *
+ * The different requests and their formats are described below:
+ *
+ *  I2400M_MT_*   Message types
+ *  I2400M_MS_*   Message status (for replies, events)
+ *  i2400m_tlv_*  TLVs
+ *
+ * data types are named 'struct i2400m_msg_OPNAME', OPNAME matching the
+ * operation.
+ */
+
+#ifndef __LINUX__WIMAX__I2400M_H__
+#define __LINUX__WIMAX__I2400M_H__
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/*
+ * Host Device Interface (HDI) common to all busses
+ */
+
+/* Boot-mode (firmware upload mode) commands */
+
+/* Header for the firmware file */
+struct i2400m_bcf_hdr {
+	__le32 module_type;
+	__le32 header_len;
+	__le32 header_version;
+	__le32 module_id;
+	__le32 module_vendor;
+	__le32 date;		/* BCD YYYYMMDD */
+	__le32 size;		/* in dwords */
+	__le32 key_size;	/* in dwords */
+	__le32 modulus_size;	/* in dwords */
+	__le32 exponent_size;	/* in dwords */
+	__u8 reserved[88];
+} __attribute__ ((packed));
+
+/* Boot mode opcodes */
+enum i2400m_brh_opcode {
+	I2400M_BRH_READ = 1,
+	I2400M_BRH_WRITE = 2,
+	I2400M_BRH_JUMP = 3,
+	I2400M_BRH_SIGNED_JUMP = 8,
+	I2400M_BRH_HASH_PAYLOAD_ONLY = 9,
+};
+
+/* Boot mode command masks and stuff */
+enum i2400m_brh {
+	I2400M_BRH_SIGNATURE = 0xcbbc0000,
+	I2400M_BRH_SIGNATURE_MASK = 0xffff0000,
+	I2400M_BRH_SIGNATURE_SHIFT = 16,
+	I2400M_BRH_OPCODE_MASK = 0x0000000f,
+	I2400M_BRH_RESPONSE_MASK = 0x000000f0,
+	I2400M_BRH_RESPONSE_SHIFT = 4,
+	I2400M_BRH_DIRECT_ACCESS = 0x00000400,
+	I2400M_BRH_RESPONSE_REQUIRED = 0x00000200,
+	I2400M_BRH_USE_CHECKSUM = 0x00000100,
+};
+
+
+/**
+ * i2400m_bootrom_header - Header for a boot-mode command
+ *
+ * @command: the command descriptor (composed with enum i2400m_brh)
+ * @target_addr: where on the device memory should the action be performed.
+ * @data_size: for read/write, amount of data to be read/written
+ * @block_checksum: checksum value (if applicable)
+ * @payload: the beginning of data attached to this header
+ */
+struct i2400m_bootrom_header {
+	__le32 command;		/* Compose with enum i2400_brh */
+	__le32 target_addr;
+	__le32 data_size;
+	__le32 block_checksum;
+	char payload[0];
+} __attribute__ ((packed));
+
+
+/*
+ * Data / control protocol
+ */
+
+/* Packet types for the host-device interface */
+enum i2400m_pt {
+	I2400M_PT_DATA = 0,
+	I2400M_PT_CTRL,
+	I2400M_PT_TRACE,	/* For device debug */
+	I2400M_PT_RESET_WARM,	/* device reset */
+	I2400M_PT_RESET_COLD,	/* USB[transport] reset, like reconnect */
+	I2400M_PT_EDATA,	/* Extended RX data */
+	I2400M_PT_ILLEGAL
+};
+
+
+/*
+ * Payload for a data packet
+ *
+ * This is prefixed to each and every outgoing DATA type.
+ */
+struct i2400m_pl_data_hdr {
+	__le32 reserved;
+} __attribute__((packed));
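+
+/*
+ * For illustration only: firmware upload code would compose one of
+ * the boot-mode headers above with the helpers from i2400m.h (the
+ * target address and chunk_len here are made up):
+ *
+ *	hdr.command = i2400m_brh_command(I2400M_BRH_WRITE, 1, 1);
+ *	hdr.target_addr = cpu_to_le32(0x00100000);
+ *	hdr.data_size = cpu_to_le32(chunk_len);
+ *
+ * with i2400m_bm_cmd_prepare() filling in the block checksum when
+ * I2400M_BRH_USE_CHECKSUM is set.
+ */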
+
+/*
+ * Payload for an extended data packet
+ *
+ * New in fw v1.4
+ *
+ * @reorder: if this payload has to be reordered or not (and how)
+ * @cs: the type of data in the packet, as defined per (802.16e
+ *     T11.13.19.1). Currently only 2 (IPv4 packet) is supported.
+ *
+ * This is prefixed to each and every INCOMING DATA packet.
+ */
+struct i2400m_pl_edata_hdr {
+	__le32 reorder;		/* bits defined in i2400m_ro */
+	__u8 cs;
+	__u8 reserved[11];
+} __attribute__((packed));
+
+enum i2400m_cs {
+	I2400M_CS_IPV4_0 = 0,
+	I2400M_CS_IPV4 = 2,
+};
+
+enum i2400m_ro {
+	I2400M_RO_NEEDED     = 0x01,
+	I2400M_RO_TYPE       = 0x03,
+	I2400M_RO_TYPE_SHIFT = 1,
+	I2400M_RO_CIN        = 0x0f,
+	I2400M_RO_CIN_SHIFT  = 4,
+	I2400M_RO_FBN        = 0x07ff,
+	I2400M_RO_FBN_SHIFT  = 8,
+	I2400M_RO_SN         = 0x07ff,
+	I2400M_RO_SN_SHIFT   = 21,
+};
+
+enum i2400m_ro_type {
+	I2400M_RO_TYPE_RESET = 0,
+	I2400M_RO_TYPE_PACKET,
+	I2400M_RO_TYPE_WS,
+	I2400M_RO_TYPE_PACKET_WS,
+};
+
+
+/* Misc constants */
+enum {
+	I2400M_PL_ALIGN = 16,	/* Payload data size alignment */
+	I2400M_PL_SIZE_MAX = 0x3EFF,
+	I2400M_MAX_PLS_IN_MSG = 60,
+	/* protocol barkers: sync sequences; for notifications they
+	 * are sent in groups of four. */
+	I2400M_H2D_PREVIEW_BARKER = 0xcafe900d,
+	I2400M_COLD_RESET_BARKER = 0xc01dc01d,
+	I2400M_WARM_RESET_BARKER = 0x50f750f7,
+	I2400M_NBOOT_BARKER = 0xdeadbeef,
+	I2400M_SBOOT_BARKER = 0x0ff1c1a1,
+	I2400M_SBOOT_BARKER_6050 = 0x80000001,
+	I2400M_ACK_BARKER = 0xfeedbabe,
+	I2400M_D2H_MSG_BARKER = 0xbeefbabe,
+};
+
+
+/*
+ * Hardware payload descriptor
+ *
+ * Bitfields encoded in a struct to enforce typing semantics.
+ *
+ * Look in rx.c and tx.c for a full description of the format.
+ */
+struct i2400m_pld {
+	__le32 val;
+} __attribute__ ((packed));
+
+#define I2400M_PLD_SIZE_MASK 0x00003fff
+#define I2400M_PLD_TYPE_SHIFT 16
+#define I2400M_PLD_TYPE_MASK 0x000f0000
+
+/*
+ * Header for a TX message or RX message
+ *
+ * @barker: preamble
+ * @size: used for management of the FIFO queue buffer; before
+ *     sending, this is converted to be a real preamble. This
+ *     indicates the real size of the TX message that starts at this
+ *     point. If the highest bit is set, then this message is to be
+ *     skipped.
+ * @sequence: sequence number of this message
+ * @offset: offset where the message itself starts -- see the comments
+ *     in the file header about message header and payload descriptor
+ *     alignment.
+ * @num_pls: number of payloads in this message
+ * @padding: amount of padding bytes at the end of the message to make
+ *     it block-size aligned
+ *
+ * Look in rx.c and tx.c for a full description of the format.
+ */
+struct i2400m_msg_hdr {
+	union {
+		__le32 barker;
+		__u32 size;	/* same size type as barker!! */
+	};
+	union {
+		__le32 sequence;
+		__u32 offset;	/* same size type as barker!! */
+	};
+	__le16 num_pls;
+	__le16 rsv1;
+	__le16 padding;
+	__le16 rsv2;
+	struct i2400m_pld pld[0];
+} __attribute__ ((packed));
+
+
+
+/*
+ * L3/L4 control protocol
+ */
+
+enum {
+	/* Interface version */
+	I2400M_L3L4_VERSION = 0x0100,
+};
+
+/* Message types */
+enum i2400m_mt {
+	I2400M_MT_RESERVED              = 0x0000,
+	I2400M_MT_INVALID               = 0xffff,
+	I2400M_MT_REPORT_MASK           = 0x8000,
+
+	I2400M_MT_GET_SCAN_RESULT       = 0x4202,
+	I2400M_MT_SET_SCAN_PARAM        = 0x4402,
+	I2400M_MT_CMD_RF_CONTROL        = 0x4602,
+	I2400M_MT_CMD_SCAN              = 0x4603,
+	I2400M_MT_CMD_CONNECT           = 0x4604,
+	I2400M_MT_CMD_DISCONNECT        = 0x4605,
+	I2400M_MT_CMD_EXIT_IDLE         = 0x4606,
+	I2400M_MT_GET_LM_VERSION        = 0x5201,
+	I2400M_MT_GET_DEVICE_INFO       = 0x5202,
+	I2400M_MT_GET_LINK_STATUS       = 0x5203,
+	I2400M_MT_GET_STATISTICS        = 0x5204,
+	I2400M_MT_GET_STATE             = 0x5205,
+	I2400M_MT_GET_MEDIA_STATUS      = 0x5206,
+	I2400M_MT_SET_INIT_CONFIG       = 0x5404,
+	I2400M_MT_CMD_INIT              = 0x5601,
+	I2400M_MT_CMD_TERMINATE         = 0x5602,
+	I2400M_MT_CMD_MODE_OF_OP        = 0x5603,
+	I2400M_MT_CMD_RESET_DEVICE      = 0x5604,
+	I2400M_MT_CMD_MONITOR_CONTROL   = 0x5605,
+	I2400M_MT_CMD_ENTER_POWERSAVE   = 0x5606,
+	I2400M_MT_GET_TLS_OPERATION_RESULT = 0x6201,
+	I2400M_MT_SET_EAP_SUCCESS       = 0x6402,
+	I2400M_MT_SET_EAP_FAIL          = 0x6403,
+	I2400M_MT_SET_EAP_KEY           = 0x6404,
+	I2400M_MT_CMD_SEND_EAP_RESPONSE = 0x6602,
+	I2400M_MT_REPORT_SCAN_RESULT    = 0xc002,
+	I2400M_MT_REPORT_STATE          = 0xd002,
+	I2400M_MT_REPORT_POWERSAVE_READY = 0xd005,
+	I2400M_MT_REPORT_EAP_REQUEST    = 0xe002,
+	I2400M_MT_REPORT_EAP_RESTART    = 0xe003,
+	I2400M_MT_REPORT_ALT_ACCEPT     = 0xe004,
+	I2400M_MT_REPORT_KEY_REQUEST    = 0xe005,
+};
+
+
+/*
+ * Message Ack Status codes
+ *
+ * When a message is replied-to, this status is reported.
+ */
+enum i2400m_ms {
+	I2400M_MS_DONE_OK                  = 0,
+	I2400M_MS_DONE_IN_PROGRESS         = 1,
+	I2400M_MS_INVALID_OP               = 2,
+	I2400M_MS_BAD_STATE                = 3,
+	I2400M_MS_ILLEGAL_VALUE            = 4,
+	I2400M_MS_MISSING_PARAMS           = 5,
+	I2400M_MS_VERSION_ERROR            = 6,
+	I2400M_MS_ACCESSIBILITY_ERROR      = 7,
+	I2400M_MS_BUSY                     = 8,
+	I2400M_MS_CORRUPTED_TLV            = 9,
+	I2400M_MS_UNINITIALIZED            = 10,
+	I2400M_MS_UNKNOWN_ERROR            = 11,
+	I2400M_MS_PRODUCTION_ERROR         = 12,
+	I2400M_MS_NO_RF                    = 13,
+	I2400M_MS_NOT_READY_FOR_POWERSAVE  = 14,
+	I2400M_MS_THERMAL_CRITICAL         = 15,
+	I2400M_MS_MAX
+};
+
+
+/**
+ * i2400m_tlv - enumeration of the different types of TLVs
+ *
+ * TLVs stand for type-length-value and are the header for a payload
+ * composed of almost anything. Each payload has a type assigned
+ * and a length.
+ */
+enum i2400m_tlv {
+	I2400M_TLV_L4_MESSAGE_VERSIONS = 129,
+	I2400M_TLV_SYSTEM_STATE = 141,
+	I2400M_TLV_MEDIA_STATUS = 161,
+	I2400M_TLV_RF_OPERATION = 162,
+	I2400M_TLV_RF_STATUS = 163,
+	I2400M_TLV_DEVICE_RESET_TYPE = 132,
+	I2400M_TLV_CONFIG_IDLE_PARAMETERS = 601,
+	I2400M_TLV_CONFIG_IDLE_TIMEOUT = 611,
+	I2400M_TLV_CONFIG_D2H_DATA_FORMAT = 614,
+	I2400M_TLV_CONFIG_DL_HOST_REORDER = 615,
+};
+
+
+struct i2400m_tlv_hdr {
+	__le16 type;
+	__le16 length;		/* payload's */
+	__u8   pl[0];
+} __attribute__((packed));
+
+
+struct i2400m_l3l4_hdr {
+	__le16 type;
+	__le16 length;		/* payload's */
+	__le16 version;
+	__le16 resv1;
+	__le16 status;
+	__le16 resv2;
+	struct i2400m_tlv_hdr pl[0];
+} __attribute__((packed));
+
+
+/**
+ * i2400m_system_state - different states of the device
+ */
+enum i2400m_system_state {
+	I2400M_SS_UNINITIALIZED = 1,
+	I2400M_SS_INIT,
+	I2400M_SS_READY,
+	I2400M_SS_SCAN,
+	I2400M_SS_STANDBY,
+	I2400M_SS_CONNECTING,
+	I2400M_SS_WIMAX_CONNECTED,
+	I2400M_SS_DATA_PATH_CONNECTED,
+	I2400M_SS_IDLE,
+	I2400M_SS_DISCONNECTING,
+	I2400M_SS_OUT_OF_ZONE,
+	I2400M_SS_SLEEPACTIVE,
+	I2400M_SS_PRODUCTION,
+	I2400M_SS_CONFIG,
+	I2400M_SS_RF_OFF,
+	I2400M_SS_RF_SHUTDOWN,
+	I2400M_SS_DEVICE_DISCONNECT,
+	I2400M_SS_MAX,
+};
+
+
+/**
+ * i2400m_tlv_system_state - report on the state of the system
+ *
+ * @state: see enum i2400m_system_state
+ */
+struct i2400m_tlv_system_state {
+	struct i2400m_tlv_hdr hdr;
+	__le32 state;
+} __attribute__((packed));
+
+
+struct i2400m_tlv_l4_message_versions {
+	struct i2400m_tlv_hdr hdr;
+	__le16 major;
+	__le16 minor;
+	__le16 branch;
+	__le16 reserved;
+} __attribute__((packed));
+
+
+struct i2400m_tlv_detailed_device_info {
+	struct i2400m_tlv_hdr hdr;
+	__u8 reserved1[400];
+	__u8 mac_address[ETH_ALEN];
+	__u8 reserved2[2];
+} __attribute__((packed));
+
+
+enum i2400m_rf_switch_status {
+	I2400M_RF_SWITCH_ON = 1,
+	I2400M_RF_SWITCH_OFF = 2,
+};
+
+struct i2400m_tlv_rf_switches_status {
+	struct i2400m_tlv_hdr hdr;
+	__u8 sw_rf_switch;	/* 1 ON, 2 OFF */
+	__u8 hw_rf_switch;	/* 1 ON, 2 OFF */
+	__u8 reserved[2];
+} __attribute__((packed));
+
+
+enum {
+	i2400m_rf_operation_on = 1,
+	i2400m_rf_operation_off = 2
+};
+
+struct i2400m_tlv_rf_operation {
+	struct i2400m_tlv_hdr hdr;
+	__le32 status;	/* 1 ON, 2 OFF */
+} __attribute__((packed));
+
+
+enum i2400m_tlv_reset_type {
+	I2400M_RESET_TYPE_COLD = 1,
+	I2400M_RESET_TYPE_WARM
+};
+
+struct i2400m_tlv_device_reset_type {
+	struct i2400m_tlv_hdr hdr;
+	__le32 reset_type;
+} __attribute__((packed));
+
+
+struct i2400m_tlv_config_idle_parameters {
+	struct i2400m_tlv_hdr hdr;
+	__le32 idle_timeout;	/* 100 to 300000 ms [5min], 100 increments
+				 * 0 disabled */
+	__le32 idle_paging_interval;	/* frames */
+} __attribute__((packed));
+
+
+enum i2400m_media_status {
+	I2400M_MEDIA_STATUS_LINK_UP = 1,
+	I2400M_MEDIA_STATUS_LINK_DOWN,
+	I2400M_MEDIA_STATUS_LINK_RENEW,
+};
+
+struct i2400m_tlv_media_status {
+	struct i2400m_tlv_hdr hdr;
+	__le32 media_status;
+} __attribute__((packed));
+
+
+/* New in v1.4 */
+struct i2400m_tlv_config_idle_timeout {
+	struct i2400m_tlv_hdr hdr;
+	__le32 timeout;	/* 100 to 300000 ms [5min], 100 increments
+			 * 0 disabled */
+} __attribute__((packed));
+
+/* New in v1.4 -- for backward compat, will be removed */
+struct i2400m_tlv_config_d2h_data_format {
+	struct i2400m_tlv_hdr hdr;
+	__u8 format;		/* 0 old format, 1 enhanced */
+	__u8 reserved[3];
+} __attribute__((packed));
+
+/* New in v1.4 */
+struct i2400m_tlv_config_dl_host_reorder {
+	struct i2400m_tlv_hdr hdr;
__u8 reorder;        /* 0 disabled, 1 enabled */
+        __u8 reserved[3];
+} __attribute__((packed));
+
+
+#endif /* #ifndef __LINUX__WIMAX__I2400M_H__ */
diff --git a/drivers/staging/wimax/i2400m/netdev.c b/drivers/staging/wimax/i2400m/netdev.c
new file mode 100644
index 000000000000..a7fcbceb6e6b
--- /dev/null
+++ b/drivers/staging/wimax/i2400m/netdev.c
@@ -0,0 +1,603 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel Wireless WiMAX Connection 2400m
+ * Glue with the networking stack
+ *
+ * Copyright (C) 2007 Intel Corporation
+ * Yanir Lubetkin
+ * Inaky Perez-Gonzalez
+ *
+ * This implements an ethernet device for the i2400m.
+ *
+ * We fake being an ethernet device to simplify the support from user
+ * space and from the other side. The world is (sadly) configured to
+ * take in only Ethernet devices...
+ *
+ * Because of this, when using firmwares <= v1.3, there is a
+ * copy-each-rxed-packet overhead on the RX path. Each IP packet has
+ * to be reallocated to add an ethernet header (as there is no space
+ * in what we get from the device). This is a known drawback and
+ * firmwares >= 1.4 add header space that can be used to insert the
+ * ethernet header without having to reallocate and copy.
+ *
+ * TX error handling is tricky; because we have to FIFO/queue the
+ * buffers for transmission (as the hardware likes it aggregated), we
+ * just give the skb to the TX subsystem and by the time it is
+ * transmitted, we have long forgotten about it. So we just don't care
+ * too much about it.
+ *
+ * Note that when the device is in idle mode with the basestation, we
+ * need to negotiate coming back up online. That involves negotiation
+ * and possible user space interaction. Thus, we defer to a workqueue
+ * to do all that. By default, we only queue a single packet and drop
+ * the rest, as potentially the time to go back from idle to normal is
+ * long.
+ *
+ * ROADMAP
+ *
+ * i2400m_open            Called on ifconfig up
+ * i2400m_stop            Called on ifconfig down
+ *
+ * i2400m_hard_start_xmit Called by the network stack to send a packet
+ *   i2400m_net_wake_tx   Wake up device from basestation-IDLE & TX
+ *     i2400m_wake_tx_work
+ *       i2400m_cmd_exit_idle
+ *       i2400m_tx
+ *   i2400m_net_tx        TX a data frame
+ *     i2400m_tx
+ *
+ * i2400m_change_mtu      Called on ifconfig mtu XXX
+ *
+ * i2400m_tx_timeout      Called when the device times out
+ *
+ * i2400m_net_rx          Called by the RX code when a data frame is
+ *                        available (firmware <= 1.3)
+ * i2400m_net_erx         Called by the RX code when a data frame is
+ *                        available (firmware >= 1.4).
+ * i2400m_netdev_setup    Called to set up all the netdev stuff from
+ *                        alloc_netdev.
+ */
+#include
+#include
+#include
+#include
+#include
+#include "i2400m.h"
+
+
+#define D_SUBMODULE netdev
+#include "debug-levels.h"
+
+enum {
+/* netdev interface */
+        /* 20 secs? yep, this is the maximum timeout that the device
+         * might take to get out of IDLE / negotiate it with the base
+         * station. We add 1sec for good measure. */
+        I2400M_TX_TIMEOUT = 21 * HZ,
+        /*
+         * Experimentation has determined that 20 is a good value for
+         * minimizing the jitter in the throughput.
+         */
+        I2400M_TX_QLEN = 20,
+};
+
+
+static
+int i2400m_open(struct net_device *net_dev)
+{
+        int result;
+        struct i2400m *i2400m = net_dev_to_i2400m(net_dev);
+        struct device *dev = i2400m_dev(i2400m);
+
+        d_fnstart(3, dev, "(net_dev %p [i2400m %p])\n", net_dev, i2400m);
+        /* Make sure we wait until init is complete... */
+        mutex_lock(&i2400m->init_mutex);
+        if (i2400m->updown)
+                result = 0;
+        else
+                result = -EBUSY;
+        mutex_unlock(&i2400m->init_mutex);
+        d_fnend(3, dev, "(net_dev %p [i2400m %p]) = %d\n",
+                net_dev, i2400m, result);
+        return result;
+}
+
+
+static
+int i2400m_stop(struct net_device *net_dev)
+{
+        struct i2400m *i2400m = net_dev_to_i2400m(net_dev);
+        struct device *dev = i2400m_dev(i2400m);
+
+        d_fnstart(3, dev, "(net_dev %p [i2400m %p])\n", net_dev, i2400m);
+        i2400m_net_wake_stop(i2400m);
+        d_fnend(3, dev, "(net_dev %p [i2400m %p]) = 0\n", net_dev, i2400m);
+        return 0;
+}
+
+
+/*
+ * Wake up the device and transmit a held SKB, then restart the net queue
+ *
+ * When the device goes into basestation-idle mode, we need to tell it
+ * to exit that mode; it will negotiate with the base station, user
+ * space may have to intervene to rehandshake crypto and then tell us
+ * when it is ready to transmit the packet we have "queued". Still we
+ * need to give it some time after it reports being OK.
+ *
+ * On error, there is not much we can do. If the error was on TX, we
+ * still wake the queue up to see if the next packet will be luckier.
+ *
+ * If _cmd_exit_idle() fails...well, it could be many things; most
+ * commonly it is that something else took the device out of IDLE mode
+ * (for example, the base station). In that case we get an -EILSEQ and
+ * we are just going to ignore that one. If the device is back to
+ * connected, then fine -- if it is in some other state, the packet
+ * will be dropped anyway.
+ */
+void i2400m_wake_tx_work(struct work_struct *ws)
+{
+        int result;
+        struct i2400m *i2400m = container_of(ws, struct i2400m, wake_tx_ws);
+        struct net_device *net_dev = i2400m->wimax_dev.net_dev;
+        struct device *dev = i2400m_dev(i2400m);
+        struct sk_buff *skb;
+        unsigned long flags;
+
+        spin_lock_irqsave(&i2400m->tx_lock, flags);
+        skb = i2400m->wake_tx_skb;
+        i2400m->wake_tx_skb = NULL;
+        spin_unlock_irqrestore(&i2400m->tx_lock, flags);
+
+        d_fnstart(3, dev, "(ws %p i2400m %p skb %p)\n", ws, i2400m, skb);
+        result = -EINVAL;
+        if (skb == NULL) {
+                dev_err(dev, "WAKE&TX: skb disappeared!\n");
+                goto out_put;
+        }
+        /* If we have, somehow, lost the connection after this was
+         * queued, don't do anything; this might be the device got
+         * reset or just disconnected. */
+        if (unlikely(!netif_carrier_ok(net_dev)))
+                goto out_kfree;
+        result = i2400m_cmd_exit_idle(i2400m);
+        if (result == -EILSEQ)
+                result = 0;
+        if (result < 0) {
+                dev_err(dev, "WAKE&TX: device didn't get out of idle: "
+                        "%d - resetting\n", result);
+                i2400m_reset(i2400m, I2400M_RT_BUS);
+                goto error;
+        }
+        result = wait_event_timeout(i2400m->state_wq,
+                                    i2400m->state != I2400M_SS_IDLE,
+                                    net_dev->watchdog_timeo - HZ/2);
+        if (result == 0)
+                result = -ETIMEDOUT;
+        if (result < 0) {
+                dev_err(dev, "WAKE&TX: error waiting for device to exit IDLE: "
+                        "%d - resetting\n", result);
+                i2400m_reset(i2400m, I2400M_RT_BUS);
+                goto error;
+        }
+        msleep(20);     /* device still needs some time or it drops it */
+        result = i2400m_tx(i2400m, skb->data, skb->len, I2400M_PT_DATA);
+error:
+        netif_wake_queue(net_dev);
+out_kfree:
+        kfree_skb(skb); /* refcount transferred by _hard_start_xmit() */
+out_put:
+        i2400m_put(i2400m);
+        d_fnend(3, dev, "(ws %p i2400m %p skb %p) = void [%d]\n",
+                ws, i2400m, skb, result);
+}
+
+
+/*
+ * Prepare the data payload TX header
+ *
+ * The i2400m expects a 4 byte header in front of a data packet.
+ *
+ * Because we pretend to be an ethernet device, this packet comes with
+ * an ethernet header.
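+ *
+ * Schematically, what i2400m_tx_prep_header() below does to the skb
+ * (a sketch; ETH_HLEN is 14 bytes, the i2400m data header 4):
+ *
+ *   before:  | eth header (14) | IP packet ... |
+ *   after:   | pl_hdr (4)      | IP packet ... |
+ *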
Pull it and push our header. + */ +static +void i2400m_tx_prep_header(struct sk_buff *skb) +{ + struct i2400m_pl_data_hdr *pl_hdr; + skb_pull(skb, ETH_HLEN); + pl_hdr = skb_push(skb, sizeof(*pl_hdr)); + pl_hdr->reserved = 0; +} + + + +/* + * Cleanup resources acquired during i2400m_net_wake_tx() + * + * This is called by __i2400m_dev_stop and means we have to make sure + * the workqueue is flushed from any pending work. + */ +void i2400m_net_wake_stop(struct i2400m *i2400m) +{ + struct device *dev = i2400m_dev(i2400m); + struct sk_buff *wake_tx_skb; + unsigned long flags; + + d_fnstart(3, dev, "(i2400m %p)\n", i2400m); + /* + * See i2400m_hard_start_xmit(), references are taken there and + * here we release them if the packet was still pending. + */ + cancel_work_sync(&i2400m->wake_tx_ws); + + spin_lock_irqsave(&i2400m->tx_lock, flags); + wake_tx_skb = i2400m->wake_tx_skb; + i2400m->wake_tx_skb = NULL; + spin_unlock_irqrestore(&i2400m->tx_lock, flags); + + if (wake_tx_skb) { + i2400m_put(i2400m); + kfree_skb(wake_tx_skb); + } + + d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); +} + + +/* + * TX an skb to an idle device + * + * When the device is in basestation-idle mode, we need to wake it up + * and then TX. So we queue a work_struct for doing so. + * + * We need to get an extra ref for the skb (so it is not dropped), as + * well as be careful not to queue more than one request (won't help + * at all). If more than one request comes or there are errors, we + * just drop the packets (see i2400m_hard_start_xmit()). + */ +static +int i2400m_net_wake_tx(struct i2400m *i2400m, struct net_device *net_dev, + struct sk_buff *skb) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + unsigned long flags; + + d_fnstart(3, dev, "(skb %p net_dev %p)\n", skb, net_dev); + if (net_ratelimit()) { + d_printf(3, dev, "WAKE&NETTX: " + "skb %p sending %d bytes to radio\n", + skb, skb->len); + d_dump(4, dev, skb->data, skb->len); + } + /* We hold a ref count for i2400m and skb, so when + * stopping() the device, we need to cancel that work + * and if pending, release those resources. */ + result = 0; + spin_lock_irqsave(&i2400m->tx_lock, flags); + if (!i2400m->wake_tx_skb) { + netif_stop_queue(net_dev); + i2400m_get(i2400m); + i2400m->wake_tx_skb = skb_get(skb); /* transfer ref count */ + i2400m_tx_prep_header(skb); + result = schedule_work(&i2400m->wake_tx_ws); + WARN_ON(result == 0); + } + spin_unlock_irqrestore(&i2400m->tx_lock, flags); + if (result == 0) { + /* Yes, this happens even if we stopped the + * queue -- blame the queue disciplines that + * queue without looking -- I guess there is a reason + * for that. */ + if (net_ratelimit()) + d_printf(1, dev, "NETTX: device exiting idle, " + "dropping skb %p, queue running %d\n", + skb, netif_queue_stopped(net_dev)); + result = -EBUSY; + } + d_fnend(3, dev, "(skb %p net_dev %p) = %d\n", skb, net_dev, result); + return result; +} + + +/* + * Transmit a packet to the base station on behalf of the network stack. + * + * Returns: 0 if ok, < 0 errno code on error. + * + * We need to pull the ethernet header and add the hardware header, + * which is currently set to all zeroes and reserved. 
+ */ +static +int i2400m_net_tx(struct i2400m *i2400m, struct net_device *net_dev, + struct sk_buff *skb) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + + d_fnstart(3, dev, "(i2400m %p net_dev %p skb %p)\n", + i2400m, net_dev, skb); + /* FIXME: check eth hdr, only IPv4 is routed by the device as of now */ + netif_trans_update(net_dev); + i2400m_tx_prep_header(skb); + d_printf(3, dev, "NETTX: skb %p sending %d bytes to radio\n", + skb, skb->len); + d_dump(4, dev, skb->data, skb->len); + result = i2400m_tx(i2400m, skb->data, skb->len, I2400M_PT_DATA); + d_fnend(3, dev, "(i2400m %p net_dev %p skb %p) = %d\n", + i2400m, net_dev, skb, result); + return result; +} + + +/* + * Transmit a packet to the base station on behalf of the network stack + * + * + * Returns: NETDEV_TX_OK (always, even in case of error) + * + * In case of error, we just drop it. Reasons: + * + * - we add a hw header to each skb, and if the network stack + * retries, we have no way to know if that skb has it or not. + * + * - network protocols have their own drop-recovery mechanisms + * + * - there is not much else we can do + * + * If the device is idle, we need to wake it up; that is an operation + * that will sleep. See i2400m_net_wake_tx() for details. + */ +static +netdev_tx_t i2400m_hard_start_xmit(struct sk_buff *skb, + struct net_device *net_dev) +{ + struct i2400m *i2400m = net_dev_to_i2400m(net_dev); + struct device *dev = i2400m_dev(i2400m); + int result = -1; + + d_fnstart(3, dev, "(skb %p net_dev %p)\n", skb, net_dev); + + if (skb_cow_head(skb, 0)) + goto drop; + + if (i2400m->state == I2400M_SS_IDLE) + result = i2400m_net_wake_tx(i2400m, net_dev, skb); + else + result = i2400m_net_tx(i2400m, net_dev, skb); + if (result < 0) { +drop: + net_dev->stats.tx_dropped++; + } else { + net_dev->stats.tx_packets++; + net_dev->stats.tx_bytes += skb->len; + } + dev_kfree_skb(skb); + d_fnend(3, dev, "(skb %p net_dev %p) = %d\n", skb, net_dev, result); + return NETDEV_TX_OK; +} + + +static +void i2400m_tx_timeout(struct net_device *net_dev, unsigned int txqueue) +{ + /* + * We might want to kick the device + * + * There is not much we can do though, as the device requires + * that we send the data aggregated. By the time we receive + * this, there might be data pending to be sent or not... + */ + net_dev->stats.tx_errors++; +} + + +/* + * Create a fake ethernet header + * + * For emulating an ethernet device, every received IP header has to + * be prefixed with an ethernet header. Fake it with the given + * protocol. + */ +static +void i2400m_rx_fake_eth_header(struct net_device *net_dev, + void *_eth_hdr, __be16 protocol) +{ + struct i2400m *i2400m = net_dev_to_i2400m(net_dev); + struct ethhdr *eth_hdr = _eth_hdr; + + memcpy(eth_hdr->h_dest, net_dev->dev_addr, sizeof(eth_hdr->h_dest)); + memcpy(eth_hdr->h_source, i2400m->src_mac_addr, + sizeof(eth_hdr->h_source)); + eth_hdr->h_proto = protocol; +} + + +/* + * i2400m_net_rx - pass a network packet to the stack + * + * @i2400m: device instance + * @skb_rx: the skb where the buffer pointed to by @buf is + * @i: 1 if payload is the only one + * @buf: pointer to the buffer containing the data + * @len: buffer's length + * + * This is only used now for the v1.3 firmware. It will be deprecated + * in >= 2.6.31. + * + * Note that due to firmware limitations, we don't have space to add + * an ethernet header, so we need to copy each packet. Firmware + * versions >= v1.4 fix this [see i2400m_net_erx()]. 
+ *
+ * We just clone the skb and set it up so that its skb->data pointer
+ * points to "buf" and its length.
+ *
+ * Note that if the payload is the last (or the only one) in a
+ * multi-payload message, we don't clone the SKB but just reuse it.
+ *
+ * This function is normally run from a thread context. However, we
+ * still use netif_rx() instead of netif_receive_skb() as was
+ * recommended in the mailing list. The reason is that in some stress
+ * tests when sending/receiving a lot of data we seem to hit a softlock
+ * in the kernel's TCP implementation [around tcp_delay_timer()]. Using
+ * netif_rx() took care of the issue.
+ *
+ * This is, of course, still open to do more research on why running
+ * with netif_receive_skb() hits this softlock. FIXME.
+ *
+ * FIXME: currently we don't make any effort at distinguishing if what
+ * we got was an IPv4 or IPv6 header, to set up the protocol field
+ * correctly.
+ */
+void i2400m_net_rx(struct i2400m *i2400m, struct sk_buff *skb_rx,
+                   unsigned i, const void *buf, int buf_len)
+{
+        struct net_device *net_dev = i2400m->wimax_dev.net_dev;
+        struct device *dev = i2400m_dev(i2400m);
+        struct sk_buff *skb;
+
+        d_fnstart(2, dev, "(i2400m %p buf %p buf_len %d)\n",
+                  i2400m, buf, buf_len);
+        if (i) {
+                skb = skb_get(skb_rx);
+                d_printf(2, dev, "RX: reusing first payload skb %p\n", skb);
+                skb_pull(skb, buf - (void *) skb->data);
+                skb_trim(skb, (void *) skb_end_pointer(skb) - buf);
+        } else {
+                /* Yes, this is bad -- a lot of overhead -- see
+                 * comments at the top of the file */
+                skb = __netdev_alloc_skb(net_dev, buf_len, GFP_KERNEL);
+                if (skb == NULL) {
+                        dev_err(dev, "NETRX: no memory to realloc skb\n");
+                        net_dev->stats.rx_dropped++;
+                        goto error_skb_realloc;
+                }
+                skb_put_data(skb, buf, buf_len);
+        }
+        i2400m_rx_fake_eth_header(i2400m->wimax_dev.net_dev,
+                                  skb->data - ETH_HLEN,
+                                  cpu_to_be16(ETH_P_IP));
+        skb_set_mac_header(skb, -ETH_HLEN);
+        skb->dev = i2400m->wimax_dev.net_dev;
+        skb->protocol = htons(ETH_P_IP);
+        net_dev->stats.rx_packets++;
+        net_dev->stats.rx_bytes += buf_len;
+        d_printf(3, dev, "NETRX: receiving %d bytes to network stack\n",
+                 buf_len);
+        d_dump(4, dev, buf, buf_len);
+        netif_rx_ni(skb);       /* see notes in function header */
+error_skb_realloc:
+        d_fnend(2, dev, "(i2400m %p buf %p buf_len %d) = void\n",
+                i2400m, buf, buf_len);
+}
+
+
+/*
+ * i2400m_net_erx - pass a network packet to the stack (extended version)
+ *
+ * @i2400m: device descriptor
+ * @skb: the skb where the packet is - the skb should be set to point
+ *     at the IP packet; this function will add ethernet headers if
+ *     needed.
+ * @cs: packet type
+ *
+ * This is only used now for firmware >= v1.4. Note it is quite
+ * similar to i2400m_net_rx() (used only for v1.3 firmware).
+ *
+ * This function is normally run from a thread context. However, we
+ * still use netif_rx() instead of netif_receive_skb() as was
+ * recommended in the mailing list. The reason is that in some stress
+ * tests when sending/receiving a lot of data we seem to hit a softlock
+ * in the kernel's TCP implementation [around tcp_delay_timer()]. Using
+ * netif_rx() took care of the issue.
+ *
+ * This is, of course, still open to do more research on why running
+ * with netif_receive_skb() hits this softlock. FIXME.
+ */
+void i2400m_net_erx(struct i2400m *i2400m, struct sk_buff *skb,
+                    enum i2400m_cs cs)
+{
+        struct net_device *net_dev = i2400m->wimax_dev.net_dev;
+        struct device *dev = i2400m_dev(i2400m);
+
+        d_fnstart(2, dev, "(i2400m %p skb %p [%u] cs %d)\n",
+                  i2400m, skb, skb->len, cs);
+        switch(cs) {
+        case I2400M_CS_IPV4_0:
+        case I2400M_CS_IPV4:
+                i2400m_rx_fake_eth_header(i2400m->wimax_dev.net_dev,
+                                          skb->data - ETH_HLEN,
+                                          cpu_to_be16(ETH_P_IP));
+                skb_set_mac_header(skb, -ETH_HLEN);
+                skb->dev = i2400m->wimax_dev.net_dev;
+                skb->protocol = htons(ETH_P_IP);
+                net_dev->stats.rx_packets++;
+                net_dev->stats.rx_bytes += skb->len;
+                break;
+        default:
+                dev_err(dev, "ERX: BUG? CS type %u unsupported\n", cs);
+                goto error;
+
+        }
+        d_printf(3, dev, "ERX: receiving %d bytes to the network stack\n",
+                 skb->len);
+        d_dump(4, dev, skb->data, skb->len);
+        netif_rx_ni(skb);       /* see notes in function header */
+error:
+        d_fnend(2, dev, "(i2400m %p skb %p [%u] cs %d) = void\n",
+                i2400m, skb, skb->len, cs);
+}
+
+static const struct net_device_ops i2400m_netdev_ops = {
+        .ndo_open = i2400m_open,
+        .ndo_stop = i2400m_stop,
+        .ndo_start_xmit = i2400m_hard_start_xmit,
+        .ndo_tx_timeout = i2400m_tx_timeout,
+};
+
+static void i2400m_get_drvinfo(struct net_device *net_dev,
+                               struct ethtool_drvinfo *info)
+{
+        struct i2400m *i2400m = net_dev_to_i2400m(net_dev);
+
+        strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
+        strlcpy(info->fw_version, i2400m->fw_name ? : "",
+                sizeof(info->fw_version));
+        if (net_dev->dev.parent)
+                strlcpy(info->bus_info, dev_name(net_dev->dev.parent),
+                        sizeof(info->bus_info));
+}
+
+static const struct ethtool_ops i2400m_ethtool_ops = {
+        .get_drvinfo = i2400m_get_drvinfo,
+        .get_link = ethtool_op_get_link,
+};
+
+/**
+ * i2400m_netdev_setup - Set up @net_dev's i2400m private data
+ *
+ * Called by alloc_netdev()
+ */
+void i2400m_netdev_setup(struct net_device *net_dev)
+{
+        d_fnstart(3, NULL, "(net_dev %p)\n", net_dev);
+        ether_setup(net_dev);
+        net_dev->mtu = I2400M_MAX_MTU;
+        net_dev->min_mtu = 0;
+        net_dev->max_mtu = I2400M_MAX_MTU;
+        net_dev->tx_queue_len = I2400M_TX_QLEN;
+        net_dev->features =
+                NETIF_F_VLAN_CHALLENGED
+                | NETIF_F_HIGHDMA;
+        net_dev->flags =
+                IFF_NOARP               /* i2400m is a pure IP device */
+                & (~IFF_BROADCAST       /* i2400m is P2P */
+                   & ~IFF_MULTICAST);
+        net_dev->watchdog_timeo = I2400M_TX_TIMEOUT;
+        net_dev->netdev_ops = &i2400m_netdev_ops;
+        net_dev->ethtool_ops = &i2400m_ethtool_ops;
+        d_fnend(3, NULL, "(net_dev %p) = void\n", net_dev);
+}
+EXPORT_SYMBOL_GPL(i2400m_netdev_setup);
+
diff --git a/drivers/staging/wimax/i2400m/op-rfkill.c b/drivers/staging/wimax/i2400m/op-rfkill.c
new file mode 100644
index 000000000000..fbddf2e18c14
--- /dev/null
+++ b/drivers/staging/wimax/i2400m/op-rfkill.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel Wireless WiMAX Connection 2400m
+ * Implement backend for the WiMAX stack rfkill support
+ *
+ * Copyright (C) 2007-2008 Intel Corporation
+ * Inaky Perez-Gonzalez
+ *
+ * The WiMAX kernel stack integrates into RF-Kill and keeps the
+ * switches' status. We just need to:
+ *
+ * - report changes in the HW RF Kill switch [with
+ *   wimax_rfkill_{sw,hw}_report(), which happens when we detect those
+ *   indications coming through hardware reports]. We also do it on
+ *   initialization to let the stack know the initial HW state.
+ *
+ * - implement indications from the stack to change the SW RF Kill
+ *   switch (coming from sysfs, the wimax stack or user space).
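+ *
+ * For instance, pushing the initial state to the stack would look
+ * roughly like this (a sketch; hw_state/sw_state stand for whatever
+ * was decoded from the device's report):
+ *
+ *   wimax_report_rfkill_hw(&i2400m->wimax_dev, hw_state);
+ *   wimax_report_rfkill_sw(&i2400m->wimax_dev, sw_state);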
+ */
+#include "i2400m.h"
+#include "linux-wimax-i2400m.h"
+#include
+
+
+
+#define D_SUBMODULE rfkill
+#include "debug-levels.h"
+
+/*
+ * Return true if the i2400m radio is in the requested wimax_rf_state state
+ */
+static
+int i2400m_radio_is(struct i2400m *i2400m, enum wimax_rf_state state)
+{
+        if (state == WIMAX_RF_OFF)
+                return i2400m->state == I2400M_SS_RF_OFF
+                        || i2400m->state == I2400M_SS_RF_SHUTDOWN;
+        else if (state == WIMAX_RF_ON)
+                /* state == WIMAX_RF_ON */
+                return i2400m->state != I2400M_SS_RF_OFF
+                        && i2400m->state != I2400M_SS_RF_SHUTDOWN;
+        else {
+                BUG();
+                return -EINVAL; /* shut gcc warnings on certain arches */
+        }
+}
+
+
+/*
+ * WiMAX stack operation: implement SW RFKill toggling
+ *
+ * @wimax_dev: device descriptor
+ * @skb: skb where the message has been received; skb->data is
+ *       expected to point to the message payload.
+ * @genl_info: passed by the generic netlink layer
+ *
+ * Generic Netlink will call this function when a message is sent from
+ * userspace to change the software RF-Kill switch status.
+ *
+ * This function will set the device's software RF-Kill switch state to
+ * match what is requested.
+ *
+ * NOTE: the i2400m has a strict state machine; we can only set the
+ *       RF-Kill switch when it is on, the HW RF-Kill is on and the
+ *       device is initialized. So we ignore errors stemming from not
+ *       being in the right state (-EILSEQ).
+ */
+int i2400m_op_rfkill_sw_toggle(struct wimax_dev *wimax_dev,
+                               enum wimax_rf_state state)
+{
+        int result;
+        struct i2400m *i2400m = wimax_dev_to_i2400m(wimax_dev);
+        struct device *dev = i2400m_dev(i2400m);
+        struct sk_buff *ack_skb;
+        struct {
+                struct i2400m_l3l4_hdr hdr;
+                struct i2400m_tlv_rf_operation sw_rf;
+        } __packed *cmd;
+        char strerr[32];
+
+        d_fnstart(4, dev, "(wimax_dev %p state %d)\n", wimax_dev, state);
+
+        result = -ENOMEM;
+        cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+        if (cmd == NULL)
+                goto error_alloc;
+        cmd->hdr.type = cpu_to_le16(I2400M_MT_CMD_RF_CONTROL);
+        cmd->hdr.length = cpu_to_le16(sizeof(cmd->sw_rf));
+        cmd->hdr.version = cpu_to_le16(I2400M_L3L4_VERSION);
+        cmd->sw_rf.hdr.type = cpu_to_le16(I2400M_TLV_RF_OPERATION);
+        cmd->sw_rf.hdr.length = cpu_to_le16(sizeof(cmd->sw_rf.status));
+        switch (state) {
+        case WIMAX_RF_OFF:      /* RFKILL ON, radio OFF */
+                cmd->sw_rf.status = cpu_to_le32(2);
+                break;
+        case WIMAX_RF_ON:       /* RFKILL OFF, radio ON */
+                cmd->sw_rf.status = cpu_to_le32(1);
+                break;
+        default:
+                BUG();
+        }
+
+        ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd));
+        result = PTR_ERR(ack_skb);
+        if (IS_ERR(ack_skb)) {
+                dev_err(dev, "Failed to issue 'RF Control' command: %d\n",
+                        result);
+                goto error_msg_to_dev;
+        }
+        result = i2400m_msg_check_status(wimax_msg_data(ack_skb),
+                                         strerr, sizeof(strerr));
+        if (result < 0) {
+                dev_err(dev, "'RF Control' (0x%04x) command failed: %d - %s\n",
+                        I2400M_MT_CMD_RF_CONTROL, result, strerr);
+                goto error_cmd;
+        }
+
+        /* Now we wait for the state to change to RADIO_OFF or RADIO_ON */
+        result = wait_event_timeout(
+                i2400m->state_wq, i2400m_radio_is(i2400m, state),
+                5 * HZ);
+        if (result == 0)
+                result = -ETIMEDOUT;
+        if (result < 0)
+                dev_err(dev, "Error waiting for device to toggle RF state: "
+                        "%d\n", result);
+        result = 0;
+error_cmd:
+        kfree_skb(ack_skb);
+error_msg_to_dev:
+error_alloc:
+        d_fnend(4, dev, "(wimax_dev %p state %d) = %d\n",
+                wimax_dev, state, result);
+        kfree(cmd);
+        return result;
+}
+
+
+/*
+ * Inform the WiMAX stack of changes in the RF Kill switches reported
+ * by the device
+ *
+ * @i2400m: device descriptor
+ * @rfss: TLV for RF Switches status; already validated
+ *
+ * NOTE: the reports on RF switch status cannot be trusted
+ *       or used until the device is in a state of RADIO_OFF
+ *       or greater.
+ */
+void i2400m_report_tlv_rf_switches_status(
+        struct i2400m *i2400m,
+        const struct i2400m_tlv_rf_switches_status *rfss)
+{
+        struct device *dev = i2400m_dev(i2400m);
+        enum i2400m_rf_switch_status hw, sw;
+        enum wimax_st wimax_state;
+
+        sw = le32_to_cpu(rfss->sw_rf_switch);
+        hw = le32_to_cpu(rfss->hw_rf_switch);
+
+        d_fnstart(3, dev, "(i2400m %p rfss %p [hw %u sw %u])\n",
+                  i2400m, rfss, hw, sw);
+        /* We only process RF switch events when the device has been
+         * fully initialized */
+        wimax_state = wimax_state_get(&i2400m->wimax_dev);
+        if (wimax_state < WIMAX_ST_RADIO_OFF) {
+                d_printf(3, dev, "ignoring RF switches report, state %u\n",
+                         wimax_state);
+                goto out;
+        }
+        switch (sw) {
+        case I2400M_RF_SWITCH_ON:       /* RF Kill disabled (radio on) */
+                wimax_report_rfkill_sw(&i2400m->wimax_dev, WIMAX_RF_ON);
+                break;
+        case I2400M_RF_SWITCH_OFF:      /* RF Kill enabled (radio off) */
+                wimax_report_rfkill_sw(&i2400m->wimax_dev, WIMAX_RF_OFF);
+                break;
+        default:
+                dev_err(dev, "HW BUG? Unknown RF SW state 0x%x\n", sw);
+        }
+
+        switch (hw) {
+        case I2400M_RF_SWITCH_ON:       /* RF Kill disabled (radio on) */
+                wimax_report_rfkill_hw(&i2400m->wimax_dev, WIMAX_RF_ON);
+                break;
+        case I2400M_RF_SWITCH_OFF:      /* RF Kill enabled (radio off) */
+                wimax_report_rfkill_hw(&i2400m->wimax_dev, WIMAX_RF_OFF);
+                break;
+        default:
+                dev_err(dev, "HW BUG? Unknown RF HW state 0x%x\n", hw);
+        }
+out:
+        d_fnend(3, dev, "(i2400m %p rfss %p [hw %u sw %u]) = void\n",
+                i2400m, rfss, hw, sw);
+}
diff --git a/drivers/staging/wimax/i2400m/rx.c b/drivers/staging/wimax/i2400m/rx.c
new file mode 100644
index 000000000000..c9fb619a9e01
--- /dev/null
+++ b/drivers/staging/wimax/i2400m/rx.c
@@ -0,0 +1,1395 @@
+/*
+ * Intel Wireless WiMAX Connection 2400m
+ * Handle incoming traffic and deliver it to the control or data planes
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * Intel Corporation
+ * Yanir Lubetkin
+ *  - Initial implementation
+ * Inaky Perez-Gonzalez
+ *  - Use skb_clone(), break up processing in chunks
+ *  - Split transport/device specific
+ *  - Make buffer size dynamic to exert less memory pressure
+ *  - RX reorder support
+ *
+ * This handles the RX path.
+ *
+ * We receive an RX message from the bus-specific driver, which
+ * contains one or more payloads that have potentially different
+ * destinations (data or control paths).
+ *
+ * So we just take that payload from the transport specific code in
+ * the form of an skb, break it up in chunks (a cloned skb each in the
+ * case of network packets) and pass it to netdev or to the
+ * command/ack handler (and from there to the WiMAX stack).
+ *
+ * PROTOCOL FORMAT
+ *
+ * The format of the buffer is:
+ *
+ * HEADER                  (struct i2400m_msg_hdr)
+ * PAYLOAD DESCRIPTOR 0    (struct i2400m_pld)
+ * PAYLOAD DESCRIPTOR 1
+ * ...
+ * PAYLOAD DESCRIPTOR N
+ * PAYLOAD 0               (raw bytes)
+ * PAYLOAD 1
+ * ...
+ * PAYLOAD N
+ *
+ * See tx.c for a deeper description on alignment requirements and
+ * other fun facts of it.
+ *
+ * DATA PACKETS
+ *
+ * In firmwares <= v1.3, data packets have no header for RX, but they
+ * do for TX (currently unused).
+ *
+ * In firmware >= 1.4, RX packets have an extended header (16
+ * bytes). This header conveys information for management of host
+ * reordering of packets (the device offloads storage of the packets
+ * for reordering to the host). Read below for more information.
+ *
+ * The header is used as dummy space to emulate an ethernet header and
+ * thus be able to act as an ethernet device without having to reallocate.
+ *
+ * DATA RX REORDERING
+ *
+ * Starting in firmware v1.4, the device can deliver packets for
+ * delivery with special reordering information; this allows it to
+ * more effectively do packet management when some frames were lost in
+ * the radio traffic.
+ *
+ * Thus, for RX packets that come out of order, the device gives the
+ * driver enough information to queue them properly and then at some
+ * point, the signal to deliver the whole (or part) of the queued
+ * packets to the networking stack. There are 16 such queues.
+ *
+ * This only happens when a packet comes in with the "need reorder"
+ * flag set in the RX header. When that bit is set, the following
+ * operations might be indicated:
+ *
+ * - reset queue: send all queued packets to the OS
+ *
+ * - queue: queue a packet
+ *
+ * - update ws: update the queue's window start and deliver queued
+ *   packets that meet the criteria
+ *
+ * - queue & update ws: queue a packet, update the window start and
+ *   deliver queued packets that meet the criteria
+ *
+ * (delivery criteria: the packet's [normalized] sequence number is
+ * lower than the new [normalized] window start; a worked example
+ * follows below).
+ *
+ * See the i2400m_roq_*() functions for details.
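+ *
+ * Sequence numbers wrap at 2048; normalization is
+ * nsn = (sn - ws) mod 2048 [see __i2400m_roq_nsn()]. For example,
+ * with ws = 2040, a queued packet with sn = 2047 normalizes to
+ * nsn = 7 and one with sn = 5 to nsn = 13, so both sort (and are
+ * delivered) in the right order despite the wrap of the raw
+ * sequence numbers.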
+ * + * ROADMAP + * + * i2400m_rx + * i2400m_rx_msg_hdr_check + * i2400m_rx_pl_descr_check + * i2400m_rx_payload + * i2400m_net_rx + * i2400m_rx_edata + * i2400m_net_erx + * i2400m_roq_reset + * i2400m_net_erx + * i2400m_roq_queue + * __i2400m_roq_queue + * i2400m_roq_update_ws + * __i2400m_roq_update_ws + * i2400m_net_erx + * i2400m_roq_queue_update_ws + * __i2400m_roq_queue + * __i2400m_roq_update_ws + * i2400m_net_erx + * i2400m_rx_ctl + * i2400m_msg_size_check + * i2400m_report_hook_work [in a workqueue] + * i2400m_report_hook + * wimax_msg_to_user + * i2400m_rx_ctl_ack + * wimax_msg_to_user_alloc + * i2400m_rx_trace + * i2400m_msg_size_check + * wimax_msg + */ +#include +#include +#include +#include +#include +#include +#include +#include "i2400m.h" + + +#define D_SUBMODULE rx +#include "debug-levels.h" + +static int i2400m_rx_reorder_disabled; /* 0 (rx reorder enabled) by default */ +module_param_named(rx_reorder_disabled, i2400m_rx_reorder_disabled, int, 0644); +MODULE_PARM_DESC(rx_reorder_disabled, + "If true, RX reordering will be disabled."); + +struct i2400m_report_hook_args { + struct sk_buff *skb_rx; + const struct i2400m_l3l4_hdr *l3l4_hdr; + size_t size; + struct list_head list_node; +}; + + +/* + * Execute i2400m_report_hook in a workqueue + * + * Goes over the list of queued reports in i2400m->rx_reports and + * processes them. + * + * NOTE: refcounts on i2400m are not needed because we flush the + * workqueue this runs on (i2400m->work_queue) before destroying + * i2400m. + */ +void i2400m_report_hook_work(struct work_struct *ws) +{ + struct i2400m *i2400m = container_of(ws, struct i2400m, rx_report_ws); + struct device *dev = i2400m_dev(i2400m); + struct i2400m_report_hook_args *args, *args_next; + LIST_HEAD(list); + unsigned long flags; + + while (1) { + spin_lock_irqsave(&i2400m->rx_lock, flags); + list_splice_init(&i2400m->rx_reports, &list); + spin_unlock_irqrestore(&i2400m->rx_lock, flags); + if (list_empty(&list)) + break; + else + d_printf(1, dev, "processing queued reports\n"); + list_for_each_entry_safe(args, args_next, &list, list_node) { + d_printf(2, dev, "processing queued report %p\n", args); + i2400m_report_hook(i2400m, args->l3l4_hdr, args->size); + kfree_skb(args->skb_rx); + list_del(&args->list_node); + kfree(args); + } + } +} + + +/* + * Flush the list of queued reports + */ +static +void i2400m_report_hook_flush(struct i2400m *i2400m) +{ + struct device *dev = i2400m_dev(i2400m); + struct i2400m_report_hook_args *args, *args_next; + LIST_HEAD(list); + unsigned long flags; + + d_printf(1, dev, "flushing queued reports\n"); + spin_lock_irqsave(&i2400m->rx_lock, flags); + list_splice_init(&i2400m->rx_reports, &list); + spin_unlock_irqrestore(&i2400m->rx_lock, flags); + list_for_each_entry_safe(args, args_next, &list, list_node) { + d_printf(2, dev, "flushing queued report %p\n", args); + kfree_skb(args->skb_rx); + list_del(&args->list_node); + kfree(args); + } +} + + +/* + * Queue a report for later processing + * + * @i2400m: device descriptor + * @skb_rx: skb that contains the payload (for reference counting) + * @l3l4_hdr: pointer to the control + * @size: size of the message + */ +static +void i2400m_report_hook_queue(struct i2400m *i2400m, struct sk_buff *skb_rx, + const void *l3l4_hdr, size_t size) +{ + struct device *dev = i2400m_dev(i2400m); + unsigned long flags; + struct i2400m_report_hook_args *args; + + args = kzalloc(sizeof(*args), GFP_NOIO); + if (args) { + args->skb_rx = skb_get(skb_rx); + args->l3l4_hdr = l3l4_hdr; + args->size = size; + 
spin_lock_irqsave(&i2400m->rx_lock, flags);
+                list_add_tail(&args->list_node, &i2400m->rx_reports);
+                spin_unlock_irqrestore(&i2400m->rx_lock, flags);
+                d_printf(2, dev, "queued report %p\n", args);
+                rmb();          /* see i2400m->ready's documentation */
+                if (likely(i2400m->ready))      /* only send if up */
+                        queue_work(i2400m->work_queue, &i2400m->rx_report_ws);
+        } else {
+                if (printk_ratelimit())
+                        dev_err(dev, "%s:%u: Can't allocate %zu B\n",
+                                __func__, __LINE__, sizeof(*args));
+        }
+}
+
+
+/*
+ * Process an ack to a command
+ *
+ * @i2400m: device descriptor
+ * @payload: pointer to message
+ * @size: size of the message
+ *
+ * Pass the acknowledgment (in an skb) to the thread that is waiting
+ * for it in i2400m->msg_completion.
+ *
+ * We need to coordinate properly with the thread waiting for the
+ * ack. Check if it is waiting or if it is gone. We release the
+ * spinlock to avoid allocating on atomic contexts (yeah, could use
+ * GFP_ATOMIC, but this is not so speed critical).
+ */
+static
+void i2400m_rx_ctl_ack(struct i2400m *i2400m,
+                       const void *payload, size_t size)
+{
+        struct device *dev = i2400m_dev(i2400m);
+        struct wimax_dev *wimax_dev = &i2400m->wimax_dev;
+        unsigned long flags;
+        struct sk_buff *ack_skb;
+
+        /* Anyone waiting for an answer? */
+        spin_lock_irqsave(&i2400m->rx_lock, flags);
+        if (i2400m->ack_skb != ERR_PTR(-EINPROGRESS)) {
+                dev_err(dev, "Huh? reply to command with no waiters\n");
+                goto error_no_waiter;
+        }
+        spin_unlock_irqrestore(&i2400m->rx_lock, flags);
+
+        ack_skb = wimax_msg_alloc(wimax_dev, NULL, payload, size, GFP_KERNEL);
+
+        /* Check waiter didn't time out waiting for the answer... */
+        spin_lock_irqsave(&i2400m->rx_lock, flags);
+        if (i2400m->ack_skb != ERR_PTR(-EINPROGRESS)) {
+                d_printf(1, dev, "Huh? waiter for command reply cancelled\n");
+                goto error_waiter_cancelled;
+        }
+        if (IS_ERR(ack_skb))
+                dev_err(dev, "CMD/GET/SET ack: cannot allocate SKB\n");
+        i2400m->ack_skb = ack_skb;
+        spin_unlock_irqrestore(&i2400m->rx_lock, flags);
+        complete(&i2400m->msg_completion);
+        return;
+
+error_waiter_cancelled:
+        if (!IS_ERR(ack_skb))
+                kfree_skb(ack_skb);
+error_no_waiter:
+        spin_unlock_irqrestore(&i2400m->rx_lock, flags);
+}
+
+
+/*
+ * Receive and process a control payload
+ *
+ * @i2400m: device descriptor
+ * @skb_rx: skb that contains the payload (for reference counting)
+ * @payload: pointer to message
+ * @size: size of the message
+ *
+ * There are two types of control RX messages: reports (asynchronous,
+ * like your everyday interrupts) and 'acks' (responses to a command,
+ * get or set request).
+ *
+ * If it is a report, we run hooks on it (to extract information for
+ * things we need to do in the driver) and then pass it over to the
+ * WiMAX stack to send it to user space.
+ *
+ * NOTE: report processing is done in a workqueue specific to the
+ * generic driver, to avoid deadlocks in the system.
+ *
+ * If it is not a report, it is an ack to a previously executed
+ * command, set or get, so wake up whoever is waiting for it from
+ * i2400m_msg_to_dev(). i2400m_rx_ctl_ack() takes care of that.
+ *
+ * Note that the sizes we pass to other functions from here are the
+ * sizes of the _l3l4_hdr + payload, not full buffer sizes, as we have
+ * verified in _msg_size_check() that they are congruent.
+ *
+ * For reports: We can't clone the original skb where the data is
+ * because we need to send this up via netlink; netlink has to add
+ * headers and we can't overwrite what's preceding the payload...as
+ * it is another message. So we just dup them.
+ */
+static
+void i2400m_rx_ctl(struct i2400m *i2400m, struct sk_buff *skb_rx,
+                   const void *payload, size_t size)
+{
+        int result;
+        struct device *dev = i2400m_dev(i2400m);
+        const struct i2400m_l3l4_hdr *l3l4_hdr = payload;
+        unsigned msg_type;
+
+        result = i2400m_msg_size_check(i2400m, l3l4_hdr, size);
+        if (result < 0) {
+                dev_err(dev, "HW BUG? device sent a bad message: %d\n",
+                        result);
+                goto error_check;
+        }
+        msg_type = le16_to_cpu(l3l4_hdr->type);
+        d_printf(1, dev, "%s 0x%04x: %zu bytes\n",
+                 msg_type & I2400M_MT_REPORT_MASK ? "REPORT" : "CMD/SET/GET",
+                 msg_type, size);
+        d_dump(2, dev, l3l4_hdr, size);
+        if (msg_type & I2400M_MT_REPORT_MASK) {
+                /*
+                 * Process each report
+                 *
+                 * - has to be run serialized as well
+                 *
+                 * - the handling might force the execution of
+                 *   commands. That might cause reentrancy issues with
+                 *   bus-specific subdrivers and workqueues, so we run
+                 *   it in a separate workqueue.
+                 *
+                 * - when the driver is not yet ready to handle them,
+                 *   they are queued and at some point the queue is
+                 *   restarted [NOTE: we can't queue SKBs directly, as
+                 *   this might be a piece of a SKB, not the whole
+                 *   thing, and this is cheaper than cloning the
+                 *   SKB].
+                 *
+                 * Note we don't do refcounting for the device
+                 * structure; this is because before destroying
+                 * 'i2400m', we make sure to flush the
+                 * i2400m->work_queue, so there are no issues.
+                 */
+                i2400m_report_hook_queue(i2400m, skb_rx, l3l4_hdr, size);
+                if (unlikely(i2400m->trace_msg_from_user))
+                        wimax_msg(&i2400m->wimax_dev, "echo",
+                                  l3l4_hdr, size, GFP_KERNEL);
+                result = wimax_msg(&i2400m->wimax_dev, NULL, l3l4_hdr, size,
+                                   GFP_KERNEL);
+                if (result < 0)
+                        dev_err(dev, "error sending report to userspace: %d\n",
+                                result);
+        } else          /* an ack to a CMD, GET or SET */
+                i2400m_rx_ctl_ack(i2400m, payload, size);
error_check:
+        return;
+}
+
+
+/*
+ * Receive and send up a trace
+ *
+ * @i2400m: device descriptor
+ * @skb_rx: skb that contains the trace (for reference counting)
+ * @payload: pointer to trace message inside the skb
+ * @size: size of the message
+ *
+ * The i2400m might produce trace information (diagnostics) and we
+ * send them through a different kernel-to-user pipe (to avoid
+ * clogging it).
+ *
+ * As in i2400m_rx_ctl(), we can't clone the original skb where the
+ * data is because we need to send this up via netlink; netlink has to
+ * add headers and we can't overwrite what's preceding the
+ * payload...as it is another message. So we just dup them.
+ */
+static
+void i2400m_rx_trace(struct i2400m *i2400m,
+                     const void *payload, size_t size)
+{
+        int result;
+        struct device *dev = i2400m_dev(i2400m);
+        struct wimax_dev *wimax_dev = &i2400m->wimax_dev;
+        const struct i2400m_l3l4_hdr *l3l4_hdr = payload;
+        unsigned msg_type;
+
+        result = i2400m_msg_size_check(i2400m, l3l4_hdr, size);
+        if (result < 0) {
+                dev_err(dev, "HW BUG? device sent a bad trace message: %d\n",
+                        result);
+                goto error_check;
+        }
+        msg_type = le16_to_cpu(l3l4_hdr->type);
+        d_printf(1, dev, "Trace %s 0x%04x: %zu bytes\n",
+                 msg_type & I2400M_MT_REPORT_MASK ? "REPORT" : "CMD/SET/GET",
+                 msg_type, size);
+        d_dump(2, dev, l3l4_hdr, size);
+        result = wimax_msg(wimax_dev, "trace", l3l4_hdr, size, GFP_KERNEL);
+        if (result < 0)
+                dev_err(dev, "error sending trace to userspace: %d\n",
+                        result);
error_check:
+        return;
+}
+
+
+/*
+ * Reorder queue data stored on skb->cb while the skb is queued in the
+ * reorder queues.
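+ *
+ * skb->cb is only 48 bytes, so whatever is stored there must fit;
+ * the queueing path below guards the cast with a BUILD_BUG_ON(),
+ * roughly (a sketch of the pattern):
+ *
+ *   struct i2400m_roq_data *roq_data = (struct i2400m_roq_data *) &skb->cb;
+ *   BUILD_BUG_ON(sizeof(*roq_data) > sizeof(skb->cb));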
+ */
+struct i2400m_roq_data {
+        unsigned sn;            /* Serial number for the skb */
+        enum i2400m_cs cs;      /* packet type for the skb */
+};
+
+
+/*
+ * ReOrder Queue
+ *
+ * @ws: Window Start; sequence number where the current window start
+ *     is for this queue
+ * @queue: the skb queue itself
+ * @log: circular ring buffer used to log information about the
+ *     reorder process in this queue that can be displayed in case of
+ *     error to help diagnose it.
+ *
+ * This is the head for a list of skbs. The skb->cb member of each
+ * skb queued here contains a 'struct i2400m_roq_data' where we store
+ * the sequence number (sn) and the cs (packet type) coming from the
+ * RX payload header from the device.
+ */
+struct i2400m_roq
+{
+        unsigned ws;
+        struct sk_buff_head queue;
+        struct i2400m_roq_log *log;
+};
+
+
+static
+void __i2400m_roq_init(struct i2400m_roq *roq)
+{
+        roq->ws = 0;
+        skb_queue_head_init(&roq->queue);
+}
+
+
+static
+unsigned __i2400m_roq_index(struct i2400m *i2400m, struct i2400m_roq *roq)
+{
+        return ((unsigned long) roq - (unsigned long) i2400m->rx_roq)
+                / sizeof(*roq);
+}
+
+
+/*
+ * Normalize a sequence number based on the queue's window start
+ *
+ * nsn = (sn - ws) % 2048
+ *
+ * Note that if @sn < @roq->ws, we still need a positive number; %'s
+ * sign is implementation specific, so we normalize it by adding 2048
+ * to make it positive.
+ */
+static
+unsigned __i2400m_roq_nsn(struct i2400m_roq *roq, unsigned sn)
+{
+        int r;
+        r = ((int) sn - (int) roq->ws) % 2048;
+        if (r < 0)
+                r += 2048;
+        return r;
+}
+
+
+/*
+ * Circular buffer to keep the last N reorder operations
+ *
+ * In case something fails, dump them to try to come up with what
+ * happened.
+ */
+enum {
+        I2400M_ROQ_LOG_LENGTH = 32,
+};
+
+struct i2400m_roq_log {
+        struct i2400m_roq_log_entry {
+                enum i2400m_ro_type type;
+                unsigned ws, count, sn, nsn, new_ws;
+        } entry[I2400M_ROQ_LOG_LENGTH];
+        unsigned in, out;
+};
+
+
+/* Print a log entry */
+static
+void i2400m_roq_log_entry_print(struct i2400m *i2400m, unsigned index,
+                                unsigned e_index,
+                                struct i2400m_roq_log_entry *e)
+{
+        struct device *dev = i2400m_dev(i2400m);
+
+        switch(e->type) {
+        case I2400M_RO_TYPE_RESET:
+                dev_err(dev, "q#%d reset ws %u cnt %u sn %u/%u"
+                        " - new nws %u\n",
+                        index, e->ws, e->count, e->sn, e->nsn, e->new_ws);
+                break;
+        case I2400M_RO_TYPE_PACKET:
+                dev_err(dev, "q#%d queue ws %u cnt %u sn %u/%u\n",
+                        index, e->ws, e->count, e->sn, e->nsn);
+                break;
+        case I2400M_RO_TYPE_WS:
+                dev_err(dev, "q#%d update_ws ws %u cnt %u sn %u/%u"
+                        " - new nws %u\n",
+                        index, e->ws, e->count, e->sn, e->nsn, e->new_ws);
+                break;
+        case I2400M_RO_TYPE_PACKET_WS:
+                dev_err(dev, "q#%d queue_update_ws ws %u cnt %u sn %u/%u"
+                        " - new nws %u\n",
+                        index, e->ws, e->count, e->sn, e->nsn, e->new_ws);
+                break;
+        default:
+                dev_err(dev, "q#%d BUG? entry %u - unknown type %u\n",
+                        index, e_index, e->type);
+                break;
+        }
+}
+
+
+static
+void i2400m_roq_log_add(struct i2400m *i2400m,
+                        struct i2400m_roq *roq, enum i2400m_ro_type type,
+                        unsigned ws, unsigned count, unsigned sn,
+                        unsigned nsn, unsigned new_ws)
+{
+        struct i2400m_roq_log_entry *e;
+        unsigned cnt_idx;
+        int index = __i2400m_roq_index(i2400m, roq);
+
+        /* if we run out of space, we eat the oldest entry */
+        if (roq->log->in - roq->log->out == I2400M_ROQ_LOG_LENGTH)
+                roq->log->out++;
+        cnt_idx = roq->log->in++ % I2400M_ROQ_LOG_LENGTH;
+        e = &roq->log->entry[cnt_idx];
+
+        e->type = type;
+        e->ws = ws;
+        e->count = count;
+        e->sn = sn;
+        e->nsn = nsn;
+        e->new_ws = new_ws;
+
+        if (d_test(1))
+                i2400m_roq_log_entry_print(i2400m, index, cnt_idx, e);
+}
+
+
+/* Dump all the entries in the FIFO and reinitialize it */
+static
+void i2400m_roq_log_dump(struct i2400m *i2400m, struct i2400m_roq *roq)
+{
+        unsigned cnt, cnt_idx;
+        struct i2400m_roq_log_entry *e;
+        int index = __i2400m_roq_index(i2400m, roq);
+
+        BUG_ON(roq->log->out > roq->log->in);
+        for (cnt = roq->log->out; cnt < roq->log->in; cnt++) {
+                cnt_idx = cnt % I2400M_ROQ_LOG_LENGTH;
+                e = &roq->log->entry[cnt_idx];
+                i2400m_roq_log_entry_print(i2400m, index, cnt_idx, e);
+                memset(e, 0, sizeof(*e));
+        }
+        roq->log->in = roq->log->out = 0;
+}
+
+
+/*
+ * Backbone for the queuing of an skb (by normalized sequence number)
+ *
+ * @i2400m: device descriptor
+ * @roq: reorder queue where to add
+ * @skb: the skb to add
+ * @sn: the sequence number of the skb
+ * @nsn: the normalized sequence number of the skb (pre-computed by the
+ *     caller from the @sn and @roq->ws).
+ *
+ * We try first a couple of quick cases:
+ *
+ * - the queue is empty
+ * - the skb would be appended to the queue
+ *
+ * These will be the most common operations.
+ *
+ * If these fail, then we have to do a sorted insertion in the queue,
+ * which is the slowest path.
+ *
+ * We don't have to acquire a reference count as we are going to own it.
+ */
+static
+void __i2400m_roq_queue(struct i2400m *i2400m, struct i2400m_roq *roq,
+                        struct sk_buff *skb, unsigned sn, unsigned nsn)
+{
+        struct device *dev = i2400m_dev(i2400m);
+        struct sk_buff *skb_itr;
+        struct i2400m_roq_data *roq_data_itr, *roq_data;
+        unsigned nsn_itr;
+
+        d_fnstart(4, dev, "(i2400m %p roq %p skb %p sn %u nsn %u)\n",
+                  i2400m, roq, skb, sn, nsn);
+
+        roq_data = (struct i2400m_roq_data *) &skb->cb;
+        BUILD_BUG_ON(sizeof(*roq_data) > sizeof(skb->cb));
+        roq_data->sn = sn;
+        d_printf(3, dev, "ERX: roq %p [ws %u] nsn %d sn %u\n",
+                 roq, roq->ws, nsn, roq_data->sn);
+
+        /* Queues will be empty on not-so-bad environments, so try
+         * that first */
+        if (skb_queue_empty(&roq->queue)) {
+                d_printf(2, dev, "ERX: roq %p - first one\n", roq);
+                __skb_queue_head(&roq->queue, skb);
+                goto out;
+        }
+        /* Now try append, as most of the operations will be that */
+        skb_itr = skb_peek_tail(&roq->queue);
+        roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb;
+        nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn);
+        /* NSN bounds assumed correct (checked when it was queued) */
+        if (nsn >= nsn_itr) {
+                d_printf(2, dev, "ERX: roq %p - appended after %p (nsn %d sn %u)\n",
+                         roq, skb_itr, nsn_itr, roq_data_itr->sn);
+                __skb_queue_tail(&roq->queue, skb);
+                goto out;
+        }
+        /* None of the fast path options worked. Iterate to find the
+         * right spot where to insert the packet; we know the queue is
+         * not empty, so we are not the first ones; we also know we
+         * are not going to be the last ones. The list is sorted, so
+         * we have to insert before the first entry with an nsn_itr
+         * greater than our nsn. */
+        skb_queue_walk(&roq->queue, skb_itr) {
+                roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb;
+                nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn);
+                /* NSN bounds assumed correct (checked when it was queued) */
+                if (nsn_itr > nsn) {
+                        d_printf(2, dev, "ERX: roq %p - queued before %p "
+                                 "(nsn %d sn %u)\n", roq, skb_itr, nsn_itr,
+                                 roq_data_itr->sn);
+                        __skb_queue_before(&roq->queue, skb_itr, skb);
+                        goto out;
+                }
+        }
+        /* If we get here, that is VERY bad -- print info to help
+         * diagnose and crash it */
+        dev_err(dev, "SW BUG? failed to insert packet\n");
+        dev_err(dev, "ERX: roq %p [ws %u] skb %p nsn %d sn %u\n",
+                roq, roq->ws, skb, nsn, roq_data->sn);
+        skb_queue_walk(&roq->queue, skb_itr) {
+                roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb;
+                nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn);
+                /* NSN bounds assumed correct (checked when it was queued) */
+                dev_err(dev, "ERX: roq %p skb_itr %p nsn %d sn %u\n",
+                        roq, skb_itr, nsn_itr, roq_data_itr->sn);
+        }
+        BUG();
+out:
+        d_fnend(4, dev, "(i2400m %p roq %p skb %p sn %u nsn %d) = void\n",
+                i2400m, roq, skb, sn, nsn);
+}
+
+
+/*
+ * Backbone for the update window start operation
+ *
+ * @i2400m: device descriptor
+ * @roq: Reorder queue
+ * @sn: New sequence number
+ *
+ * Updates the window start of a queue; when doing so, it must deliver
+ * to the networking stack all the queued skb's whose normalized
+ * sequence number is lower than the new normalized window start.
+ */
+static
+unsigned __i2400m_roq_update_ws(struct i2400m *i2400m, struct i2400m_roq *roq,
+                                unsigned sn)
+{
+        struct device *dev = i2400m_dev(i2400m);
+        struct sk_buff *skb_itr, *tmp_itr;
+        struct i2400m_roq_data *roq_data_itr;
+        unsigned new_nws, nsn_itr;
+
+        new_nws = __i2400m_roq_nsn(roq, sn);
+        /*
+         * For type 2 (update_window_start) rx messages, there is no
+         * need to check if the normalized sequence number is greater
+         * than 1023. Simply insert and deliver all packets to the
+         * host up to the window start.
+         */
+        skb_queue_walk_safe(&roq->queue, skb_itr, tmp_itr) {
+                roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb;
+                nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn);
+                /* NSN bounds assumed correct (checked when it was queued) */
+                if (nsn_itr < new_nws) {
+                        d_printf(2, dev, "ERX: roq %p - release skb %p "
+                                 "(nsn %u/%u new nws %u)\n",
+                                 roq, skb_itr, nsn_itr, roq_data_itr->sn,
+                                 new_nws);
+                        __skb_unlink(skb_itr, &roq->queue);
+                        i2400m_net_erx(i2400m, skb_itr, roq_data_itr->cs);
+                }
+                else
+                        break;  /* rest of packets all nsn_itr > nws */
+        }
+        roq->ws = sn;
+        return new_nws;
+}
+
+
+/*
+ * Reset a queue
+ *
+ * @i2400m: device descriptor
+ * @cin: Queue Index
+ *
+ * Deliver all the packets and reset the window-start to zero. Name is
+ * kind of misleading.
+ */
+static
+void i2400m_roq_reset(struct i2400m *i2400m, struct i2400m_roq *roq)
+{
+        struct device *dev = i2400m_dev(i2400m);
+        struct sk_buff *skb_itr, *tmp_itr;
+        struct i2400m_roq_data *roq_data_itr;
+
+        d_fnstart(2, dev, "(i2400m %p roq %p)\n", i2400m, roq);
+        i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_RESET,
+                           roq->ws, skb_queue_len(&roq->queue),
+                           ~0, ~0, 0);
+        skb_queue_walk_safe(&roq->queue, skb_itr, tmp_itr) {
+                roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb;
+                d_printf(2, dev, "ERX: roq %p - release skb %p (sn %u)\n",
+                         roq, skb_itr, roq_data_itr->sn);
+                __skb_unlink(skb_itr, &roq->queue);
+                i2400m_net_erx(i2400m, skb_itr, roq_data_itr->cs);
+        }
+        roq->ws = 0;
+        d_fnend(2, dev, "(i2400m %p roq %p) = void\n", i2400m, roq);
+}
+
+
+/*
+ * Queue a packet
+ *
+ * @i2400m: device descriptor
+ * @cin: Queue Index
+ * @skb: containing the packet data
+ * @fbn: First block number of the packet in @skb
+ * @lbn: Last block number of the packet in @skb
+ *
+ * The hardware is asking the driver to queue a packet for later
+ * delivery to the networking stack.
+ */
+static
+void i2400m_roq_queue(struct i2400m *i2400m, struct i2400m_roq *roq,
+                      struct sk_buff *skb, unsigned lbn)
+{
+        struct device *dev = i2400m_dev(i2400m);
+        unsigned nsn, len;
+
+        d_fnstart(2, dev, "(i2400m %p roq %p skb %p lbn %u) = void\n",
+                  i2400m, roq, skb, lbn);
+        len = skb_queue_len(&roq->queue);
+        nsn = __i2400m_roq_nsn(roq, lbn);
+        if (unlikely(nsn >= 1024)) {
+                dev_err(dev, "SW BUG? queue nsn %d (lbn %u ws %u)\n",
+                        nsn, lbn, roq->ws);
+                i2400m_roq_log_dump(i2400m, roq);
+                i2400m_reset(i2400m, I2400M_RT_WARM);
+        } else {
+                __i2400m_roq_queue(i2400m, roq, skb, lbn, nsn);
+                i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_PACKET,
+                                   roq->ws, len, lbn, nsn, ~0);
+        }
+        d_fnend(2, dev, "(i2400m %p roq %p skb %p lbn %u) = void\n",
+                i2400m, roq, skb, lbn);
+}
+
+
+/*
+ * Update the window start in a reorder queue and deliver all skbs
+ * with a lower window start
+ *
+ * @i2400m: device descriptor
+ * @roq: Reorder queue
+ * @sn: New sequence number
+ */
+static
+void i2400m_roq_update_ws(struct i2400m *i2400m, struct i2400m_roq *roq,
+                          unsigned sn)
+{
+        struct device *dev = i2400m_dev(i2400m);
+        unsigned old_ws, nsn, len;
+
+        d_fnstart(2, dev, "(i2400m %p roq %p sn %u)\n", i2400m, roq, sn);
+        old_ws = roq->ws;
+        len = skb_queue_len(&roq->queue);
+        nsn = __i2400m_roq_update_ws(i2400m, roq, sn);
+        i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_WS,
+                           old_ws, len, sn, nsn, roq->ws);
+        d_fnend(2, dev, "(i2400m %p roq %p sn %u) = void\n", i2400m, roq, sn);
+}
+
+
+/*
+ * Queue a packet and update the window start
+ *
+ * @i2400m: device descriptor
+ * @cin: Queue Index
+ * @skb: containing the packet data
+ * @fbn: First block number of the packet in @skb
+ * @sn: Last block number of the packet in @skb
+ *
+ * Note that unlike i2400m_roq_update_ws(), which sets the new window
+ * start to @sn, in here we'll set it to @sn + 1.
+ */
+static
+void i2400m_roq_queue_update_ws(struct i2400m *i2400m, struct i2400m_roq *roq,
+                                struct sk_buff *skb, unsigned sn)
+{
+        struct device *dev = i2400m_dev(i2400m);
+        unsigned nsn, old_ws, len;
+
+        d_fnstart(2, dev, "(i2400m %p roq %p skb %p sn %u)\n",
+                  i2400m, roq, skb, sn);
+        len = skb_queue_len(&roq->queue);
+        nsn = __i2400m_roq_nsn(roq, sn);
+        /*
+         * For type 3 (queue_update_window_start) rx messages, there is
+         * no need to check if the normalized sequence number is greater
+         * than 1023. Simply insert and deliver all packets to the host
+         * up to the window start.
+         */
+        old_ws = roq->ws;
+        /* If the queue is empty, don't bother as we'd queue
+         * it and immediately unqueue it -- just deliver it.
+         */
+        if (len == 0) {
+                struct i2400m_roq_data *roq_data;
+                roq_data = (struct i2400m_roq_data *) &skb->cb;
+                i2400m_net_erx(i2400m, skb, roq_data->cs);
+        } else
+                __i2400m_roq_queue(i2400m, roq, skb, sn, nsn);
+
+        __i2400m_roq_update_ws(i2400m, roq, sn + 1);
+        i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_PACKET_WS,
+                           old_ws, len, sn, nsn, roq->ws);
+
+        d_fnend(2, dev, "(i2400m %p roq %p skb %p sn %u) = void\n",
+                i2400m, roq, skb, sn);
+}
+
+
+/*
+ * This routine destroys the memory allocated for rx_roq, when no
+ * other thread is accessing it. Access to rx_roq is refcounted by
+ * rx_roq_refcount, hence memory allocated must be destroyed when
+ * rx_roq_refcount becomes zero. This routine gets executed when
+ * rx_roq_refcount becomes zero.
+ */
+static void i2400m_rx_roq_destroy(struct kref *ref)
+{
+        unsigned itr;
+        struct i2400m *i2400m
+                = container_of(ref, struct i2400m, rx_roq_refcount);
+        for (itr = 0; itr < I2400M_RO_CIN + 1; itr++)
+                __skb_queue_purge(&i2400m->rx_roq[itr].queue);
+        kfree(i2400m->rx_roq[0].log);
+        kfree(i2400m->rx_roq);
+        i2400m->rx_roq = NULL;
+}
+
+/*
+ * Receive and send up an extended data packet
+ *
+ * @i2400m: device descriptor
+ * @skb_rx: skb that contains the extended data packet
+ * @single_last: 1 if the payload is the only one or the last one of
+ *     the skb.
+ * @payload: pointer to the packet's data inside the skb
+ * @size: size of the payload
+ *
+ * Starting in v1.4 of the i2400m's firmware, the device can send data
+ * packets to the host in an extended format; this includes a 16
+ * byte header (struct i2400m_pl_edata_hdr). Using this header's space
+ * we can fake ethernet headers for ethernet device emulation without
+ * having to copy packets around.
+ *
+ * This function handles said path.
+ *
+ *
+ * Receive and send up an extended data packet that requires no reordering
+ *
+ * @i2400m: device descriptor
+ * @skb_rx: skb that contains the extended data packet
+ * @single_last: 1 if the payload is the only one or the last one of
+ *     the skb.
+ * @payload: pointer to the packet's data (past the actual extended
+ *     data payload header).
+ * @size: size of the payload
+ *
+ * Pass over to the networking stack a data packet that might have
+ * reordering requirements.
+ *
+ * This needs to decide if the skb in which the packet is contained
+ * can be reused or if it needs to be cloned. Then it has to be
+ * trimmed in the edges so that the beginning is the space for the eth
+ * header and then pass it to i2400m_net_erx() for the stack.
+ *
+ * Assumes the caller has verified the sanity of the payload (size,
+ * etc) already.
+ */
+static
+void i2400m_rx_edata(struct i2400m *i2400m, struct sk_buff *skb_rx,
+                     unsigned single_last, const void *payload, size_t size)
+{
+        struct device *dev = i2400m_dev(i2400m);
+        const struct i2400m_pl_edata_hdr *hdr = payload;
+        struct net_device *net_dev = i2400m->wimax_dev.net_dev;
+        struct sk_buff *skb;
+        enum i2400m_cs cs;
+        u32 reorder;
+        unsigned ro_needed, ro_type, ro_cin, ro_sn;
+        struct i2400m_roq *roq;
+        struct i2400m_roq_data *roq_data;
+        unsigned long flags;
+
+        BUILD_BUG_ON(ETH_HLEN > sizeof(*hdr));
+
+        d_fnstart(2, dev, "(i2400m %p skb_rx %p single %u payload %p "
+                  "size %zu)\n", i2400m, skb_rx, single_last, payload, size);
+        if (size < sizeof(*hdr)) {
+                dev_err(dev, "ERX: HW BUG?
message with short header (%zu " + "vs %zu bytes expected)\n", size, sizeof(*hdr)); + goto error; + } + + if (single_last) { + skb = skb_get(skb_rx); + d_printf(3, dev, "ERX: skb %p reusing\n", skb); + } else { + skb = skb_clone(skb_rx, GFP_KERNEL); + if (skb == NULL) { + dev_err(dev, "ERX: no memory to clone skb\n"); + net_dev->stats.rx_dropped++; + goto error_skb_clone; + } + d_printf(3, dev, "ERX: skb %p cloned from %p\n", skb, skb_rx); + } + /* now we have to pull and trim so that the skb points to the + * beginning of the IP packet; the netdev part will add the + * ethernet header as needed - we know there is enough space + * because we checked in i2400m_rx_edata(). */ + skb_pull(skb, payload + sizeof(*hdr) - (void *) skb->data); + skb_trim(skb, (void *) skb_end_pointer(skb) - payload - sizeof(*hdr)); + + reorder = le32_to_cpu(hdr->reorder); + ro_needed = reorder & I2400M_RO_NEEDED; + cs = hdr->cs; + if (ro_needed) { + ro_type = (reorder >> I2400M_RO_TYPE_SHIFT) & I2400M_RO_TYPE; + ro_cin = (reorder >> I2400M_RO_CIN_SHIFT) & I2400M_RO_CIN; + ro_sn = (reorder >> I2400M_RO_SN_SHIFT) & I2400M_RO_SN; + + spin_lock_irqsave(&i2400m->rx_lock, flags); + if (i2400m->rx_roq == NULL) { + kfree_skb(skb); /* rx_roq is already destroyed */ + spin_unlock_irqrestore(&i2400m->rx_lock, flags); + goto error; + } + roq = &i2400m->rx_roq[ro_cin]; + kref_get(&i2400m->rx_roq_refcount); + spin_unlock_irqrestore(&i2400m->rx_lock, flags); + + roq_data = (struct i2400m_roq_data *) &skb->cb; + roq_data->sn = ro_sn; + roq_data->cs = cs; + d_printf(2, dev, "ERX: reorder needed: " + "type %u cin %u [ws %u] sn %u/%u len %zuB\n", + ro_type, ro_cin, roq->ws, ro_sn, + __i2400m_roq_nsn(roq, ro_sn), size); + d_dump(2, dev, payload, size); + switch(ro_type) { + case I2400M_RO_TYPE_RESET: + i2400m_roq_reset(i2400m, roq); + kfree_skb(skb); /* no data here */ + break; + case I2400M_RO_TYPE_PACKET: + i2400m_roq_queue(i2400m, roq, skb, ro_sn); + break; + case I2400M_RO_TYPE_WS: + i2400m_roq_update_ws(i2400m, roq, ro_sn); + kfree_skb(skb); /* no data here */ + break; + case I2400M_RO_TYPE_PACKET_WS: + i2400m_roq_queue_update_ws(i2400m, roq, skb, ro_sn); + break; + default: + dev_err(dev, "HW BUG? unknown reorder type %u\n", ro_type); + } + + spin_lock_irqsave(&i2400m->rx_lock, flags); + kref_put(&i2400m->rx_roq_refcount, i2400m_rx_roq_destroy); + spin_unlock_irqrestore(&i2400m->rx_lock, flags); + } + else + i2400m_net_erx(i2400m, skb, cs); +error_skb_clone: +error: + d_fnend(2, dev, "(i2400m %p skb_rx %p single %u payload %p " + "size %zu) = void\n", i2400m, skb_rx, single_last, payload, size); +} + + +/* + * Act on a received payload + * + * @i2400m: device instance + * @skb_rx: skb where the transaction was received + * @single_last: 1 this is the only payload or the last one (so the + * skb can be reused instead of cloned). + * @pld: payload descriptor + * @payload: payload data + * + * Upon reception of a payload, look at its guts in the payload + * descriptor and decide what to do with it. If it is a single payload + * skb or if the last skb is a data packet, the skb will be referenced + * and modified (so it doesn't have to be cloned). 
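+ *
+ * For example (illustrative numbers only): a message carrying three
+ * data payloads is dispatched with single_last = 0 for payloads 0
+ * and 1 (i2400m_rx_edata() then clones the skb) and single_last = 1
+ * for payload 2 (the skb is reused via skb_get()), matching how
+ * single_last is computed in i2400m_rx() below.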
+ */ +static +void i2400m_rx_payload(struct i2400m *i2400m, struct sk_buff *skb_rx, + unsigned single_last, const struct i2400m_pld *pld, + const void *payload) +{ + struct device *dev = i2400m_dev(i2400m); + size_t pl_size = i2400m_pld_size(pld); + enum i2400m_pt pl_type = i2400m_pld_type(pld); + + d_printf(7, dev, "RX: received payload type %u, %zu bytes\n", + pl_type, pl_size); + d_dump(8, dev, payload, pl_size); + + switch (pl_type) { + case I2400M_PT_DATA: + d_printf(3, dev, "RX: data payload %zu bytes\n", pl_size); + i2400m_net_rx(i2400m, skb_rx, single_last, payload, pl_size); + break; + case I2400M_PT_CTRL: + i2400m_rx_ctl(i2400m, skb_rx, payload, pl_size); + break; + case I2400M_PT_TRACE: + i2400m_rx_trace(i2400m, payload, pl_size); + break; + case I2400M_PT_EDATA: + d_printf(3, dev, "ERX: data payload %zu bytes\n", pl_size); + i2400m_rx_edata(i2400m, skb_rx, single_last, payload, pl_size); + break; + default: /* Anything else shouldn't come to the host */ + if (printk_ratelimit()) + dev_err(dev, "RX: HW BUG? unexpected payload type %u\n", + pl_type); + } +} + + +/* + * Check a received transaction's message header + * + * @i2400m: device descriptor + * @msg_hdr: message header + * @buf_size: size of the received buffer + * + * Check that the declarations done by a RX buffer message header are + * sane and consistent with the amount of data that was received. + */ +static +int i2400m_rx_msg_hdr_check(struct i2400m *i2400m, + const struct i2400m_msg_hdr *msg_hdr, + size_t buf_size) +{ + int result = -EIO; + struct device *dev = i2400m_dev(i2400m); + if (buf_size < sizeof(*msg_hdr)) { + dev_err(dev, "RX: HW BUG? message with short header (%zu " + "vs %zu bytes expected)\n", buf_size, sizeof(*msg_hdr)); + goto error; + } + if (msg_hdr->barker != cpu_to_le32(I2400M_D2H_MSG_BARKER)) { + dev_err(dev, "RX: HW BUG? message received with unknown " + "barker 0x%08x (buf_size %zu bytes)\n", + le32_to_cpu(msg_hdr->barker), buf_size); + goto error; + } + if (msg_hdr->num_pls == 0) { + dev_err(dev, "RX: HW BUG? zero payload packets in message\n"); + goto error; + } + if (le16_to_cpu(msg_hdr->num_pls) > I2400M_MAX_PLS_IN_MSG) { + dev_err(dev, "RX: HW BUG? message contains more payload " + "than maximum; ignoring.\n"); + goto error; + } + result = 0; +error: + return result; +} + + +/* + * Check a payload descriptor against the received data + * + * @i2400m: device descriptor + * @pld: payload descriptor + * @pl_itr: offset (in bytes) in the received buffer the payload is + * located + * @buf_size: size of the received buffer + * + * Given a payload descriptor (part of a RX buffer), check it is sane + * and that the data it declares fits in the buffer. + */ +static +int i2400m_rx_pl_descr_check(struct i2400m *i2400m, + const struct i2400m_pld *pld, + size_t pl_itr, size_t buf_size) +{ + int result = -EIO; + struct device *dev = i2400m_dev(i2400m); + size_t pl_size = i2400m_pld_size(pld); + enum i2400m_pt pl_type = i2400m_pld_type(pld); + + if (pl_size > i2400m->bus_pl_size_max) { + dev_err(dev, "RX: HW BUG? payload @%zu: size %zu is " + "bigger than maximum %zu; ignoring message\n", + pl_itr, pl_size, i2400m->bus_pl_size_max); + goto error; + } + if (pl_itr + pl_size > buf_size) { /* enough? */ + dev_err(dev, "RX: HW BUG? payload @%zu: size %zu " + "goes beyond the received buffer " + "size (%zu bytes); ignoring message\n", + pl_itr, pl_size, buf_size); + goto error; + } + if (pl_type >= I2400M_PT_ILLEGAL) { + dev_err(dev, "RX: HW BUG? 
illegal payload type %u; "
+			"ignoring message\n", pl_type);
+		goto error;
+	}
+	result = 0;
+error:
+	return result;
+}
+
+
+/**
+ * i2400m_rx - Receive a buffer of data from the device
+ *
+ * @i2400m: device descriptor
+ * @skb: skbuff where the data has been received
+ *
+ * Parse a buffer of data that contains an RX message sent from the
+ * device. See the file header for the format. Run all checks on the
+ * buffer header, then run over each payload's descriptors, verify
+ * their consistency and act on each payload's contents. If
+ * everything is successful, update the device's statistics.
+ *
+ * Note: You need to set the skb to contain only the length of the
+ * received buffer; for that, use skb_trim(skb, RECEIVED_SIZE).
+ *
+ * Returns:
+ *
+ * 0 if ok, < 0 errno on error
+ *
+ * If ok, this function now owns the skb and the caller DOESN'T have
+ * to run kfree_skb() on it. However, on error, the caller still owns
+ * the skb and is responsible for releasing it.
+ */
+int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb)
+{
+	int i, result;
+	struct device *dev = i2400m_dev(i2400m);
+	const struct i2400m_msg_hdr *msg_hdr;
+	size_t pl_itr, pl_size;
+	unsigned long flags;
+	unsigned num_pls, single_last, skb_len;
+
+	skb_len = skb->len;
+	d_fnstart(4, dev, "(i2400m %p skb %p [size %u])\n",
+		  i2400m, skb, skb_len);
+	msg_hdr = (void *) skb->data;
+	result = i2400m_rx_msg_hdr_check(i2400m, msg_hdr, skb_len);
+	if (result < 0)
+		goto error_msg_hdr_check;
+	result = -EIO;
+	num_pls = le16_to_cpu(msg_hdr->num_pls);
+	/* Check payload descriptor(s) */
+	pl_itr = struct_size(msg_hdr, pld, num_pls);
+	pl_itr = ALIGN(pl_itr, I2400M_PL_ALIGN);
+	if (pl_itr > skb_len) {	/* got all the payload descriptors? */
+		dev_err(dev, "RX: HW BUG? message too short (%u bytes) for "
+			"%u payload descriptors (%zu each, total %zu)\n",
+			skb_len, num_pls, sizeof(msg_hdr->pld[0]), pl_itr);
+		goto error_pl_descr_short;
+	}
+	/* Walk each payload -- check we really got it */
+	for (i = 0; i < num_pls; i++) {
+		/* work around old gcc warnings */
+		pl_size = i2400m_pld_size(&msg_hdr->pld[i]);
+		result = i2400m_rx_pl_descr_check(i2400m, &msg_hdr->pld[i],
+						  pl_itr, skb_len);
+		if (result < 0)
+			goto error_pl_descr_check;
+		single_last = num_pls == 1 || i == num_pls - 1;
+		i2400m_rx_payload(i2400m, skb, single_last, &msg_hdr->pld[i],
+				  skb->data + pl_itr);
+		pl_itr += ALIGN(pl_size, I2400M_PL_ALIGN);
+		cond_resched();		/* Don't monopolize */
+	}
+	kfree_skb(skb);
+	/* Update device statistics */
+	spin_lock_irqsave(&i2400m->rx_lock, flags);
+	i2400m->rx_pl_num += i;
+	if (i > i2400m->rx_pl_max)
+		i2400m->rx_pl_max = i;
+	if (i < i2400m->rx_pl_min)
+		i2400m->rx_pl_min = i;
+	i2400m->rx_num++;
+	i2400m->rx_size_acc += skb_len;
+	if (skb_len < i2400m->rx_size_min)
+		i2400m->rx_size_min = skb_len;
+	if (skb_len > i2400m->rx_size_max)
+		i2400m->rx_size_max = skb_len;
+	spin_unlock_irqrestore(&i2400m->rx_lock, flags);
+error_pl_descr_check:
+error_pl_descr_short:
+error_msg_hdr_check:
+	d_fnend(4, dev, "(i2400m %p skb %p [size %u]) = %d\n",
+		i2400m, skb, skb_len, result);
+	return result;
+}
+EXPORT_SYMBOL_GPL(i2400m_rx);
+
+
+void i2400m_unknown_barker(struct i2400m *i2400m,
+			   const void *buf, size_t size)
+{
+	struct device *dev = i2400m_dev(i2400m);
+	char prefix[64];
+	const __le32 *barker = buf;
+	dev_err(dev, "RX: HW BUG? 
unknown barker %08x, "
+		"dropping %zu bytes\n", le32_to_cpu(*barker), size);
+	snprintf(prefix, sizeof(prefix), "%s %s: ",
+		 dev_driver_string(dev), dev_name(dev));
+	if (size > 64) {
+		print_hex_dump(KERN_ERR, prefix, DUMP_PREFIX_OFFSET,
+			       8, 4, buf, 64, 0);
+		printk(KERN_ERR "%s... (only first 64 bytes "
+		       "dumped)\n", prefix);
+	} else
+		print_hex_dump(KERN_ERR, prefix, DUMP_PREFIX_OFFSET,
+			       8, 4, buf, size, 0);
+}
+EXPORT_SYMBOL(i2400m_unknown_barker);
+
+
+/*
+ * Initialize the RX queue and infrastructure
+ *
+ * This sets up all the RX reordering infrastructure, which will not
+ * be used if reordering is not enabled or if the firmware does not
+ * support it. The device is told to do reordering in
+ * i2400m_dev_initialize(), where it also looks at the value of the
+ * i2400m->rx_reorder switch before taking a decision.
+ *
+ * Note we allocate the roq queues in one chunk and the actual logging
+ * support for them in another one, and then we set up the pointers
+ * from the first to the second.
+ */
+int i2400m_rx_setup(struct i2400m *i2400m)
+{
+	int result = 0;
+
+	i2400m->rx_reorder = i2400m_rx_reorder_disabled ? 0 : 1;
+	if (i2400m->rx_reorder) {
+		unsigned itr;
+		struct i2400m_roq_log *rd;
+
+		result = -ENOMEM;
+
+		i2400m->rx_roq = kcalloc(I2400M_RO_CIN + 1,
+					 sizeof(i2400m->rx_roq[0]), GFP_KERNEL);
+		if (i2400m->rx_roq == NULL)
+			goto error_roq_alloc;
+
+		rd = kcalloc(I2400M_RO_CIN + 1, sizeof(*i2400m->rx_roq[0].log),
+			     GFP_KERNEL);
+		if (rd == NULL) {
+			result = -ENOMEM;
+			goto error_roq_log_alloc;
+		}
+
+		for (itr = 0; itr < I2400M_RO_CIN + 1; itr++) {
+			__i2400m_roq_init(&i2400m->rx_roq[itr]);
+			i2400m->rx_roq[itr].log = &rd[itr];
+		}
+		kref_init(&i2400m->rx_roq_refcount);
+	}
+	return 0;
+
+error_roq_log_alloc:
+	kfree(i2400m->rx_roq);
+error_roq_alloc:
+	return result;
+}
+
+
+/* Tear down the RX queue and infrastructure */
+void i2400m_rx_release(struct i2400m *i2400m)
+{
+	unsigned long flags;
+
+	if (i2400m->rx_reorder) {
+		spin_lock_irqsave(&i2400m->rx_lock, flags);
+		kref_put(&i2400m->rx_roq_refcount, i2400m_rx_roq_destroy);
+		spin_unlock_irqrestore(&i2400m->rx_lock, flags);
+	}
+	/* at this point, nothing can be received... */
+	i2400m_report_hook_flush(i2400m);
+}
diff --git a/drivers/staging/wimax/i2400m/sysfs.c b/drivers/staging/wimax/i2400m/sysfs.c
new file mode 100644
index 000000000000..895ee265909b
--- /dev/null
+++ b/drivers/staging/wimax/i2400m/sysfs.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel Wireless WiMAX Connection 2400m
+ * Sysfs interfaces to show driver and device information
+ *
+ * Copyright (C) 2007 Intel Corporation
+ * Inaky Perez-Gonzalez
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include "i2400m.h"
+
+
+#define D_SUBMODULE sysfs
+#include "debug-levels.h"
+
+
+/*
+ * Set the idle timeout (msecs)
+ *
+ * FIXME: eventually this should be a common WiMAX stack method, but
+ * we would like to wait and see how other devices manage it.
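+ *
+ * Illustrative usage from userspace (the exact sysfs path depends on
+ * where the attribute group ends up being registered; the path below
+ * is a guess for a netdev named wmx0):
+ *
+ *   echo 5000 > /sys/class/net/wmx0/device/i2400m_idle_timeout
+ *
+ * Per the parser below, valid values are 0 or 100-300000 msecs in
+ * increments of 100.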
+ */ +static +ssize_t i2400m_idle_timeout_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + ssize_t result; + struct i2400m *i2400m = net_dev_to_i2400m(to_net_dev(dev)); + unsigned val; + + result = -EINVAL; + if (sscanf(buf, "%u\n", &val) != 1) + goto error_no_unsigned; + if (val != 0 && (val < 100 || val > 300000 || val % 100 != 0)) { + dev_err(dev, "idle_timeout: %u: invalid msecs specification; " + "valid values are 0, 100-300000 in 100 increments\n", + val); + goto error_bad_value; + } + result = i2400m_set_idle_timeout(i2400m, val); + if (result >= 0) + result = size; +error_no_unsigned: +error_bad_value: + return result; +} + +static +DEVICE_ATTR_WO(i2400m_idle_timeout); + +static +struct attribute *i2400m_dev_attrs[] = { + &dev_attr_i2400m_idle_timeout.attr, + NULL, +}; + +struct attribute_group i2400m_dev_attr_group = { + .name = NULL, /* we want them in the same directory */ + .attrs = i2400m_dev_attrs, +}; diff --git a/drivers/staging/wimax/i2400m/tx.c b/drivers/staging/wimax/i2400m/tx.c new file mode 100644 index 000000000000..1255302e251e --- /dev/null +++ b/drivers/staging/wimax/i2400m/tx.c @@ -0,0 +1,1011 @@ +/* + * Intel Wireless WiMAX Connection 2400m + * Generic (non-bus specific) TX handling + * + * + * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * Intel Corporation + * Yanir Lubetkin + * - Initial implementation + * + * Intel Corporation + * Inaky Perez-Gonzalez + * - Rewritten to use a single FIFO to lower the memory allocation + * pressure and optimize cache hits when copying to the queue, as + * well as splitting out bus-specific code. + * + * + * Implements data transmission to the device; this is done through a + * software FIFO, as data/control frames can be coalesced (while the + * device is reading the previous tx transaction, others accumulate). + * + * A FIFO is used because at the end it is resource-cheaper that trying + * to implement scatter/gather over USB. 
As well, most traffic is going
+ * to be download (vs upload).
+ *
+ * The format for sending/receiving data to/from the i2400m is
+ * described in detail in rx.c:PROTOCOL FORMAT. In here we implement
+ * the transmission of that. This is split between a bus-independent
+ * part that just prepares everything and a bus-specific part that
+ * does the actual transmission over the bus to the device (in the
+ * bus-specific driver).
+ *
+ *
+ * The general format of a device-host transaction is MSG-HDR, PLD1,
+ * PLD2...PLDN, PL1, PL2,...PLN, PADDING.
+ *
+ * Because we need to send the payload descriptors and then the
+ * payloads, and because it is kind of expensive to do scatterlists
+ * in USB (one URB per node), it becomes cheaper to append all the
+ * data to a FIFO (copying to a FIFO potentially in cache is cheaper).
+ *
+ * Then the bus-specific code takes the parts of that FIFO that are
+ * written and passes them to the device.
+ *
+ * So the concepts to keep in mind here are:
+ *
+ * We use a FIFO to queue the data in a linear buffer. We first append
+ * a MSG-HDR, space for I2400M_TX_PLD_MAX payload descriptors and then
+ * go appending payloads until we run out of space or of payload
+ * descriptors. Then we append padding to make the whole transaction a
+ * multiple of i2400m->bus_tx_block_size (as defined by the bus layer).
+ *
+ * - A TX message: a combination of a message header, payload
+ *   descriptors and payloads.
+ *
+ *   Open: it is marked as active (i2400m->tx_msg is valid) and we
+ *   can keep adding payloads to it.
+ *
+ *   Closed: we are not appending more payloads to this TX message
+ *   (exhausted space in the queue, too many payloads or whichever).
+ *   We have appended padding so the whole message length is aligned
+ *   to i2400m->bus_tx_block_size (as set by the bus/transport layer).
+ *
+ * - Most of the time we keep a TX message open to which we append
+ *   payloads.
+ *
+ * - If we are going to append and there is no more space (we are at
+ *   the end of the FIFO), we close the message, mark the rest of the
+ *   FIFO space unusable (skip_tail), create a new message at the
+ *   beginning of the FIFO (if there is space) and append the payload
+ *   there.
+ *
+ *   This is because we need to give linear TX messages to the bus
+ *   engine. So we don't write a message into the remaining FIFO
+ *   space up to the tail; we continue at the head of it.
+ *
+ * - We overload one of the fields in the message header to use it as
+ *   'size' of the TX message, so we can iterate over them. It also
+ *   contains a flag that indicates if we have to skip it or not.
+ *   When we send the buffer, we update that to its real on-the-wire
+ *   value.
+ *
+ * - The MSG-HDR PLD1...PLDN stuff has to be a size multiple of 16.
+ *
+ *   It follows that if the MSG-HDR says we have N payloads, the whole
+ *   header + descriptors is 16 + 4*N; for those to be a multiple of
+ *   16, it follows that N can be 4, 8, 12, ... (32, 48, 64, 80...
+ *   bytes).
+ *
+ *   So if we have only 1 payload, we have to submit a header that in
+ *   all truth has space for 4.
+ *
+ *   The implication is that we reserve space for 12 (64 bytes); but
+ *   if we fill up only (eg) 2, our header becomes 32 bytes only. So
+ *   the TX engine has to shift those 32 bytes of msg header and 2
+ *   payload descriptors up, so that right after them the payloads
+ *   start, and the TX engine has to know about that.
+ *
+ *   It is cheaper to move the header up than the whole payloads down.
+ *
+ *   We do this in i2400m_tx_close(). See 'i2400m_msg_hdr->offset'.
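+ *
+ *   A sketch of that arithmetic, as done by i2400m_tx_close() below
+ *   (assuming the 16-byte message header and 4-byte payload
+ *   descriptor that the enum in this file adds up):
+ *
+ *     hdr_size = ALIGN(16 + 4 * num_pls, 16);
+ *     offset   = I2400M_TX_PLD_SIZE - hdr_size;
+ *
+ *   e.g. num_pls = 2 gives hdr_size = 32; with the 60-descriptor
+ *   reservation used below (16 + 60 * 4 = 256 bytes, vs the older
+ *   12-descriptor/64-byte figures quoted above), the header is
+ *   moved up by offset = 224 bytes so it sits right before the
+ *   payloads.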
+ * + * - Each payload has to be size-padded to 16 bytes; before appending + * it, we just do it. + * + * - The whole message has to be padded to i2400m->bus_tx_block_size; + * we do this at close time. Thus, when reserving space for the + * payload, we always make sure there is also free space for this + * padding that sooner or later will happen. + * + * When we append a message, we tell the bus specific code to kick in + * TXs. It will TX (in parallel) until the buffer is exhausted--hence + * the lockin we do. The TX code will only send a TX message at the + * time (which remember, might contain more than one payload). Of + * course, when the bus-specific driver attempts to TX a message that + * is still open, it gets closed first. + * + * Gee, this is messy; well a picture. In the example below we have a + * partially full FIFO, with a closed message ready to be delivered + * (with a moved message header to make sure it is size-aligned to + * 16), TAIL room that was unusable (and thus is marked with a message + * header that says 'skip this') and at the head of the buffer, an + * incomplete message with a couple of payloads. + * + * N ___________________________________________________ + * | | + * | TAIL room | + * | | + * | msg_hdr to skip (size |= 0x80000) | + * |---------------------------------------------------|------- + * | | /|\ + * | | | + * | TX message padding | | + * | | | + * | | | + * |- - - - - - - - - - - - - - - - - - - - - - - - - -| | + * | | | + * | payload 1 | | + * | | N * tx_block_size + * | | | + * |- - - - - - - - - - - - - - - - - - - - - - - - - -| | + * | | | + * | payload 1 | | + * | | | + * | | | + * |- - - - - - - - - - - - - - - - - - - - - - - - - -|- -|- - - - + * | padding 3 /|\ | | /|\ + * | padding 2 | | | | + * | pld 1 32 bytes (2 * 16) | | | + * | pld 0 | | | | + * | moved msg_hdr \|/ | \|/ | + * |- - - - - - - - - - - - - - - - - - - - - - - - - -|- - - | + * | | _PLD_SIZE + * | unused | | + * | | | + * |- - - - - - - - - - - - - - - - - - - - - - - - - -| | + * | msg_hdr (size X) [this message is closed] | \|/ + * |===================================================|========== <=== OUT + * | | + * | | + * | | + * | Free rooom | + * | | + * | | + * | | + * | | + * | | + * | | + * | | + * | | + * | | + * |===================================================|========== <=== IN + * | | + * | | + * | | + * | | + * | payload 1 | + * | | + * | | + * |- - - - - - - - - - - - - - - - - - - - - - - - - -| + * | | + * | payload 0 | + * | | + * | | + * |- - - - - - - - - - - - - - - - - - - - - - - - - -| + * | pld 11 /|\ | + * | ... | | + * | pld 1 64 bytes (2 * 16) | + * | pld 0 | | + * | msg_hdr (size X) \|/ [message is open] | + * 0 --------------------------------------------------- + * + * + * ROADMAP + * + * i2400m_tx_setup() Called by i2400m_setup + * i2400m_tx_release() Called by i2400m_release() + * + * i2400m_tx() Called to send data or control frames + * i2400m_tx_fifo_push() Allocates append-space in the FIFO + * i2400m_tx_new() Opens a new message in the FIFO + * i2400m_tx_fits() Checks if a new payload fits in the message + * i2400m_tx_close() Closes an open message in the FIFO + * i2400m_tx_skip_tail() Marks unusable FIFO tail space + * i2400m->bus_tx_kick() + * + * Now i2400m->bus_tx_kick() is the the bus-specific driver backend + * implementation; that would do: + * + * i2400m->bus_tx_kick() + * i2400m_tx_msg_get() Gets first message ready to go + * ...sends it... 
+ * i2400m_tx_msg_sent()       Ack the message is sent; repeat from
+ *                            _tx_msg_get() until it returns NULL
+ *                            (FIFO empty).
+ */
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include "i2400m.h"
+
+
+#define D_SUBMODULE tx
+#include "debug-levels.h"
+
+enum {
+	/**
+	 * TX Buffer size
+	 *
+	 * Doc says the maximum transaction is 16KiB. If we had 16KiB en
+	 * route and 16KiB being queued, it boils down to needing
+	 * 32KiB. 32KiB was insufficient for a 1400-byte MTU, hence the
+	 * tx buffer size was increased to 64KiB.
+	 */
+	I2400M_TX_BUF_SIZE = 65536,
+	/**
+	 * Message header and payload descriptors have to be 16-byte
+	 * aligned (16 + 4 * N = 16 * M). Taking average sent packets
+	 * to be of MTU size (~1400-~1500 bytes), it follows that we
+	 * could fit at most 10-11 payloads in one transaction. To meet
+	 * the alignment requirement, that means we need to leave space
+	 * for 12 descriptors (64 bytes). To simplify, we leave space
+	 * for that; if at the end there are fewer, we pad up to the
+	 * nearest multiple of 16.
+	 */
+	/*
+	 * According to the Intel Wimax i3200, i5x50 and i6x50
+	 * specification documents, the maximum number of payloads per
+	 * message can be up to 60. Increasing the number of payloads to
+	 * 60 per message helps to accommodate smaller payloads in a
+	 * single transaction.
+	 */
+	I2400M_TX_PLD_MAX = 60,
+	I2400M_TX_PLD_SIZE = sizeof(struct i2400m_msg_hdr)
+			     + I2400M_TX_PLD_MAX * sizeof(struct i2400m_pld),
+	I2400M_TX_SKIP = 0x80000000,
+	/*
+	 * According to the Intel Wimax i3200, i5x50 and i6x50
+	 * specification documents, the maximum size of each message can
+	 * be up to 16KiB.
+	 */
+	I2400M_TX_MSG_SIZE = 16384,
+};
+
+#define TAIL_FULL ((void *)~(unsigned long)NULL)
+
+/*
+ * Calculate how much tail room is available
+ *
+ * Note the trick here. This path is ONLY called for Case A (see
+ * i2400m_tx_fifo_push() below), where we have:
+ *
+ *    Case A
+ * N  ___________
+ *   | tail room |
+ *   |           |
+ *   |<-  IN   ->|
+ *   |           |
+ *   |   data    |
+ *   |           |
+ *   |<-  OUT  ->|
+ *   |           |
+ *   | head room |
+ * 0  -----------
+ *
+ * When calculating the tail_room, tx_in (modulo the buffer size)
+ * might come out as zero if i2400m->tx_in is right at the end of the
+ * buffer (a really full buffer with no head room). In that case,
+ * tail_room would compute as I2400M_TX_BUF_SIZE, although it is
+ * actually zero. Hence the final mod (%) operation. However, with
+ * that optimization alone, i2400m->tx_in being zero would give the
+ * wrong answer, so we treat it as a special case.
+ */
+static inline
+size_t __i2400m_tx_tail_room(struct i2400m *i2400m)
+{
+	size_t tail_room;
+	size_t tx_in;
+
+	if (unlikely(i2400m->tx_in == 0))
+		return I2400M_TX_BUF_SIZE;
+	tx_in = i2400m->tx_in % I2400M_TX_BUF_SIZE;
+	tail_room = I2400M_TX_BUF_SIZE - tx_in;
+	tail_room %= I2400M_TX_BUF_SIZE;
+	return tail_room;
+}
+
+
+/*
+ * Allocate @size bytes in the TX fifo, return a pointer to it
+ *
+ * @i2400m: device descriptor
+ * @size: size of the buffer we need to allocate
+ * @padding: ensure that there is at least this many bytes of free
+ *     contiguous space in the fifo. This is needed because later on
+ *     we might need to add padding.
+ * @try_head: specify whether to allocate head room or tail room space
+ *     in the TX FIFO. This boolean is required to avoid a system hang
+ *     due to an infinite loop caused by i2400m_tx_fifo_push().
+ *     The caller must always try to allocate tail room space first by
+ *     calling this routine with try_head = 0.
+ *     If there is not enough tail room space but there is enough
+ *     head room space (i2400m_tx_fifo_push() returns TAIL_FULL), try
+ *     to allocate head room space by calling this routine again with
+ *     try_head = 1.
+ *
+ * Returns:
+ *
+ *     Pointer to the allocated space. NULL if there is no
+ *     space. TAIL_FULL if there is no space at the tail but there is
+ *     at the head (Case B below).
+ *
+ * These are the two basic cases we need to keep an eye on -- it is
+ * much better explained in linux/kernel/kfifo.c, but this code
+ * basically does the same. No rocket science here.
+ *
+ *       Case A               Case B
+ * N  ___________          ___________
+ *   | tail room |        |   data    |
+ *   |           |        |           |
+ *   |<-  IN   ->|        |<-  OUT  ->|
+ *   |           |        |           |
+ *   |   data    |        |   room    |
+ *   |           |        |           |
+ *   |<-  OUT  ->|        |<-  IN   ->|
+ *   |           |        |           |
+ *   | head room |        |   data    |
+ * 0  -----------          -----------
+ *
+ * We allocate only *contiguous* space.
+ *
+ * We can allocate only from 'room'. In Case B, it is simple; in Case
+ * A, we only try from the tail room; if that is not enough, we just
+ * fail and return TAIL_FULL, and let the caller figure out if it
+ * wants to skip the tail room and try to allocate from the head.
+ *
+ * There is a corner case wherein i2400m_tx_new() can get into an
+ * infinite loop calling i2400m_tx_fifo_push(). In certain situations,
+ * tx_in may have reached the top of the TX FIFO and
+ * __i2400m_tx_tail_room() returns 0, as described below:
+ *
+ * N  ___________ tail room is zero
+ *   |<-  IN   ->|
+ *   |           |
+ *   |           |
+ *   |           |
+ *   |   data    |
+ *   |<-  OUT  ->|
+ *   |           |
+ *   |           |
+ *   | head room |
+ * 0  -----------
+ *
+ * When the tail room is zero and there is a request to add a payload
+ * to the TX FIFO, the following call chain results:
+ *
+ * i2400m_tx()
+ *         ->calls i2400m_tx_close()
+ *         ->calls i2400m_tx_skip_tail()
+ *         goto try_new;
+ *         ->calls i2400m_tx_new()
+ *                    |----> [try_head:]
+ *     infinite loop  |     ->calls i2400m_tx_fifo_push()
+ *                    |        if (tail_room < needed)
+ *                    |           if (head_room >= needed)
+ *                    |               return TAIL_FULL;
+ *                    |<----  goto try_head;
+ *
+ * i2400m_tx() calls i2400m_tx_close() to close the message, since
+ * there is no tail room to accommodate the payload, and calls
+ * i2400m_tx_skip_tail() to skip the tail space. Now i2400m_tx() calls
+ * i2400m_tx_new() to allocate space for a new message header, calling
+ * i2400m_tx_fifo_push(), which returns TAIL_FULL since there is no
+ * tail space to accommodate the message header but there is enough
+ * head space. Without further information, i2400m_tx_new() would keep
+ * retrying i2400m_tx_fifo_push(), ending up in a loop and freezing
+ * the system.
+ *
+ * This corner case is avoided by using a try_head boolean as an
+ * argument to i2400m_tx_fifo_push().
+ *
+ * Note:
+ *
+ *     Assumes i2400m->tx_lock is taken, and we use that as a barrier
+ *
+ *     The indexes keep increasing and we reset them to zero when we
+ *     pop data off the queue
+ */
+static
+void *i2400m_tx_fifo_push(struct i2400m *i2400m, size_t size,
+			  size_t padding, bool try_head)
+{
+	struct device *dev = i2400m_dev(i2400m);
+	size_t room, tail_room, needed_size;
+	void *ptr;
+
+	needed_size = size + padding;
+	room = I2400M_TX_BUF_SIZE - (i2400m->tx_in - i2400m->tx_out);
+	if (room < needed_size) {	/* this takes care of Case B */
+		d_printf(2, dev, "fifo push %zu/%zu: no space\n",
+			 size, padding);
+		return NULL;
+	}
+	/* Is there space at the tail? 
*/ + tail_room = __i2400m_tx_tail_room(i2400m); + if (!try_head && tail_room < needed_size) { + /* + * If the tail room space is not enough to push the message + * in the TX FIFO, then there are two possibilities: + * 1. There is enough head room space to accommodate + * this message in the TX FIFO. + * 2. There is not enough space in the head room and + * in tail room of the TX FIFO to accommodate the message. + * In the case (1), return TAIL_FULL so that the caller + * can figure out, if the caller wants to push the message + * into the head room space. + * In the case (2), return NULL, indicating that the TX FIFO + * cannot accommodate the message. + */ + if (room - tail_room >= needed_size) { + d_printf(2, dev, "fifo push %zu/%zu: tail full\n", + size, padding); + return TAIL_FULL; /* There might be head space */ + } else { + d_printf(2, dev, "fifo push %zu/%zu: no head space\n", + size, padding); + return NULL; /* There is no space */ + } + } + ptr = i2400m->tx_buf + i2400m->tx_in % I2400M_TX_BUF_SIZE; + d_printf(2, dev, "fifo push %zu/%zu: at @%zu\n", size, padding, + i2400m->tx_in % I2400M_TX_BUF_SIZE); + i2400m->tx_in += size; + return ptr; +} + + +/* + * Mark the tail of the FIFO buffer as 'to-skip' + * + * We should never hit the BUG_ON() because all the sizes we push to + * the FIFO are padded to be a multiple of 16 -- the size of *msg + * (I2400M_PL_PAD for the payloads, I2400M_TX_PLD_SIZE for the + * header). + * + * Tail room can get to be zero if a message was opened when there was + * space only for a header. _tx_close() will mark it as to-skip (as it + * will have no payloads) and there will be no more space to flush, so + * nothing has to be done here. This is probably cheaper than ensuring + * in _tx_new() that there is some space for payloads...as we could + * always possibly hit the same problem if the payload wouldn't fit. + * + * Note: + * + * Assumes i2400m->tx_lock is taken, and we use that as a barrier + * + * This path is only taken for Case A FIFO situations [see + * i2400m_tx_fifo_push()] + */ +static +void i2400m_tx_skip_tail(struct i2400m *i2400m) +{ + struct device *dev = i2400m_dev(i2400m); + size_t tx_in = i2400m->tx_in % I2400M_TX_BUF_SIZE; + size_t tail_room = __i2400m_tx_tail_room(i2400m); + struct i2400m_msg_hdr *msg = i2400m->tx_buf + tx_in; + if (unlikely(tail_room == 0)) + return; + BUG_ON(tail_room < sizeof(*msg)); + msg->size = tail_room | I2400M_TX_SKIP; + d_printf(2, dev, "skip tail: skipping %zu bytes @%zu\n", + tail_room, tx_in); + i2400m->tx_in += tail_room; +} + + +/* + * Check if a skb will fit in the TX queue's current active TX + * message (if there are still descriptors left unused). + * + * Returns: + * 0 if the message won't fit, 1 if it will. + * + * Note: + * + * Assumes a TX message is active (i2400m->tx_msg). + * + * Assumes i2400m->tx_lock is taken, and we use that as a barrier + */ +static +unsigned i2400m_tx_fits(struct i2400m *i2400m) +{ + struct i2400m_msg_hdr *msg_hdr = i2400m->tx_msg; + return le16_to_cpu(msg_hdr->num_pls) < I2400M_TX_PLD_MAX; + +} + + +/* + * Start a new TX message header in the queue. + * + * Reserve memory from the base FIFO engine and then just initialize + * the message header. + * + * We allocate the biggest TX message header we might need (one that'd + * fit I2400M_TX_PLD_MAX payloads) -- when it is closed it will be + * 'ironed it out' and the unneeded parts removed. 
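+ *
+ * Size-wise (a sketch, assuming the same 16-byte header and 4-byte
+ * descriptor sizes the enum above adds up), the reservation is
+ *
+ *   I2400M_TX_PLD_SIZE = 16 + 60 * 4 = 256 bytes
+ *
+ * plus bus_tx_room_min bytes of padding, so at least one payload of
+ * up to bus_tx_room_min bytes can follow the header.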
+ * + * NOTE: + * + * Assumes that the previous message is CLOSED (eg: either + * there was none or 'i2400m_tx_close()' was called on it). + * + * Assumes i2400m->tx_lock is taken, and we use that as a barrier + */ +static +void i2400m_tx_new(struct i2400m *i2400m) +{ + struct device *dev = i2400m_dev(i2400m); + struct i2400m_msg_hdr *tx_msg; + bool try_head = false; + BUG_ON(i2400m->tx_msg != NULL); + /* + * In certain situations, TX queue might have enough space to + * accommodate the new message header I2400M_TX_PLD_SIZE, but + * might not have enough space to accommodate the payloads. + * Adding bus_tx_room_min padding while allocating a new TX message + * increases the possibilities of including at least one payload of the + * size <= bus_tx_room_min. + */ +try_head: + tx_msg = i2400m_tx_fifo_push(i2400m, I2400M_TX_PLD_SIZE, + i2400m->bus_tx_room_min, try_head); + if (tx_msg == NULL) + goto out; + else if (tx_msg == TAIL_FULL) { + i2400m_tx_skip_tail(i2400m); + d_printf(2, dev, "new TX message: tail full, trying head\n"); + try_head = true; + goto try_head; + } + memset(tx_msg, 0, I2400M_TX_PLD_SIZE); + tx_msg->size = I2400M_TX_PLD_SIZE; +out: + i2400m->tx_msg = tx_msg; + d_printf(2, dev, "new TX message: %p @%zu\n", + tx_msg, (void *) tx_msg - i2400m->tx_buf); +} + + +/* + * Finalize the current TX message header + * + * Sets the message header to be at the proper location depending on + * how many descriptors we have (check documentation at the file's + * header for more info on that). + * + * Appends padding bytes to make sure the whole TX message (counting + * from the 'relocated' message header) is aligned to + * tx_block_size. We assume the _append() code has left enough space + * in the FIFO for that. If there are no payloads, just pass, as it + * won't be transferred. + * + * The amount of padding bytes depends on how many payloads are in the + * TX message, as the "msg header and payload descriptors" will be + * shifted up in the buffer. + */ +static +void i2400m_tx_close(struct i2400m *i2400m) +{ + struct device *dev = i2400m_dev(i2400m); + struct i2400m_msg_hdr *tx_msg = i2400m->tx_msg; + struct i2400m_msg_hdr *tx_msg_moved; + size_t aligned_size, padding, hdr_size; + void *pad_buf; + unsigned num_pls; + + if (tx_msg->size & I2400M_TX_SKIP) /* a skipper? nothing to do */ + goto out; + num_pls = le16_to_cpu(tx_msg->num_pls); + /* We can get this situation when a new message was started + * and there was no space to add payloads before hitting the + tail (and taking padding into consideration). */ + if (num_pls == 0) { + tx_msg->size |= I2400M_TX_SKIP; + goto out; + } + /* Relocate the message header + * + * Find the current header size, align it to 16 and if we need + * to move it so the tail is next to the payloads, move it and + * set the offset. + * + * If it moved, this header is good only for transmission; the + * original one (it is kept if we moved) is still used to + * figure out where the next TX message starts (and where the + * offset to the moved header is). + */ + hdr_size = struct_size(tx_msg, pld, le16_to_cpu(tx_msg->num_pls)); + hdr_size = ALIGN(hdr_size, I2400M_PL_ALIGN); + tx_msg->offset = I2400M_TX_PLD_SIZE - hdr_size; + tx_msg_moved = (void *) tx_msg + tx_msg->offset; + memmove(tx_msg_moved, tx_msg, hdr_size); + tx_msg_moved->size -= tx_msg->offset; + /* + * Now figure out how much we have to add to the (moved!) + * message so the size is a multiple of i2400m->bus_tx_block_size. 
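+	 *
+	 * A worked example (illustrative numbers): with
+	 * bus_tx_block_size = 256 and tx_msg_moved->size = 520,
+	 * aligned_size = ALIGN(520, 256) = 768 and padding = 248,
+	 * so 248 bytes of 0xad filler get appended below.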
+ */ + aligned_size = ALIGN(tx_msg_moved->size, i2400m->bus_tx_block_size); + padding = aligned_size - tx_msg_moved->size; + if (padding > 0) { + pad_buf = i2400m_tx_fifo_push(i2400m, padding, 0, 0); + if (WARN_ON(pad_buf == NULL || pad_buf == TAIL_FULL)) { + /* This should not happen -- append should verify + * there is always space left at least to append + * tx_block_size */ + dev_err(dev, + "SW BUG! Possible data leakage from memory the " + "device should not read for padding - " + "size %lu aligned_size %zu tx_buf %p in " + "%zu out %zu\n", + (unsigned long) tx_msg_moved->size, + aligned_size, i2400m->tx_buf, i2400m->tx_in, + i2400m->tx_out); + } else + memset(pad_buf, 0xad, padding); + } + tx_msg_moved->padding = cpu_to_le16(padding); + tx_msg_moved->size += padding; + if (tx_msg != tx_msg_moved) + tx_msg->size += padding; +out: + i2400m->tx_msg = NULL; +} + + +/** + * i2400m_tx - send the data in a buffer to the device + * + * @buf: pointer to the buffer to transmit + * + * @buf_len: buffer size + * + * @pl_type: type of the payload we are sending. + * + * Returns: + * 0 if ok, < 0 errno code on error (-ENOSPC, if there is no more + * room for the message in the queue). + * + * Appends the buffer to the TX FIFO and notifies the bus-specific + * part of the driver that there is new data ready to transmit. + * Once this function returns, the buffer has been copied, so it can + * be reused. + * + * The steps followed to append are explained in detail in the file + * header. + * + * Whenever we write to a message, we increase msg->size, so it + * reflects exactly how big the message is. This is needed so that if + * we concatenate two messages before they can be sent, the code that + * sends the messages can find the boundaries (and it will replace the + * size with the real barker before sending). + * + * Note: + * + * Cold and warm reset payloads need to be sent as a single + * payload, so we handle that. + */ +int i2400m_tx(struct i2400m *i2400m, const void *buf, size_t buf_len, + enum i2400m_pt pl_type) +{ + int result = -ENOSPC; + struct device *dev = i2400m_dev(i2400m); + unsigned long flags; + size_t padded_len; + void *ptr; + bool try_head = false; + unsigned is_singleton = pl_type == I2400M_PT_RESET_WARM + || pl_type == I2400M_PT_RESET_COLD; + + d_fnstart(3, dev, "(i2400m %p skb %p [%zu bytes] pt %u)\n", + i2400m, buf, buf_len, pl_type); + padded_len = ALIGN(buf_len, I2400M_PL_ALIGN); + d_printf(5, dev, "padded_len %zd buf_len %zd\n", padded_len, buf_len); + /* If there is no current TX message, create one; if the + * current one is out of payload slots or we have a singleton, + * close it and start a new one */ + spin_lock_irqsave(&i2400m->tx_lock, flags); + /* If tx_buf is NULL, device is shutdown */ + if (i2400m->tx_buf == NULL) { + result = -ESHUTDOWN; + goto error_tx_new; + } +try_new: + if (unlikely(i2400m->tx_msg == NULL)) + i2400m_tx_new(i2400m); + else if (unlikely(!i2400m_tx_fits(i2400m) + || (is_singleton && i2400m->tx_msg->num_pls != 0))) { + d_printf(2, dev, "closing TX message (fits %u singleton " + "%u num_pls %u)\n", i2400m_tx_fits(i2400m), + is_singleton, i2400m->tx_msg->num_pls); + i2400m_tx_close(i2400m); + i2400m_tx_new(i2400m); + } + if (i2400m->tx_msg == NULL) + goto error_tx_new; + /* + * Check if this skb will fit in the TX queue's current active + * TX message. The total message size must not exceed the maximum + * size of each message I2400M_TX_MSG_SIZE. If it exceeds, + * close the current message and push this skb into the new message. 
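+	 *
+	 * E.g. (illustrative numbers): with tx_msg->size = 15000 and
+	 * padded_len = 2048, 15000 + 2048 > 16384 = I2400M_TX_MSG_SIZE,
+	 * so the current message is closed and this payload opens a
+	 * fresh one.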
+	 */
+	if (i2400m->tx_msg->size + padded_len > I2400M_TX_MSG_SIZE) {
+		d_printf(2, dev, "TX: message too big, going new\n");
+		i2400m_tx_close(i2400m);
+		i2400m_tx_new(i2400m);
+	}
+	if (i2400m->tx_msg == NULL)
+		goto error_tx_new;
+	/* So we have a current message header; now append space for
+	 * the message -- if there is not enough, try the head */
+	ptr = i2400m_tx_fifo_push(i2400m, padded_len,
+				  i2400m->bus_tx_block_size, try_head);
+	if (ptr == TAIL_FULL) {	/* Tail is full, try head */
+		d_printf(2, dev, "pl append: tail full\n");
+		i2400m_tx_close(i2400m);
+		i2400m_tx_skip_tail(i2400m);
+		try_head = true;
+		goto try_new;
+	} else if (ptr == NULL) {	/* All full */
+		result = -ENOSPC;
+		d_printf(2, dev, "pl append: all full\n");
+	} else {	/* Got space, copy it, set padding */
+		struct i2400m_msg_hdr *tx_msg = i2400m->tx_msg;
+		unsigned num_pls = le16_to_cpu(tx_msg->num_pls);
+		memcpy(ptr, buf, buf_len);
+		memset(ptr + buf_len, 0xad, padded_len - buf_len);
+		i2400m_pld_set(&tx_msg->pld[num_pls], buf_len, pl_type);
+		d_printf(3, dev, "pld 0x%08x (type 0x%1x len 0x%04zx)\n",
+			 le32_to_cpu(tx_msg->pld[num_pls].val),
+			 pl_type, buf_len);
+		tx_msg->num_pls = cpu_to_le16(num_pls + 1);
+		tx_msg->size += padded_len;
+		d_printf(2, dev, "TX: appended %zu b (up to %u b) pl #%u\n",
+			 padded_len, tx_msg->size, num_pls + 1);
+		d_printf(2, dev,
+			 "TX: appended hdr @%zu %zu b pl #%u @%zu %zu/%zu b\n",
+			 (void *)tx_msg - i2400m->tx_buf, (size_t)tx_msg->size,
+			 num_pls + 1, ptr - i2400m->tx_buf, buf_len, padded_len);
+		result = 0;
+		if (is_singleton)
+			i2400m_tx_close(i2400m);
+	}
+error_tx_new:
+	spin_unlock_irqrestore(&i2400m->tx_lock, flags);
+	/* kick in most cases, except when the TX subsys is down, as
+	 * it might free space */
+	if (likely(result != -ESHUTDOWN))
+		i2400m->bus_tx_kick(i2400m);
+	d_fnend(3, dev, "(i2400m %p skb %p [%zu bytes] pt %u) = %d\n",
+		i2400m, buf, buf_len, pl_type, result);
+	return result;
+}
+EXPORT_SYMBOL_GPL(i2400m_tx);
+
+
+/**
+ * i2400m_tx_msg_get - Get the first TX message in the FIFO to start sending it
+ *
+ * @i2400m: device descriptor
+ * @bus_size: where to place the size of the TX message
+ *
+ * Called by the bus-specific driver to get the first TX message in
+ * the FIFO that is ready for transmission.
+ *
+ * It sets the state in @i2400m to indicate the bus-specific driver is
+ * transferring that message (i2400m->tx_msg_size).
+ *
+ * Once the transfer is completed, call i2400m_tx_msg_sent().
+ *
+ * Notes:
+ *
+ * The size of the TX message to be transmitted might be smaller than
+ * that of the TX message in the FIFO (in case the header was
+ * shorter). Hence, we copy it in @bus_size, for the bus layer to
+ * use. We keep the message's size in i2400m->tx_msg_size so that
+ * when the bus is later done transferring we know how much to
+ * advance the fifo.
+ *
+ * We collect statistics here as all the data is available and we
+ * assume it is going to work [see i2400m_tx_msg_sent()].
+ */
+struct i2400m_msg_hdr *i2400m_tx_msg_get(struct i2400m *i2400m,
+					 size_t *bus_size)
+{
+	struct device *dev = i2400m_dev(i2400m);
+	struct i2400m_msg_hdr *tx_msg, *tx_msg_moved;
+	unsigned long flags, pls;
+
+	d_fnstart(3, dev, "(i2400m %p bus_size %p)\n", i2400m, bus_size);
+	spin_lock_irqsave(&i2400m->tx_lock, flags);
+	tx_msg_moved = NULL;
+	if (i2400m->tx_buf == NULL)
+		goto out_unlock;
+skip:
+	tx_msg_moved = NULL;
+	if (i2400m->tx_in == i2400m->tx_out) {	/* Empty FIFO? 
*/
+		i2400m->tx_in = 0;
+		i2400m->tx_out = 0;
+		d_printf(2, dev, "TX: FIFO empty: resetting\n");
+		goto out_unlock;
+	}
+	tx_msg = i2400m->tx_buf + i2400m->tx_out % I2400M_TX_BUF_SIZE;
+	if (tx_msg->size & I2400M_TX_SKIP) {	/* skip? */
+		d_printf(2, dev, "TX: skip: msg @%zu (%zu b)\n",
+			 i2400m->tx_out % I2400M_TX_BUF_SIZE,
+			 (size_t) tx_msg->size & ~I2400M_TX_SKIP);
+		i2400m->tx_out += tx_msg->size & ~I2400M_TX_SKIP;
+		goto skip;
+	}
+
+	if (tx_msg->num_pls == 0) {		/* No payloads? */
+		if (tx_msg == i2400m->tx_msg) {	/* open, we are done */
+			d_printf(2, dev,
+				 "TX: FIFO empty: open msg w/o payloads @%zu\n",
+				 (void *) tx_msg - i2400m->tx_buf);
+			tx_msg = NULL;
+			goto out_unlock;
+		} else {			/* closed, skip it */
+			d_printf(2, dev,
+				 "TX: skip msg w/o payloads @%zu (%zu b)\n",
+				 (void *) tx_msg - i2400m->tx_buf,
+				 (size_t) tx_msg->size);
+			i2400m->tx_out += tx_msg->size & ~I2400M_TX_SKIP;
+			goto skip;
+		}
+	}
+	if (tx_msg == i2400m->tx_msg)		/* open msg? */
+		i2400m_tx_close(i2400m);
+
+	/* Now we have a valid TX message (with payloads) to TX */
+	tx_msg_moved = (void *) tx_msg + tx_msg->offset;
+	i2400m->tx_msg_size = tx_msg->size;
+	*bus_size = tx_msg_moved->size;
+	d_printf(2, dev, "TX: pid %d msg hdr at @%zu offset +@%zu "
+		 "size %zu bus_size %zu\n",
+		 current->pid, (void *) tx_msg - i2400m->tx_buf,
+		 (size_t) tx_msg->offset, (size_t) tx_msg->size,
+		 (size_t) tx_msg_moved->size);
+	tx_msg_moved->barker = cpu_to_le32(I2400M_H2D_PREVIEW_BARKER);
+	tx_msg_moved->sequence = cpu_to_le32(i2400m->tx_sequence++);
+
+	pls = le16_to_cpu(tx_msg_moved->num_pls);
+	i2400m->tx_pl_num += pls;		/* Update stats */
+	if (pls > i2400m->tx_pl_max)
+		i2400m->tx_pl_max = pls;
+	if (pls < i2400m->tx_pl_min)
+		i2400m->tx_pl_min = pls;
+	i2400m->tx_num++;
+	i2400m->tx_size_acc += *bus_size;
+	if (*bus_size < i2400m->tx_size_min)
+		i2400m->tx_size_min = *bus_size;
+	if (*bus_size > i2400m->tx_size_max)
+		i2400m->tx_size_max = *bus_size;
+out_unlock:
+	spin_unlock_irqrestore(&i2400m->tx_lock, flags);
+	d_fnend(3, dev, "(i2400m %p bus_size %p [%zu]) = %p\n",
+		i2400m, bus_size, *bus_size, tx_msg_moved);
+	return tx_msg_moved;
+}
+EXPORT_SYMBOL_GPL(i2400m_tx_msg_get);
+
+
+/**
+ * i2400m_tx_msg_sent - indicate the transmission of a TX message
+ *
+ * @i2400m: device descriptor
+ *
+ * Called by the bus-specific driver when a message has been sent;
+ * this pops it from the FIFO; as there is now space, restart the
+ * queue in case it was stopped.
+ *
+ * Should be called even if the message send failed and we are
+ * dropping this TX message.
+ */
+void i2400m_tx_msg_sent(struct i2400m *i2400m)
+{
+	unsigned n;
+	unsigned long flags;
+	struct device *dev = i2400m_dev(i2400m);
+
+	d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
+	spin_lock_irqsave(&i2400m->tx_lock, flags);
+	if (i2400m->tx_buf == NULL)
+		goto out_unlock;
+	i2400m->tx_out += i2400m->tx_msg_size;
+	d_printf(2, dev, "TX: sent %zu b\n", (size_t) i2400m->tx_msg_size);
+	i2400m->tx_msg_size = 0;
+	BUG_ON(i2400m->tx_out > i2400m->tx_in);
+	/* level the FIFO markers off */
+	n = i2400m->tx_out / I2400M_TX_BUF_SIZE;
+	i2400m->tx_out %= I2400M_TX_BUF_SIZE;
+	i2400m->tx_in -= n * I2400M_TX_BUF_SIZE;
+out_unlock:
+	spin_unlock_irqrestore(&i2400m->tx_lock, flags);
+	d_fnend(3, dev, "(i2400m %p) = void\n", i2400m);
+}
+EXPORT_SYMBOL_GPL(i2400m_tx_msg_sent);
+
+
+/**
+ * i2400m_tx_setup - Initialize the TX queue and infrastructure
+ *
+ * Make sure we reset the TX sequence to zero, as when this function
+ * is called, the firmware has been just restarted. 
Same rational + * for tx_in, tx_out, tx_msg_size and tx_msg. We reset them since + * the memory for TX queue is reallocated. + */ +int i2400m_tx_setup(struct i2400m *i2400m) +{ + int result = 0; + void *tx_buf; + unsigned long flags; + + /* Do this here only once -- can't do on + * i2400m_hard_start_xmit() as we'll cause race conditions if + * the WS was scheduled on another CPU */ + INIT_WORK(&i2400m->wake_tx_ws, i2400m_wake_tx_work); + + tx_buf = kmalloc(I2400M_TX_BUF_SIZE, GFP_ATOMIC); + if (tx_buf == NULL) { + result = -ENOMEM; + goto error_kmalloc; + } + + /* + * Fail the build if we can't fit at least two maximum size messages + * on the TX FIFO [one being delivered while one is constructed]. + */ + BUILD_BUG_ON(2 * I2400M_TX_MSG_SIZE > I2400M_TX_BUF_SIZE); + spin_lock_irqsave(&i2400m->tx_lock, flags); + i2400m->tx_sequence = 0; + i2400m->tx_in = 0; + i2400m->tx_out = 0; + i2400m->tx_msg_size = 0; + i2400m->tx_msg = NULL; + i2400m->tx_buf = tx_buf; + spin_unlock_irqrestore(&i2400m->tx_lock, flags); + /* Huh? the bus layer has to define this... */ + BUG_ON(i2400m->bus_tx_block_size == 0); +error_kmalloc: + return result; + +} + + +/** + * i2400m_tx_release - Tear down the TX queue and infrastructure + */ +void i2400m_tx_release(struct i2400m *i2400m) +{ + unsigned long flags; + spin_lock_irqsave(&i2400m->tx_lock, flags); + kfree(i2400m->tx_buf); + i2400m->tx_buf = NULL; + spin_unlock_irqrestore(&i2400m->tx_lock, flags); +} diff --git a/drivers/staging/wimax/i2400m/usb-debug-levels.h b/drivers/staging/wimax/i2400m/usb-debug-levels.h new file mode 100644 index 000000000000..8fd0111560f6 --- /dev/null +++ b/drivers/staging/wimax/i2400m/usb-debug-levels.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Intel Wireless WiMAX Connection 2400m + * Debug levels control file for the i2400m-usb module + * + * Copyright (C) 2007-2008 Intel Corporation + * Inaky Perez-Gonzalez + */ +#ifndef __debug_levels__h__ +#define __debug_levels__h__ + +/* Maximum compile and run time debug level for all submodules */ +#define D_MODULENAME i2400m_usb +#define D_MASTER CONFIG_WIMAX_I2400M_DEBUG_LEVEL + +#include "../linux-wimax-debug.h" + +/* List of all the enabled modules */ +enum d_module { + D_SUBMODULE_DECLARE(usb), + D_SUBMODULE_DECLARE(fw), + D_SUBMODULE_DECLARE(notif), + D_SUBMODULE_DECLARE(rx), + D_SUBMODULE_DECLARE(tx), +}; + + +#endif /* #ifndef __debug_levels__h__ */ diff --git a/drivers/staging/wimax/i2400m/usb-fw.c b/drivers/staging/wimax/i2400m/usb-fw.c new file mode 100644 index 000000000000..27ab233650d5 --- /dev/null +++ b/drivers/staging/wimax/i2400m/usb-fw.c @@ -0,0 +1,365 @@ +/* + * Intel Wireless WiMAX Connection 2400m + * Firmware uploader's USB specifics + * + * + * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * Intel Corporation + * Yanir Lubetkin + * Inaky Perez-Gonzalez + * - Initial implementation + * + * Inaky Perez-Gonzalez + * - bus generic/specific split + * + * THE PROCEDURE + * + * See fw.c for the generic description of this procedure. + * + * This file implements only the USB specifics. It boils down to how + * to send a command and waiting for an acknowledgement from the + * device. + * + * This code (and process) is single threaded. It assumes it is the + * only thread poking around (guaranteed by fw.c). + * + * COMMAND EXECUTION + * + * A write URB is posted with the buffer to the bulk output endpoint. + * + * ACK RECEPTION + * + * We just post a URB to the notification endpoint and wait for + * data. We repeat until we get all the data we expect (as indicated + * by the call from the bus generic code). + * + * The data is not read from the bulk in endpoint for boot mode. + * + * ROADMAP + * + * i2400mu_bus_bm_cmd_send + * i2400m_bm_cmd_prepare... + * i2400mu_tx_bulk_out + * + * i2400mu_bus_bm_wait_for_ack + * i2400m_notif_submit + */ +#include +#include +#include "i2400m-usb.h" + + +#define D_SUBMODULE fw +#include "usb-debug-levels.h" + + +/* + * Synchronous write to the device + * + * Takes care of updating EDC counts and thus, handle device errors. + */ +static +ssize_t i2400mu_tx_bulk_out(struct i2400mu *i2400mu, void *buf, size_t buf_size) +{ + int result; + struct device *dev = &i2400mu->usb_iface->dev; + int len; + struct usb_endpoint_descriptor *epd; + int pipe, do_autopm = 1; + + result = usb_autopm_get_interface(i2400mu->usb_iface); + if (result < 0) { + dev_err(dev, "BM-CMD: can't get autopm: %d\n", result); + do_autopm = 0; + } + epd = usb_get_epd(i2400mu->usb_iface, i2400mu->endpoint_cfg.bulk_out); + pipe = usb_sndbulkpipe(i2400mu->usb_dev, epd->bEndpointAddress); +retry: + result = usb_bulk_msg(i2400mu->usb_dev, pipe, buf, buf_size, &len, 200); + switch (result) { + case 0: + if (len != buf_size) { + dev_err(dev, "BM-CMD: short write (%u B vs %zu " + "expected)\n", len, buf_size); + result = -EIO; + break; + } + result = len; + break; + case -EPIPE: + /* + * Stall -- maybe the device is choking with our + * requests. Clear it and give it some time. If they + * happen to often, it might be another symptom, so we + * reset. + * + * No error handling for usb_clear_halt(0; if it + * works, the retry works; if it fails, this switch + * does the error handling for us. 
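+		 *
+		 * Numerically (a sketch; the real thresholds come from
+		 * EDC_MAX_ERRORS and EDC_ERROR_TIMEFRAME): edc_inc()
+		 * below only trips once more than 10 * EDC_MAX_ERRORS
+		 * stalls accumulate within one EDC_ERROR_TIMEFRAME, at
+		 * which point we stop clearing the halt and queue a
+		 * device reset instead.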
+ */ + if (edc_inc(&i2400mu->urb_edc, + 10 * EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) { + dev_err(dev, "BM-CMD: too many stalls in " + "URB; resetting device\n"); + usb_queue_reset_device(i2400mu->usb_iface); + } else { + usb_clear_halt(i2400mu->usb_dev, pipe); + msleep(10); /* give the device some time */ + goto retry; + } + fallthrough; + case -EINVAL: /* while removing driver */ + case -ENODEV: /* dev disconnect ... */ + case -ENOENT: /* just ignore it */ + case -ESHUTDOWN: /* and exit */ + case -ECONNRESET: + result = -ESHUTDOWN; + break; + case -ETIMEDOUT: /* bah... */ + break; + default: /* any other? */ + if (edc_inc(&i2400mu->urb_edc, + EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) { + dev_err(dev, "BM-CMD: maximum errors in " + "URB exceeded; resetting device\n"); + usb_queue_reset_device(i2400mu->usb_iface); + result = -ENODEV; + break; + } + dev_err(dev, "BM-CMD: URB error %d, retrying\n", + result); + goto retry; + } + if (do_autopm) + usb_autopm_put_interface(i2400mu->usb_iface); + return result; +} + + +/* + * Send a boot-mode command over the bulk-out pipe + * + * Command can be a raw command, which requires no preparation (and + * which might not even be following the command format). Checks that + * the right amount of data was transferred. + * + * To satisfy USB requirements (no onstack, vmalloc or in data segment + * buffers), we copy the command to i2400m->bm_cmd_buf and send it from + * there. + * + * @flags: pass thru from i2400m_bm_cmd() + * @return: cmd_size if ok, < 0 errno code on error. + */ +ssize_t i2400mu_bus_bm_cmd_send(struct i2400m *i2400m, + const struct i2400m_bootrom_header *_cmd, + size_t cmd_size, int flags) +{ + ssize_t result; + struct device *dev = i2400m_dev(i2400m); + struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); + int opcode = _cmd == NULL ? -1 : i2400m_brh_get_opcode(_cmd); + struct i2400m_bootrom_header *cmd; + size_t cmd_size_a = ALIGN(cmd_size, 16); /* USB restriction */ + + d_fnstart(8, dev, "(i2400m %p cmd %p size %zu)\n", + i2400m, _cmd, cmd_size); + result = -E2BIG; + if (cmd_size > I2400M_BM_CMD_BUF_SIZE) + goto error_too_big; + if (_cmd != i2400m->bm_cmd_buf) + memmove(i2400m->bm_cmd_buf, _cmd, cmd_size); + cmd = i2400m->bm_cmd_buf; + if (cmd_size_a > cmd_size) /* Zero pad space */ + memset(i2400m->bm_cmd_buf + cmd_size, 0, cmd_size_a - cmd_size); + if ((flags & I2400M_BM_CMD_RAW) == 0) { + if (WARN_ON(i2400m_brh_get_response_required(cmd) == 0)) + dev_warn(dev, "SW BUG: response_required == 0\n"); + i2400m_bm_cmd_prepare(cmd); + } + result = i2400mu_tx_bulk_out(i2400mu, i2400m->bm_cmd_buf, cmd_size); + if (result < 0) { + dev_err(dev, "boot-mode cmd %d: cannot send: %zd\n", + opcode, result); + goto error_cmd_send; + } + if (result != cmd_size) { /* all was transferred? 
*/ + dev_err(dev, "boot-mode cmd %d: incomplete transfer " + "(%zd vs %zu submitted)\n", opcode, result, cmd_size); + result = -EIO; + goto error_cmd_size; + } +error_cmd_size: +error_cmd_send: +error_too_big: + d_fnend(8, dev, "(i2400m %p cmd %p size %zu) = %zd\n", + i2400m, _cmd, cmd_size, result); + return result; +} + + +static +void __i2400mu_bm_notif_cb(struct urb *urb) +{ + complete(urb->context); +} + + +/* + * submit a read to the notification endpoint + * + * @i2400m: device descriptor + * @urb: urb to use + * @completion: completion variable to complete when done + * + * Data is always read to i2400m->bm_ack_buf + */ +static +int i2400mu_notif_submit(struct i2400mu *i2400mu, struct urb *urb, + struct completion *completion) +{ + struct i2400m *i2400m = &i2400mu->i2400m; + struct usb_endpoint_descriptor *epd; + int pipe; + + epd = usb_get_epd(i2400mu->usb_iface, + i2400mu->endpoint_cfg.notification); + pipe = usb_rcvintpipe(i2400mu->usb_dev, epd->bEndpointAddress); + usb_fill_int_urb(urb, i2400mu->usb_dev, pipe, + i2400m->bm_ack_buf, I2400M_BM_ACK_BUF_SIZE, + __i2400mu_bm_notif_cb, completion, + epd->bInterval); + return usb_submit_urb(urb, GFP_KERNEL); +} + + +/* + * Read an ack from the notification endpoint + * + * @i2400m: + * @_ack: pointer to where to store the read data + * @ack_size: how many bytes we should read + * + * Returns: < 0 errno code on error; otherwise, amount of received bytes. + * + * Submits a notification read, appends the read data to the given ack + * buffer and then repeats (until @ack_size bytes have been + * received). + */ +ssize_t i2400mu_bus_bm_wait_for_ack(struct i2400m *i2400m, + struct i2400m_bootrom_header *_ack, + size_t ack_size) +{ + ssize_t result = -ENOMEM; + struct device *dev = i2400m_dev(i2400m); + struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); + struct urb notif_urb; + void *ack = _ack; + size_t offset, len; + long val; + int do_autopm = 1; + DECLARE_COMPLETION_ONSTACK(notif_completion); + + d_fnstart(8, dev, "(i2400m %p ack %p size %zu)\n", + i2400m, ack, ack_size); + BUG_ON(_ack == i2400m->bm_ack_buf); + result = usb_autopm_get_interface(i2400mu->usb_iface); + if (result < 0) { + dev_err(dev, "BM-ACK: can't get autopm: %d\n", (int) result); + do_autopm = 0; + } + usb_init_urb(¬if_urb); /* ready notifications */ + usb_get_urb(¬if_urb); + offset = 0; + while (offset < ack_size) { + init_completion(¬if_completion); + result = i2400mu_notif_submit(i2400mu, ¬if_urb, + ¬if_completion); + if (result < 0) + goto error_notif_urb_submit; + val = wait_for_completion_interruptible_timeout( + ¬if_completion, HZ); + if (val == 0) { + result = -ETIMEDOUT; + usb_kill_urb(¬if_urb); /* Timedout */ + goto error_notif_wait; + } + if (val == -ERESTARTSYS) { + result = -EINTR; /* Interrupted */ + usb_kill_urb(¬if_urb); + goto error_notif_wait; + } + result = notif_urb.status; /* How was the ack? */ + switch (result) { + case 0: + break; + case -EINVAL: /* while removing driver */ + case -ENODEV: /* dev disconnect ... */ + case -ENOENT: /* just ignore it */ + case -ESHUTDOWN: /* and exit */ + case -ECONNRESET: + result = -ESHUTDOWN; + goto error_dev_gone; + default: /* any other? 
*/ + usb_kill_urb(¬if_urb); /* Timedout */ + if (edc_inc(&i2400mu->urb_edc, + EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) + goto error_exceeded; + dev_err(dev, "BM-ACK: URB error %d, " + "retrying\n", notif_urb.status); + continue; /* retry */ + } + if (notif_urb.actual_length == 0) { + d_printf(6, dev, "ZLP received, retrying\n"); + continue; + } + /* Got data, append it to the buffer */ + len = min(ack_size - offset, (size_t) notif_urb.actual_length); + memcpy(ack + offset, i2400m->bm_ack_buf, len); + offset += len; + } + result = offset; +error_notif_urb_submit: +error_notif_wait: +error_dev_gone: +out: + if (do_autopm) + usb_autopm_put_interface(i2400mu->usb_iface); + d_fnend(8, dev, "(i2400m %p ack %p size %zu) = %ld\n", + i2400m, ack, ack_size, (long) result); + usb_put_urb(¬if_urb); + return result; + +error_exceeded: + dev_err(dev, "bm: maximum errors in notification URB exceeded; " + "resetting device\n"); + usb_queue_reset_device(i2400mu->usb_iface); + goto out; +} diff --git a/drivers/staging/wimax/i2400m/usb-notif.c b/drivers/staging/wimax/i2400m/usb-notif.c new file mode 100644 index 000000000000..5d429f816125 --- /dev/null +++ b/drivers/staging/wimax/i2400m/usb-notif.c @@ -0,0 +1,258 @@ +/* + * Intel Wireless WiMAX Connection 2400m over USB + * Notification handling + * + * + * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * Intel Corporation + * Yanir Lubetkin + * Inaky Perez-Gonzalez + * - Initial implementation + * + * + * The notification endpoint is active when the device is not in boot + * mode; in here we just read and get notifications; based on those, + * we act to either reinitialize the device after a reboot or to + * submit a RX request. 
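+ *
+ * In rough pseudo-code, the dispatch implemented below boils down to
+ * this (a sketch; the helpers live in the generic i2400m code):
+ *
+ *   if (payload is all zeroes)
+ *           i2400mu_rx_kick(i2400mu);		/* RX data pending */
+ *   else if (payload is a known boot barker)
+ *           i2400m_dev_reset_handle(i2400m, ...);	/* rebooted */
+ *   else
+ *           complain (unknown barker)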
+ *
+ * ROADMAP
+ *
+ * i2400mu_notification_setup()
+ *
+ * i2400mu_notification_release()
+ *
+ * i2400mu_notification_cb()	Called when a URB is ready
+ *   i2400mu_notification_grok()
+ *     i2400m_is_boot_barker()
+ *     i2400m_dev_reset_handle()
+ *     i2400mu_rx_kick()
+ */
+#include
+#include
+#include "i2400m-usb.h"
+
+
+#define D_SUBMODULE notif
+#include "usb-debug-levels.h"
+
+
+static const
+__le32 i2400m_ZERO_BARKER[4] = { 0, 0, 0, 0 };
+
+
+/*
+ * Process a received notification
+ *
+ * In normal operation mode, we can only receive two types of payloads
+ * on the notification endpoint:
+ *
+ *   - a reboot barker, in which case we do a bootstrap (the device
+ *     has been reset).
+ *
+ *   - a block of zeroes: there is pending data in the IN endpoint
+ */
+static
+int i2400mu_notification_grok(struct i2400mu *i2400mu, const void *buf,
+			      size_t buf_len)
+{
+	int ret;
+	struct device *dev = &i2400mu->usb_iface->dev;
+	struct i2400m *i2400m = &i2400mu->i2400m;
+
+	d_fnstart(4, dev, "(i2400m %p buf %p buf_len %zu)\n",
+		  i2400mu, buf, buf_len);
+	ret = -EIO;
+	if (buf_len < sizeof(i2400m_ZERO_BARKER))
+		/* Not a bug, just ignore */
+		goto error_bad_size;
+	ret = 0;
+	if (!memcmp(i2400m_ZERO_BARKER, buf, sizeof(i2400m_ZERO_BARKER))) {
+		i2400mu_rx_kick(i2400mu);
+		goto out;
+	}
+	ret = i2400m_is_boot_barker(i2400m, buf, buf_len);
+	if (unlikely(ret >= 0))
+		ret = i2400m_dev_reset_handle(i2400m, "device rebooted");
+	else	/* Unknown or unexpected data in the notif message */
+		i2400m_unknown_barker(i2400m, buf, buf_len);
+error_bad_size:
+out:
+	d_fnend(4, dev, "(i2400m %p buf %p buf_len %zu) = %d\n",
+		i2400mu, buf, buf_len, ret);
+	return ret;
+}
+
+
+/*
+ * URB callback for the notification endpoint
+ *
+ * @urb: the urb received from the notification endpoint
+ *
+ * This function just processes the USB side of the transaction,
+ * checks that everything is fine, passes the processing to
+ * i2400mu_notification_grok() and resubmits the URB.
+ */
+static
+void i2400mu_notification_cb(struct urb *urb)
+{
+	int ret;
+	struct i2400mu *i2400mu = urb->context;
+	struct device *dev = &i2400mu->usb_iface->dev;
+
+	d_fnstart(4, dev, "(urb %p status %d actual_length %d)\n",
+		  urb, urb->status, urb->actual_length);
+	ret = urb->status;
+	switch (ret) {
+	case 0:
+		ret = i2400mu_notification_grok(i2400mu, urb->transfer_buffer,
+						urb->actual_length);
+		if (ret == -EIO && edc_inc(&i2400mu->urb_edc, EDC_MAX_ERRORS,
+					   EDC_ERROR_TIMEFRAME))
+			goto error_exceeded;
+		if (ret == -ENOMEM)	/* uff...power cycle? shutdown? */
+			goto error_exceeded;
+		break;
+	case -EINVAL:			/* while removing driver */
+	case -ENODEV:			/* dev disconnect ... */
+	case -ENOENT:			/* ditto */
+	case -ESHUTDOWN:		/* URB killed */
+	case -ECONNRESET:		/* disconnection */
+		goto out;		/* Notify around */
+	default:			/* Some error? */
+		if (edc_inc(&i2400mu->urb_edc,
+			    EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME))
+			goto error_exceeded;
+		dev_err(dev, "notification: URB error %d, retrying\n",
+			urb->status);
+	}
+	usb_mark_last_busy(i2400mu->usb_dev);
+	ret = usb_submit_urb(i2400mu->notif_urb, GFP_ATOMIC);
+	switch (ret) {
+	case 0:
+	case -EINVAL:			/* while removing driver */
+	case -ENODEV:			/* dev disconnect ... */
+	case -ENOENT:			/* ditto */
+	case -ESHUTDOWN:		/* URB killed */
+	case -ECONNRESET:		/* disconnection */
+		break;			/* just ignore */
+	default:			/* Some error?
*/ + dev_err(dev, "notification: cannot submit URB: %d\n", ret); + goto error_submit; + } + d_fnend(4, dev, "(urb %p status %d actual_length %d) = void\n", + urb, urb->status, urb->actual_length); + return; + +error_exceeded: + dev_err(dev, "maximum errors in notification URB exceeded; " + "resetting device\n"); +error_submit: + usb_queue_reset_device(i2400mu->usb_iface); +out: + d_fnend(4, dev, "(urb %p status %d actual_length %d) = void\n", + urb, urb->status, urb->actual_length); +} + + +/* + * setup the notification endpoint + * + * @i2400m: device descriptor + * + * This procedure prepares the notification urb and handler for receiving + * unsolicited barkers from the device. + */ +int i2400mu_notification_setup(struct i2400mu *i2400mu) +{ + struct device *dev = &i2400mu->usb_iface->dev; + int usb_pipe, ret = 0; + struct usb_endpoint_descriptor *epd; + char *buf; + + d_fnstart(4, dev, "(i2400m %p)\n", i2400mu); + buf = kmalloc(I2400MU_MAX_NOTIFICATION_LEN, GFP_KERNEL | GFP_DMA); + if (buf == NULL) { + ret = -ENOMEM; + goto error_buf_alloc; + } + + i2400mu->notif_urb = usb_alloc_urb(0, GFP_KERNEL); + if (!i2400mu->notif_urb) { + ret = -ENOMEM; + goto error_alloc_urb; + } + epd = usb_get_epd(i2400mu->usb_iface, + i2400mu->endpoint_cfg.notification); + usb_pipe = usb_rcvintpipe(i2400mu->usb_dev, epd->bEndpointAddress); + usb_fill_int_urb(i2400mu->notif_urb, i2400mu->usb_dev, usb_pipe, + buf, I2400MU_MAX_NOTIFICATION_LEN, + i2400mu_notification_cb, i2400mu, epd->bInterval); + ret = usb_submit_urb(i2400mu->notif_urb, GFP_KERNEL); + if (ret != 0) { + dev_err(dev, "notification: cannot submit URB: %d\n", ret); + goto error_submit; + } + d_fnend(4, dev, "(i2400m %p) = %d\n", i2400mu, ret); + return ret; + +error_submit: + usb_free_urb(i2400mu->notif_urb); +error_alloc_urb: + kfree(buf); +error_buf_alloc: + d_fnend(4, dev, "(i2400m %p) = %d\n", i2400mu, ret); + return ret; +} + + +/* + * Tear down of the notification mechanism + * + * @i2400m: device descriptor + * + * Kill the interrupt endpoint urb, free any allocated resources. + * + * We need to check if we have done it before as for example, + * _suspend() call this; if after a suspend() we get a _disconnect() + * (as the case is when hibernating), nothing bad happens. + */ +void i2400mu_notification_release(struct i2400mu *i2400mu) +{ + struct device *dev = &i2400mu->usb_iface->dev; + + d_fnstart(4, dev, "(i2400mu %p)\n", i2400mu); + if (i2400mu->notif_urb != NULL) { + usb_kill_urb(i2400mu->notif_urb); + kfree(i2400mu->notif_urb->transfer_buffer); + usb_free_urb(i2400mu->notif_urb); + i2400mu->notif_urb = NULL; + } + d_fnend(4, dev, "(i2400mu %p)\n", i2400mu); +} diff --git a/drivers/staging/wimax/i2400m/usb-rx.c b/drivers/staging/wimax/i2400m/usb-rx.c new file mode 100644 index 000000000000..5b64bda7d9e7 --- /dev/null +++ b/drivers/staging/wimax/i2400m/usb-rx.c @@ -0,0 +1,462 @@ +/* + * Intel Wireless WiMAX Connection 2400m + * USB RX handling + * + * + * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * Intel Corporation
+ * Yanir Lubetkin
+ *  - Initial implementation
+ * Inaky Perez-Gonzalez
+ *  - Use skb_clone(), break up processing in chunks
+ *  - Split transport/device specific
+ *  - Make buffer size dynamic to exert less memory pressure
+ *
+ *
+ * This handles the RX path on USB.
+ *
+ * When a notification is received that says 'there is RX data ready',
+ * we call i2400mu_rx_kick(); that wakes up the RX kthread, which
+ * reads a buffer from USB and passes it to i2400m_rx() in the generic
+ * handling code. The RX buffer has a specific format that is
+ * described in rx.c.
+ *
+ * We use a kernel thread in a loop because:
+ *
+ *   - we want to be able to call the USB power management get/put
+ *     functions (blocking) before each transaction.
+ *
+ *   - We might get a lot of notifications and we don't want to submit
+ *     a zillion reads; by serializing, we are throttling.
+ *
+ *   - RX data processing can get heavy enough so that it is not
+ *     appropriate for doing it in the USB callback; thus we run it in a
+ *     process context.
+ *
+ * We provide a read buffer of an arbitrary size (short of a page); if
+ * the callback reports -EOVERFLOW, it means it was too small, so we
+ * just double the size and retry (being careful to append, as
+ * sometimes the device provided some data). Every now and then we
+ * check if the average packet size is smaller than half the current
+ * buffer size and if so, we halve it. At the end, the size of the
+ * preallocated buffer should track the average received transaction
+ * size, adapting to it dynamically.
+ *
+ * ROADMAP
+ *
+ * i2400mu_rx_kick()		Called from notif.c when we get a
+ *				'data ready' notification
+ * i2400mu_rxd()		Kernel RX daemon
+ *   i2400mu_rx()		Receive USB data
+ *   i2400m_rx()		Send data to generic i2400m RX handling
+ *
+ * i2400mu_rx_setup()		called from i2400mu_bus_dev_start()
+ *
+ * i2400mu_rx_release()		called from i2400mu_bus_dev_stop()
+ */
+#include
+#include
+#include
+#include "i2400m-usb.h"
+
+
+#define D_SUBMODULE rx
+#include "usb-debug-levels.h"
+
+/*
+ * Dynamic RX size
+ *
+ * We can't let the rx_size be a multiple of 512 bytes (the RX
+ * endpoint's max packet size). On some USB host controllers (we
+ * haven't been able to fully characterize which), if the device is
+ * about to send (for example) X bytes and we only post a buffer to
+ * receive n*512, it will fail to mark that as babble (so that
+ * i2400mu_rx() [case -EOVERFLOW] can resize the buffer and get the
+ * rest).
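+ *
+ * A worked example (numbers are illustrative): with a 2048 byte
+ * buffer (2048 == 4 * 512) and a device trying to send 2050 bytes,
+ * such a host controller never reports the overflow, so the driver
+ * would never learn it has to grow the buffer. Posting 2040 bytes
+ * (2048 - 8) instead guarantees the transfer either fits or fails
+ * with -EOVERFLOW, which the resizing logic can then act on.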
+ *
+ * So on growing or shrinking, if it is a multiple of the
+ * maxpacketsize, we remove some (instead of increasing some, so in a
+ * buddy allocator we try to waste less space).
+ *
+ * Note we also need a hook for this on i2400mu_rx() -- when we do the
+ * first read, we are sure we won't hit this spot because
+ * i2400mu->rx_size has been set properly. However, if we have to
+ * double because of -EOVERFLOW, when we launch the read to get the
+ * rest of the data, we *have* to make sure that also is not a
+ * multiple of the max_pkt_size.
+ */
+
+static
+size_t i2400mu_rx_size_grow(struct i2400mu *i2400mu)
+{
+	struct device *dev = &i2400mu->usb_iface->dev;
+	size_t rx_size;
+	const size_t max_pkt_size = 512;
+
+	rx_size = 2 * i2400mu->rx_size;
+	if (rx_size % max_pkt_size == 0) {
+		rx_size -= 8;
+		d_printf(1, dev,
+			 "RX: expected size grew to %zu [adjusted -8] "
+			 "from %zu\n",
+			 rx_size, i2400mu->rx_size);
+	} else
+		d_printf(1, dev,
+			 "RX: expected size grew to %zu from %zu\n",
+			 rx_size, i2400mu->rx_size);
+	return rx_size;
+}
+
+
+static
+void i2400mu_rx_size_maybe_shrink(struct i2400mu *i2400mu)
+{
+	const size_t max_pkt_size = 512;
+	struct device *dev = &i2400mu->usb_iface->dev;
+
+	if (unlikely(i2400mu->rx_size_cnt >= 100
+		     && i2400mu->rx_size_auto_shrink)) {
+		size_t avg_rx_size =
+			i2400mu->rx_size_acc / i2400mu->rx_size_cnt;
+		size_t new_rx_size = i2400mu->rx_size / 2;
+		if (avg_rx_size < new_rx_size) {
+			if (new_rx_size % max_pkt_size == 0) {
+				new_rx_size -= 8;
+				d_printf(1, dev,
+					 "RX: expected size shrank to %zu "
+					 "[adjusted -8] from %zu\n",
+					 new_rx_size, i2400mu->rx_size);
+			} else
+				d_printf(1, dev,
+					 "RX: expected size shrank to %zu "
+					 "from %zu\n",
+					 new_rx_size, i2400mu->rx_size);
+			i2400mu->rx_size = new_rx_size;
+			i2400mu->rx_size_cnt = 0;
+			i2400mu->rx_size_acc = i2400mu->rx_size;
+		}
+	}
+}
+
+/*
+ * Receive a message with payloads from the USB bus into an skb
+ *
+ * @i2400mu: USB device descriptor
+ * @rx_skb: skb where to place the received message
+ *
+ * Deals with all the USB-specifics of receiving, dynamically
+ * increasing the buffer size if so needed. Returns the payload in the
+ * skb, ready to process. On a zero-length packet, we retry.
+ *
+ * On soft USB errors, we retry (until they become too frequent and
+ * then are promoted to hard); on hard USB errors, we reset the
+ * device. On other errors (skb reallocation), we just drop it and
+ * hope for the next invocation to solve it.
+ *
+ * Returns: pointer to the skb if ok, ERR_PTR on error.
+ *   NOTE: this function might realloc the skb (if it is too small),
+ *   so always update with the one returned.
+ *   ERR_PTR() is < 0 on error.
+ *   Will return NULL if it cannot reallocate -- this can be
+ *   considered a transient retryable error.
+ */
+static
+struct sk_buff *i2400mu_rx(struct i2400mu *i2400mu, struct sk_buff *rx_skb)
+{
+	int result = 0;
+	struct device *dev = &i2400mu->usb_iface->dev;
+	int usb_pipe, read_size, rx_size, do_autopm;
+	struct usb_endpoint_descriptor *epd;
+	const size_t max_pkt_size = 512;
+
+	d_fnstart(4, dev, "(i2400mu %p)\n", i2400mu);
+	do_autopm = atomic_read(&i2400mu->do_autopm);
+	result = do_autopm ?
+		usb_autopm_get_interface(i2400mu->usb_iface) : 0;
+	if (result < 0) {
+		dev_err(dev, "RX: can't get autopm: %d\n", result);
+		do_autopm = 0;
+	}
+	epd = usb_get_epd(i2400mu->usb_iface, i2400mu->endpoint_cfg.bulk_in);
+	usb_pipe = usb_rcvbulkpipe(i2400mu->usb_dev, epd->bEndpointAddress);
+retry:
+	rx_size = skb_end_pointer(rx_skb) - rx_skb->data - rx_skb->len;
+	if (unlikely(rx_size % max_pkt_size == 0)) {
+		rx_size -= 8;
+		d_printf(1, dev, "RX: rx_size adapted to %d [-8]\n", rx_size);
+	}
+	result = usb_bulk_msg(
+		i2400mu->usb_dev, usb_pipe, rx_skb->data + rx_skb->len,
+		rx_size, &read_size, 200);
+	usb_mark_last_busy(i2400mu->usb_dev);
+	switch (result) {
+	case 0:
+		if (read_size == 0)
+			goto retry;	/* ZLP, just resubmit */
+		skb_put(rx_skb, read_size);
+		break;
+	case -EPIPE:
+		/*
+		 * Stall -- maybe the device is choking with our
+		 * requests. Clear it and give it some time. If they
+		 * happen too often, it might be another symptom, so we
+		 * reset.
+		 *
+		 * No error handling for usb_clear_halt(); if it
+		 * works, the retry works; if it fails, this switch
+		 * does the error handling for us.
+		 */
+		if (edc_inc(&i2400mu->urb_edc,
+			    10 * EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) {
+			dev_err(dev, "RX: too many stalls in "
+				"URB; resetting device\n");
+			goto do_reset;
+		}
+		usb_clear_halt(i2400mu->usb_dev, usb_pipe);
+		msleep(10);	/* give the device some time */
+		goto retry;
+	case -EINVAL:			/* while removing driver */
+	case -ENODEV:			/* dev disconnect ... */
+	case -ENOENT:			/* just ignore it */
+	case -ESHUTDOWN:
+	case -ECONNRESET:
+		break;
+	case -EOVERFLOW: {		/* too small, reallocate */
+		struct sk_buff *new_skb;
+		rx_size = i2400mu_rx_size_grow(i2400mu);
+		if (rx_size <= (1 << 16))	/* cap it */
+			i2400mu->rx_size = rx_size;
+		else if (printk_ratelimit()) {
+			dev_err(dev, "BUG? rx_size up to %d\n", rx_size);
+			result = -EINVAL;
+			goto out;
+		}
+		skb_put(rx_skb, read_size);
+		new_skb = skb_copy_expand(rx_skb, 0, rx_size - rx_skb->len,
+					  GFP_KERNEL);
+		if (new_skb == NULL) {
+			kfree_skb(rx_skb);
+			rx_skb = NULL;
+			goto out;	/* drop it...*/
+		}
+		kfree_skb(rx_skb);
+		rx_skb = new_skb;
+		i2400mu->rx_size_cnt = 0;
+		i2400mu->rx_size_acc = i2400mu->rx_size;
+		d_printf(1, dev, "RX: size changed to %d, received %d, "
+			 "copied %d, capacity %ld\n",
+			 rx_size, read_size, rx_skb->len,
+			 (long) skb_end_offset(new_skb));
+		goto retry;
+	}
+		/* In most cases, it happens due to the hardware scheduling a
+		 * read when there was no data - unfortunately, we have no way
+		 * to tell this timeout from a USB timeout. So we just ignore
+		 * it. */
+	case -ETIMEDOUT:
+		dev_err(dev, "RX: timeout: %d\n", result);
+		result = 0;
+		break;
+	default:			/* Any error */
+		if (edc_inc(&i2400mu->urb_edc,
+			    EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME))
+			goto error_reset;
+		dev_err(dev, "RX: error receiving URB: %d, retrying\n", result);
+		goto retry;
+	}
+out:
+	if (do_autopm)
+		usb_autopm_put_interface(i2400mu->usb_iface);
+	d_fnend(4, dev, "(i2400mu %p) = %p\n", i2400mu, rx_skb);
+	return rx_skb;
+
+error_reset:
+	dev_err(dev, "RX: maximum errors in URB exceeded; "
+		"resetting device\n");
+do_reset:
+	usb_queue_reset_device(i2400mu->usb_iface);
+	rx_skb = ERR_PTR(result);
+	goto out;
+}
+
+
+/*
+ * Kernel thread for USB reception of data
+ *
+ * This thread waits for a kick; once kicked, it will allocate an skb
+ * and receive a single message into it from USB (using
+ * i2400mu_rx()). Once received, it is passed to the generic i2400m RX
+ * code for processing.
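+ *
+ * Note i2400mu_rx() may hand back a different skb than the one it
+ * was given (it reallocates on -EOVERFLOW), which is why the loop
+ * below always reassigns rx_skb from its return value instead of
+ * reusing the old pointer.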
+ *
+ * When done processing, it runs some dirty statistics to verify if
+ * the last 100 messages received were smaller than half of the
+ * current RX buffer size. In that case, the RX buffer size is
+ * halved. This helps lower the pressure on the memory allocator.
+ *
+ * Hard errors force the thread to exit.
+ */
+static
+int i2400mu_rxd(void *_i2400mu)
+{
+	int result = 0;
+	struct i2400mu *i2400mu = _i2400mu;
+	struct i2400m *i2400m = &i2400mu->i2400m;
+	struct device *dev = &i2400mu->usb_iface->dev;
+	struct net_device *net_dev = i2400m->wimax_dev.net_dev;
+	size_t pending;
+	int rx_size;
+	struct sk_buff *rx_skb;
+	unsigned long flags;
+
+	d_fnstart(4, dev, "(i2400mu %p)\n", i2400mu);
+	spin_lock_irqsave(&i2400m->rx_lock, flags);
+	BUG_ON(i2400mu->rx_kthread != NULL);
+	i2400mu->rx_kthread = current;
+	spin_unlock_irqrestore(&i2400m->rx_lock, flags);
+	while (1) {
+		d_printf(2, dev, "RX: waiting for messages\n");
+		pending = 0;
+		wait_event_interruptible(
+			i2400mu->rx_wq,
+			(kthread_should_stop()	/* check this first! */
+			 || (pending = atomic_read(&i2400mu->rx_pending_count)))
+			);
+		if (kthread_should_stop())
+			break;
+		if (pending == 0)
+			continue;
+		rx_size = i2400mu->rx_size;
+		d_printf(2, dev, "RX: reading up to %d bytes\n", rx_size);
+		rx_skb = __netdev_alloc_skb(net_dev, rx_size, GFP_KERNEL);
+		if (rx_skb == NULL) {
+			dev_err(dev, "RX: can't allocate skb [%d bytes]\n",
+				rx_size);
+			msleep(50);	/* give it some time? */
+			continue;
+		}
+
+		/* Receive the message with the payloads */
+		rx_skb = i2400mu_rx(i2400mu, rx_skb);
+		result = PTR_ERR(rx_skb);
+		if (IS_ERR(rx_skb))
+			goto out;
+		atomic_dec(&i2400mu->rx_pending_count);
+		if (rx_skb == NULL || rx_skb->len == 0) {
+			/* some "ignorable" condition */
+			kfree_skb(rx_skb);
+			continue;
+		}
+
+		/* Deliver the message to the generic i2400m code */
+		i2400mu->rx_size_cnt++;
+		i2400mu->rx_size_acc += rx_skb->len;
+		result = i2400m_rx(i2400m, rx_skb);
+		if (result == -EIO
+		    && edc_inc(&i2400mu->urb_edc,
+			       EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) {
+			goto error_reset;
+		}
+
+		/* Maybe adjust RX buffer size */
+		i2400mu_rx_size_maybe_shrink(i2400mu);
+	}
+	result = 0;
+out:
+	spin_lock_irqsave(&i2400m->rx_lock, flags);
+	i2400mu->rx_kthread = NULL;
+	spin_unlock_irqrestore(&i2400m->rx_lock, flags);
+	d_fnend(4, dev, "(i2400mu %p) = %d\n", i2400mu, result);
+	return result;
+
+error_reset:
+	dev_err(dev, "RX: maximum errors in received buffer exceeded; "
+		"resetting device\n");
+	usb_queue_reset_device(i2400mu->usb_iface);
+	goto out;
+}
+
+
+/*
+ * Start reading from the device
+ *
+ * @i2400m: device instance
+ *
+ * Notify the RX thread that there is data pending.
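+ *
+ * (The pending count is bumped before the wakeup, so the
+ * wait_event_interruptible() condition in i2400mu_rxd() -- which
+ * re-reads rx_pending_count -- cannot miss a kick that races with
+ * the thread going to sleep.)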
+ */ +void i2400mu_rx_kick(struct i2400mu *i2400mu) +{ + struct i2400m *i2400m = &i2400mu->i2400m; + struct device *dev = &i2400mu->usb_iface->dev; + + d_fnstart(3, dev, "(i2400mu %p)\n", i2400m); + atomic_inc(&i2400mu->rx_pending_count); + wake_up_all(&i2400mu->rx_wq); + d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); +} + + +int i2400mu_rx_setup(struct i2400mu *i2400mu) +{ + int result = 0; + struct i2400m *i2400m = &i2400mu->i2400m; + struct device *dev = &i2400mu->usb_iface->dev; + struct wimax_dev *wimax_dev = &i2400m->wimax_dev; + struct task_struct *kthread; + + kthread = kthread_run(i2400mu_rxd, i2400mu, "%s-rx", + wimax_dev->name); + /* the kthread function sets i2400mu->rx_thread */ + if (IS_ERR(kthread)) { + result = PTR_ERR(kthread); + dev_err(dev, "RX: cannot start thread: %d\n", result); + } + return result; +} + + +void i2400mu_rx_release(struct i2400mu *i2400mu) +{ + unsigned long flags; + struct i2400m *i2400m = &i2400mu->i2400m; + struct device *dev = i2400m_dev(i2400m); + struct task_struct *kthread; + + spin_lock_irqsave(&i2400m->rx_lock, flags); + kthread = i2400mu->rx_kthread; + i2400mu->rx_kthread = NULL; + spin_unlock_irqrestore(&i2400m->rx_lock, flags); + if (kthread) + kthread_stop(kthread); + else + d_printf(1, dev, "RX: kthread had already exited\n"); +} + diff --git a/drivers/staging/wimax/i2400m/usb-tx.c b/drivers/staging/wimax/i2400m/usb-tx.c new file mode 100644 index 000000000000..3ba9d70cca1b --- /dev/null +++ b/drivers/staging/wimax/i2400m/usb-tx.c @@ -0,0 +1,273 @@ +/* + * Intel Wireless WiMAX Connection 2400m + * USB specific TX handling + * + * + * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * Intel Corporation + * Yanir Lubetkin + * - Initial implementation + * Inaky Perez-Gonzalez + * - Split transport/device specific + * + * + * Takes the TX messages in the i2400m's driver TX FIFO and sends them + * to the device until there are no more. + * + * If we fail sending the message, we just drop it. 
There isn't much
+ * we can do at this point. We could also retry, but the USB stack has
+ * already retried and still failed, so there is not much of a
+ * point. As well, most of the traffic is network, which has recovery
+ * methods for dropped packets.
+ *
+ * For sending we just obtain a FIFO buffer to send, send it to the
+ * USB bulk out, tell the TX FIFO code we have sent it; query for
+ * another one, etc... until done.
+ *
+ * We use a thread so we can call usb_autopm_get_interface() and
+ * usb_autopm_put_interface() for each transaction; this way when the
+ * device goes idle, it will suspend. It also has less overhead than a
+ * dedicated workqueue, as it is being used for a single task.
+ *
+ * ROADMAP
+ *
+ * i2400mu_tx_setup()
+ * i2400mu_tx_release()
+ *
+ * i2400mu_bus_tx_kick()	- Called by the tx.c code when there
+ *				  is new data in the FIFO.
+ * i2400mu_txd()
+ *   i2400m_tx_msg_get()
+ *   i2400m_tx_msg_sent()
+ */
+#include "i2400m-usb.h"
+
+
+#define D_SUBMODULE tx
+#include "usb-debug-levels.h"
+
+
+/*
+ * Get the next TX message in the TX FIFO and send it to the device
+ *
+ * Note that any iteration consumes a message to be sent, no matter if
+ * it succeeds or fails (we have no real way to retry or complain).
+ *
+ * Return: 0 if ok, < 0 errno code on hard error.
+ */
+static
+int i2400mu_tx(struct i2400mu *i2400mu, struct i2400m_msg_hdr *tx_msg,
+	       size_t tx_msg_size)
+{
+	int result = 0;
+	struct i2400m *i2400m = &i2400mu->i2400m;
+	struct device *dev = &i2400mu->usb_iface->dev;
+	int usb_pipe, sent_size, do_autopm;
+	struct usb_endpoint_descriptor *epd;
+
+	d_fnstart(4, dev, "(i2400mu %p)\n", i2400mu);
+	do_autopm = atomic_read(&i2400mu->do_autopm);
+	result = do_autopm ?
+		usb_autopm_get_interface(i2400mu->usb_iface) : 0;
+	if (result < 0) {
+		dev_err(dev, "TX: can't get autopm: %d\n", result);
+		do_autopm = 0;
+	}
+	epd = usb_get_epd(i2400mu->usb_iface, i2400mu->endpoint_cfg.bulk_out);
+	usb_pipe = usb_sndbulkpipe(i2400mu->usb_dev, epd->bEndpointAddress);
+retry:
+	result = usb_bulk_msg(i2400mu->usb_dev, usb_pipe,
+			      tx_msg, tx_msg_size, &sent_size, 200);
+	usb_mark_last_busy(i2400mu->usb_dev);
+	switch (result) {
+	case 0:
+		if (sent_size != tx_msg_size) {	/* Too short? drop it */
+			dev_err(dev, "TX: short write (%d B vs %zu "
+				"expected)\n", sent_size, tx_msg_size);
+			result = -EIO;
+		}
+		break;
+	case -EPIPE:
+		/*
+		 * Stall -- maybe the device is choking with our
+		 * requests. Clear it and give it some time. If they
+		 * happen too often, it might be another symptom, so we
+		 * reset.
+		 *
+		 * No error handling for usb_clear_halt(); if it
+		 * works, the retry works; if it fails, this switch
+		 * does the error handling for us.
+		 */
+		if (edc_inc(&i2400mu->urb_edc,
+			    10 * EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) {
+			dev_err(dev, "TX: too many stalls in "
+				"URB; resetting device\n");
+			usb_queue_reset_device(i2400mu->usb_iface);
+		} else {
+			usb_clear_halt(i2400mu->usb_dev, usb_pipe);
+			msleep(10);	/* give the device some time */
+			goto retry;
+		}
+		fallthrough;
+	case -EINVAL:			/* while removing driver */
+	case -ENODEV:			/* dev disconnect ... */
+	case -ENOENT:			/* just ignore it */
+	case -ESHUTDOWN:		/* and exit */
+	case -ECONNRESET:
+		result = -ESHUTDOWN;
+		break;
+	default:			/* Some error? */
+		if (edc_inc(&i2400mu->urb_edc,
+			    EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) {
+			dev_err(dev, "TX: maximum errors in URB "
+				"exceeded; resetting device\n");
+			usb_queue_reset_device(i2400mu->usb_iface);
+		} else {
+			dev_err(dev, "TX: cannot send URB; retrying.
" + "tx_msg @%zu %zu B [%d sent]: %d\n", + (void *) tx_msg - i2400m->tx_buf, + tx_msg_size, sent_size, result); + goto retry; + } + } + if (do_autopm) + usb_autopm_put_interface(i2400mu->usb_iface); + d_fnend(4, dev, "(i2400mu %p) = result\n", i2400mu); + return result; +} + + +/* + * Get the next TX message in the TX FIFO and send it to the device + * + * Note we exit the loop if i2400mu_tx() fails; that function only + * fails on hard error (failing to tx a buffer not being one of them, + * see its doc). + * + * Return: 0 + */ +static +int i2400mu_txd(void *_i2400mu) +{ + struct i2400mu *i2400mu = _i2400mu; + struct i2400m *i2400m = &i2400mu->i2400m; + struct device *dev = &i2400mu->usb_iface->dev; + struct i2400m_msg_hdr *tx_msg; + size_t tx_msg_size; + unsigned long flags; + + d_fnstart(4, dev, "(i2400mu %p)\n", i2400mu); + + spin_lock_irqsave(&i2400m->tx_lock, flags); + BUG_ON(i2400mu->tx_kthread != NULL); + i2400mu->tx_kthread = current; + spin_unlock_irqrestore(&i2400m->tx_lock, flags); + + while (1) { + d_printf(2, dev, "TX: waiting for messages\n"); + tx_msg = NULL; + wait_event_interruptible( + i2400mu->tx_wq, + (kthread_should_stop() /* check this first! */ + || (tx_msg = i2400m_tx_msg_get(i2400m, &tx_msg_size))) + ); + if (kthread_should_stop()) + break; + WARN_ON(tx_msg == NULL); /* should not happen...*/ + d_printf(2, dev, "TX: submitting %zu bytes\n", tx_msg_size); + d_dump(5, dev, tx_msg, tx_msg_size); + /* Yeah, we ignore errors ... not much we can do */ + i2400mu_tx(i2400mu, tx_msg, tx_msg_size); + i2400m_tx_msg_sent(i2400m); /* ack it, advance the FIFO */ + } + + spin_lock_irqsave(&i2400m->tx_lock, flags); + i2400mu->tx_kthread = NULL; + spin_unlock_irqrestore(&i2400m->tx_lock, flags); + + d_fnend(4, dev, "(i2400mu %p)\n", i2400mu); + return 0; +} + + +/* + * i2400m TX engine notifies us that there is data in the FIFO ready + * for TX + * + * If there is a URB in flight, don't do anything; when it finishes, + * it will see there is data in the FIFO and send it. Else, just + * submit a write. 
+ */ +void i2400mu_bus_tx_kick(struct i2400m *i2400m) +{ + struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); + struct device *dev = &i2400mu->usb_iface->dev; + + d_fnstart(3, dev, "(i2400m %p) = void\n", i2400m); + wake_up_all(&i2400mu->tx_wq); + d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); +} + + +int i2400mu_tx_setup(struct i2400mu *i2400mu) +{ + int result = 0; + struct i2400m *i2400m = &i2400mu->i2400m; + struct device *dev = &i2400mu->usb_iface->dev; + struct wimax_dev *wimax_dev = &i2400m->wimax_dev; + struct task_struct *kthread; + + kthread = kthread_run(i2400mu_txd, i2400mu, "%s-tx", + wimax_dev->name); + /* the kthread function sets i2400mu->tx_thread */ + if (IS_ERR(kthread)) { + result = PTR_ERR(kthread); + dev_err(dev, "TX: cannot start thread: %d\n", result); + } + return result; +} + +void i2400mu_tx_release(struct i2400mu *i2400mu) +{ + unsigned long flags; + struct i2400m *i2400m = &i2400mu->i2400m; + struct device *dev = i2400m_dev(i2400m); + struct task_struct *kthread; + + spin_lock_irqsave(&i2400m->tx_lock, flags); + kthread = i2400mu->tx_kthread; + i2400mu->tx_kthread = NULL; + spin_unlock_irqrestore(&i2400m->tx_lock, flags); + if (kthread) + kthread_stop(kthread); + else + d_printf(1, dev, "TX: kthread had already exited\n"); +} diff --git a/drivers/staging/wimax/i2400m/usb.c b/drivers/staging/wimax/i2400m/usb.c new file mode 100644 index 000000000000..3b84dd7b5567 --- /dev/null +++ b/drivers/staging/wimax/i2400m/usb.c @@ -0,0 +1,764 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Intel Wireless WiMAX Connection 2400m + * Linux driver model glue for USB device, reset & fw upload + * + * Copyright (C) 2007-2008 Intel Corporation + * Inaky Perez-Gonzalez + * Yanir Lubetkin + * + * See i2400m-usb.h for a general description of this driver. + * + * This file implements driver model glue, and hook ups for the + * generic driver to implement the bus-specific functions (device + * communication setup/tear down, firmware upload and resetting). + * + * ROADMAP + * + * i2400mu_probe() + * alloc_netdev()... + * i2400mu_netdev_setup() + * i2400mu_init() + * i2400m_netdev_setup() + * i2400m_setup()... 
+ * + * i2400mu_disconnect + * i2400m_release() + * free_netdev() + * + * i2400mu_suspend() + * i2400m_cmd_enter_powersave() + * i2400mu_notification_release() + * + * i2400mu_resume() + * i2400mu_notification_setup() + * + * i2400mu_bus_dev_start() Called by i2400m_dev_start() [who is + * i2400mu_tx_setup() called by i2400m_setup()] + * i2400mu_rx_setup() + * i2400mu_notification_setup() + * + * i2400mu_bus_dev_stop() Called by i2400m_dev_stop() [who is + * i2400mu_notification_release() called by i2400m_release()] + * i2400mu_rx_release() + * i2400mu_tx_release() + * + * i2400mu_bus_reset() Called by i2400m_reset + * __i2400mu_reset() + * __i2400mu_send_barker() + * usb_reset_device() + */ +#include "i2400m-usb.h" +#include "linux-wimax-i2400m.h" +#include +#include +#include + + +#define D_SUBMODULE usb +#include "usb-debug-levels.h" + +static char i2400mu_debug_params[128]; +module_param_string(debug, i2400mu_debug_params, sizeof(i2400mu_debug_params), + 0644); +MODULE_PARM_DESC(debug, + "String of space-separated NAME:VALUE pairs, where NAMEs " + "are the different debug submodules and VALUE are the " + "initial debug value to set."); + +/* Our firmware file name */ +static const char *i2400mu_bus_fw_names_5x50[] = { +#define I2400MU_FW_FILE_NAME_v1_5 "i2400m-fw-usb-1.5.sbcf" + I2400MU_FW_FILE_NAME_v1_5, +#define I2400MU_FW_FILE_NAME_v1_4 "i2400m-fw-usb-1.4.sbcf" + I2400MU_FW_FILE_NAME_v1_4, + NULL, +}; + + +static const char *i2400mu_bus_fw_names_6050[] = { +#define I6050U_FW_FILE_NAME_v1_5 "i6050-fw-usb-1.5.sbcf" + I6050U_FW_FILE_NAME_v1_5, + NULL, +}; + + +static +int i2400mu_bus_dev_start(struct i2400m *i2400m) +{ + int result; + struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); + struct device *dev = &i2400mu->usb_iface->dev; + + d_fnstart(3, dev, "(i2400m %p)\n", i2400m); + result = i2400mu_tx_setup(i2400mu); + if (result < 0) + goto error_usb_tx_setup; + result = i2400mu_rx_setup(i2400mu); + if (result < 0) + goto error_usb_rx_setup; + result = i2400mu_notification_setup(i2400mu); + if (result < 0) + goto error_notif_setup; + d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); + return result; + +error_notif_setup: + i2400mu_rx_release(i2400mu); +error_usb_rx_setup: + i2400mu_tx_release(i2400mu); +error_usb_tx_setup: + d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); + return result; +} + + +static +void i2400mu_bus_dev_stop(struct i2400m *i2400m) +{ + struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); + struct device *dev = &i2400mu->usb_iface->dev; + + d_fnstart(3, dev, "(i2400m %p)\n", i2400m); + i2400mu_notification_release(i2400mu); + i2400mu_rx_release(i2400mu); + i2400mu_tx_release(i2400mu); + d_fnend(3, dev, "(i2400m %p) = void\n", i2400m); +} + + +/* + * Sends a barker buffer to the device + * + * This helper will allocate a kmalloced buffer and use it to transmit + * (then free it). Reason for this is that other arches cannot use + * stack/vmalloc/text areas for DMA transfers. + * + * Error recovery here is simpler: anything is considered a hard error + * and will move the reset code to use a last-resort bus-based reset. 
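+ *
+ * I.e., something like this would not be allowed (a sketch):
+ *
+ *   __le32 barker[4];	/* on the stack: not DMA-able */
+ *   usb_bulk_msg(usb_dev, pipe, barker, sizeof(barker), &len, 200);
+ *
+ * hence the memcpy() into a kmalloc()ed bounce buffer below.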
+ */
+static
+int __i2400mu_send_barker(struct i2400mu *i2400mu,
+			  const __le32 *barker,
+			  size_t barker_size,
+			  unsigned endpoint)
+{
+	struct usb_endpoint_descriptor *epd = NULL;
+	int pipe, actual_len, ret;
+	struct device *dev = &i2400mu->usb_iface->dev;
+	void *buffer;
+	int do_autopm = 1;
+
+	ret = usb_autopm_get_interface(i2400mu->usb_iface);
+	if (ret < 0) {
+		dev_err(dev, "RESET: can't get autopm: %d\n", ret);
+		do_autopm = 0;
+	}
+	ret = -ENOMEM;
+	buffer = kmalloc(barker_size, GFP_KERNEL);
+	if (buffer == NULL)
+		goto error_kzalloc;
+	epd = usb_get_epd(i2400mu->usb_iface, endpoint);
+	pipe = usb_sndbulkpipe(i2400mu->usb_dev, epd->bEndpointAddress);
+	memcpy(buffer, barker, barker_size);
+retry:
+	ret = usb_bulk_msg(i2400mu->usb_dev, pipe, buffer, barker_size,
+			   &actual_len, 200);
+	switch (ret) {
+	case 0:
+		if (actual_len != barker_size) {	/* Too short? drop it */
+			dev_err(dev, "E: %s: short write (%d B vs %zu "
+				"expected)\n",
+				__func__, actual_len, barker_size);
+			ret = -EIO;
+		}
+		break;
+	case -EPIPE:
+		/*
+		 * Stall -- maybe the device is choking with our
+		 * requests. Clear it and give it some time. If they
+		 * happen too often, it might be another symptom, so we
+		 * reset.
+		 *
+		 * No error handling for usb_clear_halt(); if it
+		 * works, the retry works; if it fails, this switch
+		 * does the error handling for us.
+		 */
+		if (edc_inc(&i2400mu->urb_edc,
+			    10 * EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) {
+			dev_err(dev, "E: %s: too many stalls in "
+				"URB; resetting device\n", __func__);
+			usb_queue_reset_device(i2400mu->usb_iface);
+			/* fallthrough */
+		} else {
+			usb_clear_halt(i2400mu->usb_dev, pipe);
+			msleep(10);	/* give the device some time */
+			goto retry;
+		}
+		fallthrough;
+	case -EINVAL:			/* while removing driver */
+	case -ENODEV:			/* dev disconnect ... */
+	case -ENOENT:			/* just ignore it */
+	case -ESHUTDOWN:		/* and exit */
+	case -ECONNRESET:
+		ret = -ESHUTDOWN;
+		break;
+	default:			/* Some error? */
+		if (edc_inc(&i2400mu->urb_edc,
+			    EDC_MAX_ERRORS, EDC_ERROR_TIMEFRAME)) {
+			dev_err(dev, "E: %s: maximum errors in URB "
+				"exceeded; resetting device\n",
+				__func__);
+			usb_queue_reset_device(i2400mu->usb_iface);
+		} else {
+			dev_warn(dev, "W: %s: cannot send URB: %d\n",
+				 __func__, ret);
+			goto retry;
+		}
+	}
+	kfree(buffer);
+error_kzalloc:
+	if (do_autopm)
+		usb_autopm_put_interface(i2400mu->usb_iface);
+	return ret;
+}
+
+
+/*
+ * Reset a device at different levels (warm, cold or bus)
+ *
+ * @i2400m: device descriptor
+ * @reset_type: soft, warm or bus reset (I2400M_RT_WARM/SOFT/BUS)
+ *
+ * Warm and cold resets get a USB reset if they fail.
+ *
+ * Warm reset:
+ *
+ * The device will be fully reset internally, but won't be
+ * disconnected from the USB bus (so no reenumeration will
+ * happen). Firmware upload will be necessary.
+ *
+ * The device will send a reboot barker in the notification endpoint
+ * that will trigger the driver to reinitialize the state
+ * automatically from notif.c:i2400mu_notification_grok() into
+ * i2400m_dev_bootstrap_delayed().
+ *
+ * Cold and bus (USB) reset:
+ *
+ * The device will be fully reset internally, disconnected from the
+ * USB bus and a reenumeration will happen. Firmware upload will be
+ * necessary. Thus, we don't do any locking or struct
+ * reinitialization, as we are going to be fully disconnected and
+ * reenumerated.
+ *
+ * Note we need to return -ENODEV if a warm reset was requested and we
+ * had to resort to a bus reset. See i2400m_op_reset(), wimax_reset()
+ * and wimax_dev->op_reset.
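+ *
+ * Summarizing, the mapping implemented below is (sketch):
+ *
+ *   I2400M_RT_WARM -> warm boot barker to the bulk-out endpoint
+ *   I2400M_RT_COLD -> cold boot barker to the reset_cold endpoint
+ *   I2400M_RT_BUS  -> usb_reset_device()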
+ * + * WARNING: no driver state saved/fixed + */ +static +int i2400mu_bus_reset(struct i2400m *i2400m, enum i2400m_reset_type rt) +{ + int result; + struct i2400mu *i2400mu = + container_of(i2400m, struct i2400mu, i2400m); + struct device *dev = i2400m_dev(i2400m); + static const __le32 i2400m_WARM_BOOT_BARKER[4] = { + cpu_to_le32(I2400M_WARM_RESET_BARKER), + cpu_to_le32(I2400M_WARM_RESET_BARKER), + cpu_to_le32(I2400M_WARM_RESET_BARKER), + cpu_to_le32(I2400M_WARM_RESET_BARKER), + }; + static const __le32 i2400m_COLD_BOOT_BARKER[4] = { + cpu_to_le32(I2400M_COLD_RESET_BARKER), + cpu_to_le32(I2400M_COLD_RESET_BARKER), + cpu_to_le32(I2400M_COLD_RESET_BARKER), + cpu_to_le32(I2400M_COLD_RESET_BARKER), + }; + + d_fnstart(3, dev, "(i2400m %p rt %u)\n", i2400m, rt); + if (rt == I2400M_RT_WARM) + result = __i2400mu_send_barker( + i2400mu, i2400m_WARM_BOOT_BARKER, + sizeof(i2400m_WARM_BOOT_BARKER), + i2400mu->endpoint_cfg.bulk_out); + else if (rt == I2400M_RT_COLD) + result = __i2400mu_send_barker( + i2400mu, i2400m_COLD_BOOT_BARKER, + sizeof(i2400m_COLD_BOOT_BARKER), + i2400mu->endpoint_cfg.reset_cold); + else if (rt == I2400M_RT_BUS) { + result = usb_reset_device(i2400mu->usb_dev); + switch (result) { + case 0: + case -EINVAL: /* device is gone */ + case -ENODEV: + case -ENOENT: + case -ESHUTDOWN: + result = 0; + break; /* We assume the device is disconnected */ + default: + dev_err(dev, "USB reset failed (%d), giving up!\n", + result); + } + } else { + result = -EINVAL; /* shut gcc up in certain arches */ + BUG(); + } + if (result < 0 + && result != -EINVAL /* device is gone */ + && rt != I2400M_RT_BUS) { + /* + * Things failed -- resort to lower level reset, that + * we queue in another context; the reason for this is + * that the pre and post reset functionality requires + * the i2400m->init_mutex; RT_WARM and RT_COLD can + * come from areas where i2400m->init_mutex is taken. + */ + dev_err(dev, "%s reset failed (%d); trying USB reset\n", + rt == I2400M_RT_WARM ? "warm" : "cold", result); + usb_queue_reset_device(i2400mu->usb_iface); + result = -ENODEV; + } + d_fnend(3, dev, "(i2400m %p rt %u) = %d\n", i2400m, rt, result); + return result; +} + +static void i2400mu_get_drvinfo(struct net_device *net_dev, + struct ethtool_drvinfo *info) +{ + struct i2400m *i2400m = net_dev_to_i2400m(net_dev); + struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); + struct usb_device *udev = i2400mu->usb_dev; + + strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); + strlcpy(info->fw_version, i2400m->fw_name ? 
: "", + sizeof(info->fw_version)); + usb_make_path(udev, info->bus_info, sizeof(info->bus_info)); +} + +static const struct ethtool_ops i2400mu_ethtool_ops = { + .get_drvinfo = i2400mu_get_drvinfo, + .get_link = ethtool_op_get_link, +}; + +static +void i2400mu_netdev_setup(struct net_device *net_dev) +{ + struct i2400m *i2400m = net_dev_to_i2400m(net_dev); + struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m); + i2400mu_init(i2400mu); + i2400m_netdev_setup(net_dev); + net_dev->ethtool_ops = &i2400mu_ethtool_ops; +} + + +/* + * Debug levels control; see debug.h + */ +struct d_level D_LEVEL[] = { + D_SUBMODULE_DEFINE(usb), + D_SUBMODULE_DEFINE(fw), + D_SUBMODULE_DEFINE(notif), + D_SUBMODULE_DEFINE(rx), + D_SUBMODULE_DEFINE(tx), +}; +size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); + +static +void i2400mu_debugfs_add(struct i2400mu *i2400mu) +{ + struct dentry *dentry = i2400mu->i2400m.wimax_dev.debugfs_dentry; + + dentry = debugfs_create_dir("i2400m-usb", dentry); + i2400mu->debugfs_dentry = dentry; + + d_level_register_debugfs("dl_", usb, dentry); + d_level_register_debugfs("dl_", fw, dentry); + d_level_register_debugfs("dl_", notif, dentry); + d_level_register_debugfs("dl_", rx, dentry); + d_level_register_debugfs("dl_", tx, dentry); + + /* Don't touch these if you don't know what you are doing */ + debugfs_create_u8("rx_size_auto_shrink", 0600, dentry, + &i2400mu->rx_size_auto_shrink); + + debugfs_create_size_t("rx_size", 0600, dentry, &i2400mu->rx_size); +} + + +static struct device_type i2400mu_type = { + .name = "wimax", +}; + +/* + * Probe a i2400m interface and register it + * + * @iface: USB interface to link to + * @id: USB class/subclass/protocol id + * @returns: 0 if ok, < 0 errno code on error. + * + * Alloc a net device, initialize the bus-specific details and then + * calls the bus-generic initialization routine. That will register + * the wimax and netdev devices, upload the firmware [using + * _bus_bm_*()], call _bus_dev_start() to finalize the setup of the + * communication with the device and then will start to talk to it to + * finnish setting it up. + */ +static +int i2400mu_probe(struct usb_interface *iface, + const struct usb_device_id *id) +{ + int result; + struct net_device *net_dev; + struct device *dev = &iface->dev; + struct i2400m *i2400m; + struct i2400mu *i2400mu; + struct usb_device *usb_dev = interface_to_usbdev(iface); + + if (iface->cur_altsetting->desc.bNumEndpoints < 4) + return -ENODEV; + + if (usb_dev->speed != USB_SPEED_HIGH) + dev_err(dev, "device not connected as high speed\n"); + + /* Allocate instance [calls i2400m_netdev_setup() on it]. */ + result = -ENOMEM; + net_dev = alloc_netdev(sizeof(*i2400mu), "wmx%d", NET_NAME_UNKNOWN, + i2400mu_netdev_setup); + if (net_dev == NULL) { + dev_err(dev, "no memory for network device instance\n"); + goto error_alloc_netdev; + } + SET_NETDEV_DEV(net_dev, dev); + SET_NETDEV_DEVTYPE(net_dev, &i2400mu_type); + i2400m = net_dev_to_i2400m(net_dev); + i2400mu = container_of(i2400m, struct i2400mu, i2400m); + i2400m->wimax_dev.net_dev = net_dev; + i2400mu->usb_dev = usb_get_dev(usb_dev); + i2400mu->usb_iface = iface; + usb_set_intfdata(iface, i2400mu); + + i2400m->bus_tx_block_size = I2400MU_BLK_SIZE; + /* + * Room required in the Tx queue for USB message to accommodate + * a smallest payload while allocating header space is 16 bytes. + * Adding this room for the new tx message increases the + * possibilities of including any payload with size <= 16 bytes. 
+	 */
+	i2400m->bus_tx_room_min = I2400MU_BLK_SIZE;
+	i2400m->bus_pl_size_max = I2400MU_PL_SIZE_MAX;
+	i2400m->bus_setup = NULL;
+	i2400m->bus_dev_start = i2400mu_bus_dev_start;
+	i2400m->bus_dev_stop = i2400mu_bus_dev_stop;
+	i2400m->bus_release = NULL;
+	i2400m->bus_tx_kick = i2400mu_bus_tx_kick;
+	i2400m->bus_reset = i2400mu_bus_reset;
+	i2400m->bus_bm_retries = I2400M_USB_BOOT_RETRIES;
+	i2400m->bus_bm_cmd_send = i2400mu_bus_bm_cmd_send;
+	i2400m->bus_bm_wait_for_ack = i2400mu_bus_bm_wait_for_ack;
+	i2400m->bus_bm_mac_addr_impaired = 0;
+
+	switch (id->idProduct) {
+	case USB_DEVICE_ID_I6050:
+	case USB_DEVICE_ID_I6050_2:
+	case USB_DEVICE_ID_I6150:
+	case USB_DEVICE_ID_I6150_2:
+	case USB_DEVICE_ID_I6150_3:
+	case USB_DEVICE_ID_I6250:
+		i2400mu->i6050 = 1;
+		break;
+	default:
+		break;
+	}
+
+	if (i2400mu->i6050) {
+		i2400m->bus_fw_names = i2400mu_bus_fw_names_6050;
+		i2400mu->endpoint_cfg.bulk_out = 0;
+		i2400mu->endpoint_cfg.notification = 3;
+		i2400mu->endpoint_cfg.reset_cold = 2;
+		i2400mu->endpoint_cfg.bulk_in = 1;
+	} else {
+		i2400m->bus_fw_names = i2400mu_bus_fw_names_5x50;
+		i2400mu->endpoint_cfg.bulk_out = 0;
+		i2400mu->endpoint_cfg.notification = 1;
+		i2400mu->endpoint_cfg.reset_cold = 2;
+		i2400mu->endpoint_cfg.bulk_in = 3;
+	}
+#ifdef CONFIG_PM
+	iface->needs_remote_wakeup = 1;		/* autosuspend (15s delay) */
+	device_init_wakeup(dev, 1);
+	pm_runtime_set_autosuspend_delay(&usb_dev->dev, 15000);
+	usb_enable_autosuspend(usb_dev);
+#endif
+
+	result = i2400m_setup(i2400m, I2400M_BRI_MAC_REINIT);
+	if (result < 0) {
+		dev_err(dev, "cannot setup device: %d\n", result);
+		goto error_setup;
+	}
+	i2400mu_debugfs_add(i2400mu);
+	return 0;
+
+error_setup:
+	usb_set_intfdata(iface, NULL);
+	usb_put_dev(i2400mu->usb_dev);
+	free_netdev(net_dev);
+error_alloc_netdev:
+	return result;
+}
+
+
+/*
+ * Disconnect an i2400m from the system.
+ *
+ * i2400m_stop() has been called before, so all the rx and tx contexts
+ * have been taken down already. Make sure the queue is stopped,
+ * unregister netdev and i2400m, free and kill.
+ */
+static
+void i2400mu_disconnect(struct usb_interface *iface)
+{
+	struct i2400mu *i2400mu = usb_get_intfdata(iface);
+	struct i2400m *i2400m = &i2400mu->i2400m;
+	struct net_device *net_dev = i2400m->wimax_dev.net_dev;
+	struct device *dev = &iface->dev;
+
+	d_fnstart(3, dev, "(iface %p i2400m %p)\n", iface, i2400m);
+
+	debugfs_remove_recursive(i2400mu->debugfs_dentry);
+	i2400m_release(i2400m);
+	usb_set_intfdata(iface, NULL);
+	usb_put_dev(i2400mu->usb_dev);
+	free_netdev(net_dev);
+	d_fnend(3, dev, "(iface %p i2400m %p) = void\n", iface, i2400m);
+}
+
+
+/*
+ * Get the device ready for USB port or system standby and hibernation
+ *
+ * USB port and system standby are handled the same.
+ *
+ * When the system hibernates, the USB device is powered down and then
+ * up, so we don't really have to do much here, as it will be seen as
+ * a reconnect. Still for simplicity we consider this case the same as
+ * suspend, so that the device has a chance to notify the base
+ * station (if connected).
+ *
+ * So at the end, the three cases require common handling.
+ *
+ * If at the time of this call the device's firmware is not loaded,
+ * nothing has to be done. Note we can be "loose" about not reading
+ * i2400m->updown under i2400m->init_mutex. If it happens to change
+ * immediately, other parts of the call flow will fail and effectively
+ * catch it.
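+ *
+ * (That lockless read is why the code below issues a rmb() before
+ * testing i2400m->updown; it pairs with the write side described in
+ * that field's documentation in the generic driver.)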
+ * + * If the firmware is loaded, we need to: + * + * - tell the device to go into host interface power save mode, wait + * for it to ack + * + * This is quite more interesting than it is; we need to execute a + * command, but this time, we don't want the code in usb-{tx,rx}.c + * to call the usb_autopm_get/put_interface() barriers as it'd + * deadlock, so we need to decrement i2400mu->do_autopm, that acts + * as a poor man's semaphore. Ugly, but it works. + * + * As well, the device might refuse going to sleep for whichever + * reason. In this case we just fail. For system suspend/hibernate, + * we *can't* fail. We check PMSG_IS_AUTO to see if the + * suspend call comes from the USB stack or from the system and act + * in consequence. + * + * - stop the notification endpoint polling + */ +static +int i2400mu_suspend(struct usb_interface *iface, pm_message_t pm_msg) +{ + int result = 0; + struct device *dev = &iface->dev; + struct i2400mu *i2400mu = usb_get_intfdata(iface); + unsigned is_autosuspend = 0; + struct i2400m *i2400m = &i2400mu->i2400m; + +#ifdef CONFIG_PM + if (PMSG_IS_AUTO(pm_msg)) + is_autosuspend = 1; +#endif + + d_fnstart(3, dev, "(iface %p pm_msg %u)\n", iface, pm_msg.event); + rmb(); /* see i2400m->updown's documentation */ + if (i2400m->updown == 0) + goto no_firmware; + if (i2400m->state == I2400M_SS_DATA_PATH_CONNECTED && is_autosuspend) { + /* ugh -- the device is connected and this suspend + * request is an autosuspend one (not a system standby + * / hibernate). + * + * The only way the device can go to standby is if the + * link with the base station is in IDLE mode; that + * were the case, we'd be in status + * I2400M_SS_CONNECTED_IDLE. But we are not. + * + * If we *tell* him to go power save now, it'll reset + * as a precautionary measure, so if this is an + * autosuspend thing, say no and it'll come back + * later, when the link is IDLE + */ + result = -EBADF; + d_printf(1, dev, "fw up, link up, not-idle, autosuspend: " + "not entering powersave\n"); + goto error_not_now; + } + d_printf(1, dev, "fw up: entering powersave\n"); + atomic_dec(&i2400mu->do_autopm); + result = i2400m_cmd_enter_powersave(i2400m); + atomic_inc(&i2400mu->do_autopm); + if (result < 0 && !is_autosuspend) { + /* System suspend, can't fail */ + dev_err(dev, "failed to suspend, will reset on resume\n"); + result = 0; + } + if (result < 0) + goto error_enter_powersave; + i2400mu_notification_release(i2400mu); + d_printf(1, dev, "powersave requested\n"); +error_enter_powersave: +error_not_now: +no_firmware: + d_fnend(3, dev, "(iface %p pm_msg %u) = %d\n", + iface, pm_msg.event, result); + return result; +} + + +static +int i2400mu_resume(struct usb_interface *iface) +{ + int ret = 0; + struct device *dev = &iface->dev; + struct i2400mu *i2400mu = usb_get_intfdata(iface); + struct i2400m *i2400m = &i2400mu->i2400m; + + d_fnstart(3, dev, "(iface %p)\n", iface); + rmb(); /* see i2400m->updown's documentation */ + if (i2400m->updown == 0) { + d_printf(1, dev, "fw was down, no resume needed\n"); + goto out; + } + d_printf(1, dev, "fw was up, resuming\n"); + i2400mu_notification_setup(i2400mu); + /* USB has flow control, so we don't need to give it time to + * come back; otherwise, we'd use something like a get-state + * command... 
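+	 *
+	 * (A device that came back reset -- e.g. because it lost
+	 * power while suspended -- is handled by i2400mu_reset_resume()
+	 * below instead, which funnels into i2400m_dev_reset_handle()
+	 * to re-bootstrap it.)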
+	 */
+out:
+	d_fnend(3, dev, "(iface %p) = %d\n", iface, ret);
+	return ret;
+}
+
+
+static
+int i2400mu_reset_resume(struct usb_interface *iface)
+{
+	int result;
+	struct device *dev = &iface->dev;
+	struct i2400mu *i2400mu = usb_get_intfdata(iface);
+	struct i2400m *i2400m = &i2400mu->i2400m;
+
+	d_fnstart(3, dev, "(iface %p)\n", iface);
+	result = i2400m_dev_reset_handle(i2400m, "device reset on resume");
+	d_fnend(3, dev, "(iface %p) = %d\n", iface, result);
+	return result < 0 ? result : 0;
+}
+
+
+/*
+ * Another driver or user space is triggering a reset on the device
+ * which contains the interface passed as an argument. Cease IO and
+ * save any device state you need to restore.
+ *
+ * If you need to allocate memory here, use GFP_NOIO, or GFP_ATOMIC
+ * if you are in atomic context.
+ */
+static
+int i2400mu_pre_reset(struct usb_interface *iface)
+{
+	struct i2400mu *i2400mu = usb_get_intfdata(iface);
+	return i2400m_pre_reset(&i2400mu->i2400m);
+}
+
+
+/*
+ * The reset has completed. Restore any saved device state and begin
+ * using the device again.
+ *
+ * If you need to allocate memory here, use GFP_NOIO, or GFP_ATOMIC
+ * if you are in atomic context.
+ */
+static
+int i2400mu_post_reset(struct usb_interface *iface)
+{
+	struct i2400mu *i2400mu = usb_get_intfdata(iface);
+	return i2400m_post_reset(&i2400mu->i2400m);
+}
+
+
+static
+struct usb_device_id i2400mu_id_table[] = {
+	{ USB_DEVICE(0x8086, USB_DEVICE_ID_I6050) },
+	{ USB_DEVICE(0x8086, USB_DEVICE_ID_I6050_2) },
+	{ USB_DEVICE(0x8087, USB_DEVICE_ID_I6150) },
+	{ USB_DEVICE(0x8087, USB_DEVICE_ID_I6150_2) },
+	{ USB_DEVICE(0x8087, USB_DEVICE_ID_I6150_3) },
+	{ USB_DEVICE(0x8086, USB_DEVICE_ID_I6250) },
+	{ USB_DEVICE(0x8086, 0x0181) },
+	{ USB_DEVICE(0x8086, 0x1403) },
+	{ USB_DEVICE(0x8086, 0x1405) },
+	{ USB_DEVICE(0x8086, 0x0180) },
+	{ USB_DEVICE(0x8086, 0x0182) },
+	{ USB_DEVICE(0x8086, 0x1406) },
+	{ },
+};
+MODULE_DEVICE_TABLE(usb, i2400mu_id_table);
+
+
+static
+struct usb_driver i2400mu_driver = {
+	.name = KBUILD_MODNAME,
+	.suspend = i2400mu_suspend,
+	.resume = i2400mu_resume,
+	.reset_resume = i2400mu_reset_resume,
+	.probe = i2400mu_probe,
+	.disconnect = i2400mu_disconnect,
+	.pre_reset = i2400mu_pre_reset,
+	.post_reset = i2400mu_post_reset,
+	.id_table = i2400mu_id_table,
+	.supports_autosuspend = 1,
+};
+
+static
+int __init i2400mu_driver_init(void)
+{
+	d_parse_params(D_LEVEL, D_LEVEL_SIZE, i2400mu_debug_params,
+		       "i2400m_usb.debug");
+	return usb_register(&i2400mu_driver);
+}
+module_init(i2400mu_driver_init);
+
+
+static
+void __exit i2400mu_driver_exit(void)
+{
+	usb_deregister(&i2400mu_driver);
+}
+module_exit(i2400mu_driver_exit);
+
+MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>");
+MODULE_DESCRIPTION("Driver for USB based Intel Wireless WiMAX Connection 2400M "
+		   "(5x50 & 6050)");
+MODULE_LICENSE("GPL");
+MODULE_FIRMWARE(I2400MU_FW_FILE_NAME_v1_5);
+MODULE_FIRMWARE(I6050U_FW_FILE_NAME_v1_5);
diff --git a/drivers/staging/wimax/id-table.c b/drivers/staging/wimax/id-table.c
new file mode 100644
index 000000000000..0e6f4aa87bc9
--- /dev/null
+++ b/drivers/staging/wimax/id-table.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Linux WiMAX
+ * Mapping of generic netlink family IDs to net devices
+ *
+ * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * We assign a single generic netlink family ID to each device (to
+ * simplify lookup).
+ *
+ * We need a way to map family ID to a wimax_dev pointer.
+ *
+ * The idea is to use a very simple lookup. Using a netlink attribute
+ * with (for example) the interface name implies a heavier search over
+ * all the network devices; seemed kind of a waste given that we know
+ * we are looking for a WiMAX device and that most systems will have
+ * just a single WiMAX adapter.
+ *
+ * We put all the WiMAX devices in the system in a linked list and
+ * match the generic netlink family ID against the list.
+ *
+ * By using a linked list, the case of a single adapter in the system
+ * becomes (almost) no overhead, while still working for many more. If
+ * it ever goes beyond two, I'll be surprised.
+ */
+#include <linux/device.h>
+#include <net/genetlink.h>
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include "linux-wimax.h"
+#include "wimax-internal.h"
+
+
+#define D_SUBMODULE id_table
+#include "debug-levels.h"
+
+
+static DEFINE_SPINLOCK(wimax_id_table_lock);
+static struct list_head wimax_id_table = LIST_HEAD_INIT(wimax_id_table);
+
+
+/*
+ * wimax_id_table_add - add a generic netlink family ID / wimax_dev mapping
+ *
+ * @wimax_dev: WiMAX device descriptor to associate to the Generic
+ *     Netlink family ID.
+ *
+ * Add the device to the list of known WiMAX devices; lookups search
+ * this list by interface index.
+ */
+void wimax_id_table_add(struct wimax_dev *wimax_dev)
+{
+	d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev);
+	spin_lock(&wimax_id_table_lock);
+	list_add(&wimax_dev->id_table_node, &wimax_id_table);
+	spin_unlock(&wimax_id_table_lock);
+	d_fnend(3, NULL, "(wimax_dev %p)\n", wimax_dev);
+}
+
+
+/*
+ * wimax_dev_get_by_genl_info - lookup a wimax_dev from the genl info
+ *
+ * The interface index has been passed as a generic netlink attribute,
+ * so we look it up in the device list and reference the matching
+ * wimax_dev.
+ *
+ * When done, the reference should be dropped with
+ * 'dev_put(wimax_dev->net_dev)'.
+ */
+struct wimax_dev *wimax_dev_get_by_genl_info(
+	struct genl_info *info, int ifindex)
+{
+	struct wimax_dev *wimax_dev = NULL;
+
+	d_fnstart(3, NULL, "(info %p ifindex %d)\n", info, ifindex);
+	spin_lock(&wimax_id_table_lock);
+	list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) {
+		if (wimax_dev->net_dev->ifindex == ifindex) {
+			dev_hold(wimax_dev->net_dev);
+			goto found;
+		}
+	}
+	wimax_dev = NULL;
+	d_printf(1, NULL, "wimax: no devices found with ifindex %d\n",
+		 ifindex);
+found:
+	spin_unlock(&wimax_id_table_lock);
+	d_fnend(3, NULL, "(info %p ifindex %d) = %p\n",
+		info, ifindex, wimax_dev);
+	return wimax_dev;
+}
+
+
+/*
+ * wimax_id_table_rm - Remove a generic netlink family ID / wimax_dev mapping
+ *
+ * @wimax_dev: WiMAX device descriptor whose mapping is to be removed
+ */
+void wimax_id_table_rm(struct wimax_dev *wimax_dev)
+{
+	spin_lock(&wimax_id_table_lock);
+	list_del_init(&wimax_dev->id_table_node);
+	spin_unlock(&wimax_id_table_lock);
+}
+
+
+/*
+ * Release the generic netlink family ID / mapping table
+ *
+ * On debug, verify that the table is empty upon removal. We want the
+ * code always compiled, to ensure it doesn't bit rot. It will be
+ * compiled out if CONFIG_BUG is disabled.
+ */ +void wimax_id_table_release(void) +{ + struct wimax_dev *wimax_dev; + +#ifndef CONFIG_BUG + return; +#endif + spin_lock(&wimax_id_table_lock); + list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) { + pr_err("BUG: %s wimax_dev %p ifindex %d not cleared\n", + __func__, wimax_dev, wimax_dev->net_dev->ifindex); + WARN_ON(1); + } + spin_unlock(&wimax_id_table_lock); +} diff --git a/drivers/staging/wimax/linux-wimax-debug.h b/drivers/staging/wimax/linux-wimax-debug.h new file mode 100644 index 000000000000..5b5ec405143b --- /dev/null +++ b/drivers/staging/wimax/linux-wimax-debug.h @@ -0,0 +1,491 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Linux WiMAX + * Collection of tools to manage debug operations. + * + * Copyright (C) 2005-2007 Intel Corporation + * Inaky Perez-Gonzalez + * + * Don't #include this file directly, read on! + * + * EXECUTING DEBUGGING ACTIONS OR NOT + * + * The main thing this framework provides is decission power to take a + * debug action (like printing a message) if the current debug level + * allows it. + * + * The decission power is at two levels: at compile-time (what does + * not make it is compiled out) and at run-time. The run-time + * selection is done per-submodule (as they are declared by the user + * of the framework). + * + * A call to d_test(L) (L being the target debug level) returns true + * if the action should be taken because the current debug levels + * allow it (both compile and run time). + * + * It follows that a call to d_test() that can be determined to be + * always false at compile time will get the code depending on it + * compiled out by optimization. + * + * DEBUG LEVELS + * + * It is up to the caller to define how much a debugging level is. + * + * Convention sets 0 as "no debug" (so an action marked as debug level 0 + * will always be taken). The increasing debug levels are used for + * increased verbosity. + * + * USAGE + * + * Group the code in modules and submodules inside each module [which + * in most cases maps to Linux modules and .c files that compose + * those]. + * + * For each module, there is: + * + * - a MODULENAME (single word, legal C identifier) + * + * - a debug-levels.h header file that declares the list of + * submodules and that is included by all .c files that use + * the debugging tools. The file name can be anything. + * + * - some (optional) .c code to manipulate the runtime debug levels + * through debugfs. + * + * The debug-levels.h file would look like: + * + * #ifndef __debug_levels__h__ + * #define __debug_levels__h__ + * + * #define D_MODULENAME modulename + * #define D_MASTER 10 + * + * #include "linux-wimax-debug.h" + * + * enum d_module { + * D_SUBMODULE_DECLARE(submodule_1), + * D_SUBMODULE_DECLARE(submodule_2), + * ... + * D_SUBMODULE_DECLARE(submodule_N) + * }; + * + * #endif + * + * D_MASTER is the maximum compile-time debug level; any debug actions + * above this will be out. D_MODULENAME is the module name (legal C + * identifier), which has to be unique for each module (to avoid + * namespace collisions during linkage). Note those #defines need to + * be done before #including debug.h + * + * We declare N different submodules whose debug level can be + * independently controlled during runtime. + * + * In a .c file of the module (and only in one of them), define the + * following code: + * + * struct d_level D_LEVEL[] = { + * D_SUBMODULE_DEFINE(submodule_1), + * D_SUBMODULE_DEFINE(submodule_2), + * ... 
+ * D_SUBMODULE_DEFINE(submodule_N), + * }; + * size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); + * + * Externs for d_level_MODULENAME and d_level_size_MODULENAME are used + * and declared in this file using the D_LEVEL and D_LEVEL_SIZE macros + * #defined also in this file. + * + * To manipulate from user space the levels, create a debugfs dentry + * and then register each submodule with: + * + * d_level_register_debugfs("PREFIX_", submodule_X, parent); + * + * Where PREFIX_ is a name of your chosing. This will create debugfs + * file with a single numeric value that can be use to tweak it. To + * remove the entires, just use debugfs_remove_recursive() on 'parent'. + * + * NOTE: remember that even if this will show attached to some + * particular instance of a device, the settings are *global*. + * + * On each submodule (for example, .c files), the debug infrastructure + * should be included like this: + * + * #define D_SUBMODULE submodule_x // matches one in debug-levels.h + * #include "debug-levels.h" + * + * after #including all your include files. + * + * Now you can use the d_*() macros below [d_test(), d_fnstart(), + * d_fnend(), d_printf(), d_dump()]. + * + * If their debug level is greater than D_MASTER, they will be + * compiled out. + * + * If their debug level is lower or equal than D_MASTER but greater + * than the current debug level of their submodule, they'll be + * ignored. + * + * Otherwise, the action will be performed. + */ +#ifndef __debug__h__ +#define __debug__h__ + +#include +#include + +struct device; + +/* Backend stuff */ + +/* + * Debug backend: generate a message header from a 'struct device' + * + * @head: buffer where to place the header + * @head_size: length of @head + * @dev: pointer to device used to generate a header from. If NULL, + * an empty ("") header is generated. + */ +static inline +void __d_head(char *head, size_t head_size, + struct device *dev) +{ + if (dev == NULL) + head[0] = 0; + else if ((unsigned long)dev < 4096) { + printk(KERN_ERR "E: Corrupt dev %p\n", dev); + WARN_ON(1); + } else + snprintf(head, head_size, "%s %s: ", + dev_driver_string(dev), dev_name(dev)); +} + + +/* + * Debug backend: log some message if debugging is enabled + * + * @l: intended debug level + * @tag: tag to prefix the message with + * @dev: 'struct device' associated to this message + * @f: printf-like format and arguments + * + * Note this is optimized out if it doesn't pass the compile-time + * check; however, it is *always* compiled. This is useful to make + * sure the printf-like formats and variables are always checked and + * they don't get bit rot if you have all the debugging disabled. + */ +#define _d_printf(l, tag, dev, f, a...) \ +do { \ + char head[64]; \ + if (!d_test(l)) \ + break; \ + __d_head(head, sizeof(head), dev); \ + printk(KERN_ERR "%s%s%s: " f, head, __func__, tag, ##a); \ +} while (0) + + +/* + * CPP syntactic sugar to generate A_B like symbol names when one of + * the arguments is a preprocessor #define. + */ +#define __D_PASTE__(varname, modulename) varname##_##modulename +#define __D_PASTE(varname, modulename) (__D_PASTE__(varname, modulename)) +#define _D_SUBMODULE_INDEX(_name) (D_SUBMODULE_DECLARE(_name)) + + +/* + * Store a submodule's runtime debug level and name + */ +struct d_level { + u8 level; + const char *name; +}; + + +/* + * List of available submodules and their debug levels + * + * We call them d_level_MODULENAME and d_level_size_MODULENAME; the + * macros D_LEVEL and D_LEVEL_SIZE contain the name already for + * convenience. 
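+ *
+ * For instance (an illustrative name, not from this file): with
+ * "#define D_MODULENAME foo", D_LEVEL expands to d_level_foo and
+ * D_LEVEL_SIZE to d_level_size_foo, courtesy of the __D_PASTE()
+ * helpers above.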
+ * + * This array and the size are defined on some .c file that is part of + * the current module. + */ +#define D_LEVEL __D_PASTE(d_level, D_MODULENAME) +#define D_LEVEL_SIZE __D_PASTE(d_level_size, D_MODULENAME) + +extern struct d_level D_LEVEL[]; +extern size_t D_LEVEL_SIZE; + + +/* + * Frontend stuff + * + * + * Stuff you need to declare prior to using the actual "debug" actions + * (defined below). + */ + +#ifndef D_MODULENAME +#error D_MODULENAME is not defined in your debug-levels.h file +/** + * D_MODULE - Name of the current module + * + * #define in your module's debug-levels.h, making sure it is + * unique. This has to be a legal C identifier. + */ +#define D_MODULENAME undefined_modulename +#endif + + +#ifndef D_MASTER +#warning D_MASTER not defined, but debug.h included! [see docs] +/** + * D_MASTER - Compile time maximum debug level + * + * #define in your debug-levels.h file to the maximum debug level the + * runtime code will be allowed to have. This allows you to provide a + * main knob. + * + * Anything above that level will be optimized out of the compile. + * + * Defaults to zero (no debug code compiled in). + * + * Maximum one definition per module (at the debug-levels.h file). + */ +#define D_MASTER 0 +#endif + +#ifndef D_SUBMODULE +#error D_SUBMODULE not defined, but debug.h included! [see docs] +/** + * D_SUBMODULE - Name of the current submodule + * + * #define in your submodule .c file before #including debug-levels.h + * to the name of the current submodule as previously declared and + * defined with D_SUBMODULE_DECLARE() (in your module's + * debug-levels.h) and D_SUBMODULE_DEFINE(). + * + * This is used to provide runtime-control over the debug levels. + * + * Maximum one per .c file! Can be shared among different .c files + * (meaning they belong to the same submodule categorization). + */ +#define D_SUBMODULE undefined_module +#endif + + +/** + * D_SUBMODULE_DECLARE - Declare a submodule for runtime debug level control + * + * @_name: name of the submodule, restricted to the chars that make up a + * valid C identifier ([a-zA-Z0-9_]). + * + * Declare in the module's debug-levels.h header file as: + * + * enum d_module { + * D_SUBMODULE_DECLARE(submodule_1), + * D_SUBMODULE_DECLARE(submodule_2), + * D_SUBMODULE_DECLARE(submodule_3), + * }; + * + * Some corresponding .c file needs to have a matching + * D_SUBMODULE_DEFINE(). + */ +#define D_SUBMODULE_DECLARE(_name) __D_SUBMODULE_##_name + + +/** + * D_SUBMODULE_DEFINE - Define a submodule for runtime debug level control + * + * @_name: name of the submodule, restricted to the chars that make up a + * valid C identifier ([a-zA-Z0-9_]). + * + * Use once per module (in some .c file) as: + * + * static + * struct d_level d_level_SUBMODULENAME[] = { + * D_SUBMODULE_DEFINE(submodule_1), + * D_SUBMODULE_DEFINE(submodule_2), + * D_SUBMODULE_DEFINE(submodule_3), + * }; + * size_t d_level_size_SUBDMODULENAME = ARRAY_SIZE(d_level_SUBDMODULENAME); + * + * Matching D_SUBMODULE_DECLARE()s have to be present in a + * debug-levels.h header file. + */ +#define D_SUBMODULE_DEFINE(_name) \ +[__D_SUBMODULE_##_name] = { \ + .level = 0, \ + .name = #_name \ +} + + + +/* The actual "debug" operations */ + + +/** + * d_test - Returns true if debugging should be enabled + * + * @l: intended debug level (unsigned) + * + * If the master debug switch is enabled and the current settings are + * higher or equal to the requested level, then debugging + * output/actions should be enabled. 
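+ *
+ * A worked example (with made-up numbers): if D_MASTER is 10 and the
+ * submodule's runtime level is 2, then d_test(1) and d_test(2) are
+ * true while d_test(3) is false; if D_MASTER is 0, every d_test()
+ * call above level 0 is compile-time false and the code depending on
+ * it is optimized out.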
+ * + * NOTE: + * + * This needs to be coded so that it can be evaluated in compile + * time; this is why the ugly BUG_ON() is placed in there, so the + * D_MASTER evaluation compiles all out if it is compile-time false. + */ +#define d_test(l) \ +({ \ + unsigned __l = l; /* type enforcer */ \ + (D_MASTER) >= __l \ + && ({ \ + BUG_ON(_D_SUBMODULE_INDEX(D_SUBMODULE) >= D_LEVEL_SIZE);\ + D_LEVEL[_D_SUBMODULE_INDEX(D_SUBMODULE)].level >= __l; \ + }); \ +}) + + +/** + * d_fnstart - log message at function start if debugging enabled + * + * @l: intended debug level + * @_dev: 'struct device' pointer, NULL if none (for context) + * @f: printf-like format and arguments + */ +#define d_fnstart(l, _dev, f, a...) _d_printf(l, " FNSTART", _dev, f, ## a) + + +/** + * d_fnend - log message at function end if debugging enabled + * + * @l: intended debug level + * @_dev: 'struct device' pointer, NULL if none (for context) + * @f: printf-like format and arguments + */ +#define d_fnend(l, _dev, f, a...) _d_printf(l, " FNEND", _dev, f, ## a) + + +/** + * d_printf - log message if debugging enabled + * + * @l: intended debug level + * @_dev: 'struct device' pointer, NULL if none (for context) + * @f: printf-like format and arguments + */ +#define d_printf(l, _dev, f, a...) _d_printf(l, "", _dev, f, ## a) + + +/** + * d_dump - log buffer hex dump if debugging enabled + * + * @l: intended debug level + * @_dev: 'struct device' pointer, NULL if none (for context) + * @f: printf-like format and arguments + */ +#define d_dump(l, dev, ptr, size) \ +do { \ + char head[64]; \ + if (!d_test(l)) \ + break; \ + __d_head(head, sizeof(head), dev); \ + print_hex_dump(KERN_ERR, head, 0, 16, 1, \ + ((void *) ptr), (size), 0); \ +} while (0) + + +/** + * Export a submodule's debug level over debugfs as PREFIXSUBMODULE + * + * @prefix: string to prefix the name with + * @submodule: name of submodule (not a string, just the name) + * @dentry: debugfs parent dentry + * + * For removing, just use debugfs_remove_recursive() on the parent. + */ +#define d_level_register_debugfs(prefix, name, parent) \ +({ \ + debugfs_create_u8( \ + prefix #name, 0600, parent, \ + &(D_LEVEL[__D_SUBMODULE_ ## name].level)); \ +}) + + +static inline +void d_submodule_set(struct d_level *d_level, size_t d_level_size, + const char *submodule, u8 level, const char *tag) +{ + struct d_level *itr, *top; + int index = -1; + + for (itr = d_level, top = itr + d_level_size; itr < top; itr++) { + index++; + if (itr->name == NULL) { + printk(KERN_ERR "%s: itr->name NULL?? (%p, #%d)\n", + tag, itr, index); + continue; + } + if (!strcmp(itr->name, submodule)) { + itr->level = level; + return; + } + } + printk(KERN_ERR "%s: unknown submodule %s\n", tag, submodule); +} + + +/** + * d_parse_params - Parse a string with debug parameters from the + * command line + * + * @d_level: level structure (D_LEVEL) + * @d_level_size: number of items in the level structure + * (D_LEVEL_SIZE). + * @_params: string with the parameters; this is a space (not tab!) + * separated list of NAME:VALUE, where value is the debug level + * and NAME is the name of the submodule. + * @tag: string for error messages (example: MODULE.ARGNAME). 
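+ *
+ * For example (hypothetical submodule names): assuming submodules
+ * "tx" and "rx" were declared, passing the string "tx:2 rx:0" sets
+ * submodule tx to debug level 2 and silences rx.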
+ */
+static inline
+void d_parse_params(struct d_level *d_level, size_t d_level_size,
+		    const char *_params, const char *tag)
+{
+	char submodule[130], *params, *params_orig, *token, *colon;
+	unsigned level, tokens;
+
+	if (_params == NULL)
+		return;
+	params_orig = kstrdup(_params, GFP_KERNEL);
+	params = params_orig;
+	while (1) {
+		token = strsep(&params, " ");
+		if (token == NULL)
+			break;
+		if (*token == '\0')	/* eat joint spaces */
+			continue;
+		/* kernel's sscanf %s eats until whitespace, so we
+		 * replace : by \n so it doesn't get eaten later by
+		 * strsep */
+		colon = strchr(token, ':');
+		if (colon != NULL)
+			*colon = '\n';
+		tokens = sscanf(token, "%s\n%u", submodule, &level);
+		if (colon != NULL)
+			*colon = ':';	/* set back, for error messages */
+		if (tokens == 2)
+			d_submodule_set(d_level, d_level_size,
+					submodule, level, tag);
+		else
+			printk(KERN_ERR "%s: can't parse '%s' as a "
+			       "SUBMODULE:LEVEL (%d tokens)\n",
+			       tag, token, tokens);
+	}
+	kfree(params_orig);
+}
+
+#endif /* #ifndef __debug__h__ */
diff --git a/drivers/staging/wimax/linux-wimax.h b/drivers/staging/wimax/linux-wimax.h
new file mode 100644
index 000000000000..9f6b77af2f6d
--- /dev/null
+++ b/drivers/staging/wimax/linux-wimax.h
@@ -0,0 +1,239 @@
+/*
+ * Linux WiMax
+ * API for user space
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *  - Initial implementation
+ *
+ *
+ * This file declares the user/kernel protocol that is spoken over
+ * Generic Netlink, as well as any type declaration that is to be used
+ * by kernel and user space.
+ *
+ * It is intended for user space to clone it verbatim to use it as a
+ * primary reference for definitions.
+ *
+ * Stuff intended for kernel usage as well as full protocol and stack
+ * documentation is rooted in include/net/wimax.h.
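+ *
+ * As a worked example of the MMm version encoding used below (a
+ * made-up value): WIMAX_GNL_VERSION = 12 would mean major 1, minor 2,
+ * i.e. version / 10 and version % 10 respectively.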
+ */
+
+#ifndef __LINUX__WIMAX_H__
+#define __LINUX__WIMAX_H__
+
+#include <linux/types.h>
+
+enum {
+	/**
+	 * Version of the interface (unsigned decimal, MMm, max 25.5)
+	 * M - Major: change if removing or modifying an existing call.
+	 * m - minor: change when adding a new call
+	 */
+	WIMAX_GNL_VERSION = 01,
+	/* Generic NetLink attributes */
+	WIMAX_GNL_ATTR_INVALID = 0x00,
+	WIMAX_GNL_ATTR_MAX = 10,
+};
+
+
+/*
+ * Generic NetLink operations
+ *
+ * Most of these map to an API call; _OP_ stands for operation, _RP_
+ * for reply and _RE_ for report (aka: signal).
+ */
+enum {
+	WIMAX_GNL_OP_MSG_FROM_USER,	/* User to kernel message */
+	WIMAX_GNL_OP_MSG_TO_USER,	/* Kernel to user message */
+	WIMAX_GNL_OP_RFKILL,		/* Run wimax_rfkill() */
+	WIMAX_GNL_OP_RESET,		/* Run wimax_reset() */
+	WIMAX_GNL_RE_STATE_CHANGE,	/* Report: status change */
+	WIMAX_GNL_OP_STATE_GET,		/* Request for current state */
+};
+
+
+/* Message from user / to user */
+enum {
+	WIMAX_GNL_MSG_IFIDX = 1,
+	WIMAX_GNL_MSG_PIPE_NAME,
+	WIMAX_GNL_MSG_DATA,
+};
+
+
+/*
+ * wimax_rfkill()
+ *
+ * The state of the radio (ON/OFF) is mapped to the rfkill subsystem's
+ * switch state (DISABLED/ENABLED).
+ */
+enum wimax_rf_state {
+	WIMAX_RF_OFF = 0,	/* Radio is off, rfkill on/enabled */
+	WIMAX_RF_ON = 1,	/* Radio is on, rfkill off/disabled */
+	WIMAX_RF_QUERY = 2,
+};
+
+/* Attributes */
+enum {
+	WIMAX_GNL_RFKILL_IFIDX = 1,
+	WIMAX_GNL_RFKILL_STATE,
+};
+
+
+/* Attributes for wimax_reset() */
+enum {
+	WIMAX_GNL_RESET_IFIDX = 1,
+};
+
+/* Attributes for wimax_state_get() */
+enum {
+	WIMAX_GNL_STGET_IFIDX = 1,
+};
+
+/*
+ * Attributes for the Report State Change
+ *
+ * For now we just have the old and new states; new attributes might
+ * be added later on.
+ */
+enum {
+	WIMAX_GNL_STCH_IFIDX = 1,
+	WIMAX_GNL_STCH_STATE_OLD,
+	WIMAX_GNL_STCH_STATE_NEW,
+};
+
+
+/**
+ * enum wimax_st - The different states of a WiMAX device
+ * @__WIMAX_ST_NULL: The device structure has been allocated and zeroed,
+ *     but wimax_dev_add() hasn't been called yet. There is no state.
+ *
+ * @WIMAX_ST_DOWN: The device has been registered with the WiMAX and
+ *     networking stacks, but it is not initialized (normally that is
+ *     done with 'ifconfig DEV up' [or equivalent], which can upload
+ *     firmware and enable communications with the device).
+ *     In this state, the device is powered down and using as little
+ *     power as possible.
+ *     This state is the default after a call to wimax_dev_add(). It
+ *     is ok to have drivers move directly to %WIMAX_ST_UNINITIALIZED
+ *     or %WIMAX_ST_RADIO_OFF in _probe() after the call to
+ *     wimax_dev_add().
+ *     It is recommended that the driver leaves this state when
+ *     calling 'ifconfig DEV up' and enters it back on 'ifconfig DEV
+ *     down'.
+ *
+ * @__WIMAX_ST_QUIESCING: The device is being torn down, so no API
+ *     operations are allowed to proceed except the ones needed to
+ *     complete the device clean up process.
+ *
+ * @WIMAX_ST_UNINITIALIZED: [optional] Communication with the device
+ *     is setup, but the device still requires some configuration
+ *     before being operational.
+ *     Some WiMAX API calls might work.
+ *
+ * @WIMAX_ST_RADIO_OFF: The device is fully up; radio is off (whether
+ *     by hardware or software switches).
+ *     It is recommended to always leave the device in this state
+ *     after initialization.
+ *
+ * @WIMAX_ST_READY: The device is fully up and radio is on.
+ *
+ * @WIMAX_ST_SCANNING: [optional] The device has been instructed to
+ *     scan. In this state, the device cannot be actively connected to
+ *     a network.
+ * + * @WIMAX_ST_CONNECTING: The device is connecting to a network. This + * state exists because in some devices, the connect process can + * include a number of negotiations between user space, kernel + * space and the device. User space needs to know what the device + * is doing. If the connect sequence in a device is atomic and + * fast, the device can transition directly to CONNECTED + * + * @WIMAX_ST_CONNECTED: The device is connected to a network. + * + * @__WIMAX_ST_INVALID: This is an invalid state used to mark the + * maximum numeric value of states. + * + * Description: + * + * Transitions from one state to another one are atomic and can only + * be caused in kernel space with wimax_state_change(). To read the + * state, use wimax_state_get(). + * + * States starting with __ are internal and shall not be used or + * referred to by drivers or userspace. They look ugly, but that's the + * point -- if any use is made non-internal to the stack, it is easier + * to catch on review. + * + * All API operations [with well defined exceptions] will take the + * device mutex before starting and then check the state. If the state + * is %__WIMAX_ST_NULL, %WIMAX_ST_DOWN, %WIMAX_ST_UNINITIALIZED or + * %__WIMAX_ST_QUIESCING, it will drop the lock and quit with + * -%EINVAL, -%ENOMEDIUM, -%ENOTCONN or -%ESHUTDOWN. + * + * The order of the definitions is important, so we can do numerical + * comparisons (eg: < %WIMAX_ST_RADIO_OFF means the device is not ready + * to operate). + */ +/* + * The allowed state transitions are described in the table below + * (states in rows can go to states in columns where there is an X): + * + * UNINI RADIO READY SCAN CONNEC CONNEC + * NULL DOWN QUIESCING TIALIZED OFF NING TING TED + * NULL - x + * DOWN - x x x + * QUIESCING x - + * UNINITIALIZED x - x + * RADIO_OFF x - x + * READY x x - x x x + * SCANNING x x x - x x + * CONNECTING x x x x - x + * CONNECTED x x x - + * + * This table not available in kernel-doc because the formatting messes it up. + */ + enum wimax_st { + __WIMAX_ST_NULL = 0, + WIMAX_ST_DOWN, + __WIMAX_ST_QUIESCING, + WIMAX_ST_UNINITIALIZED, + WIMAX_ST_RADIO_OFF, + WIMAX_ST_READY, + WIMAX_ST_SCANNING, + WIMAX_ST_CONNECTING, + WIMAX_ST_CONNECTED, + __WIMAX_ST_INVALID /* Always keep last */ +}; + + +#endif /* #ifndef __LINUX__WIMAX_H__ */ diff --git a/drivers/staging/wimax/net-wimax.h b/drivers/staging/wimax/net-wimax.h new file mode 100644 index 000000000000..f578e345e2bd --- /dev/null +++ b/drivers/staging/wimax/net-wimax.h @@ -0,0 +1,503 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Linux WiMAX + * Kernel space API for accessing WiMAX devices + * + * Copyright (C) 2007-2008 Intel Corporation + * Inaky Perez-Gonzalez + * + * The WiMAX stack provides an API for controlling and managing the + * system's WiMAX devices. This API affects the control plane; the + * data plane is accessed via the network stack (netdev). + * + * Parts of the WiMAX stack API and notifications are exported to + * user space via Generic Netlink. In user space, libwimax (part of + * the wimax-tools package) provides a shim layer for accessing those + * calls. + * + * The API is standarized for all WiMAX devices and different drivers + * implement the backend support for it. However, device-specific + * messaging pipes are provided that can be used to issue commands and + * receive notifications in free form. 
+ *
+ * Currently the messaging pipes are the only means of control as it
+ * is not known (due to the lack of more devices in the market) what
+ * will be a good abstraction layer. Expect this to change as more
+ * devices show up in the market. This API is designed to be growable
+ * in order to address this problem.
+ *
+ * USAGE
+ *
+ * Embed a `struct wimax_dev` at the beginning of the device's
+ * private structure, initialize and register it. For details, see
+ * the documentation of `struct wimax_dev`.
+ *
+ * Once this is done, wimax-tools's libwimaxll can be used to
+ * communicate with the driver from user space. Your user space
+ * application does not have to use libwimaxll and can talk the
+ * generic netlink protocol directly if desired.
+ *
+ * Remember this is a very low level API that is meant to provide all
+ * WiMAX features. Other daemons and services running in user space
+ * are the expected clients of it. They offer a higher level API that
+ * applications should use (an example of this is Intel's WiMAX
+ * Network Service for the i2400m).
+ *
+ * DESIGN
+ *
+ * Although not yet set in stone, this very basic interface is mostly
+ * complete. Remember this is meant to grow as new common operations
+ * are decided upon. New operations will be added to the interface,
+ * the intent being to keep backwards compatibility as much as
+ * possible.
+ *
+ * This layer implements a set of calls to control a WiMAX device,
+ * exposing a frontend to the rest of the kernel and user space (via
+ * generic netlink) and a backend implementation in the driver through
+ * function pointers.
+ *
+ * WiMAX devices have a state, and a kernel-only API allows the
+ * drivers to manipulate that state. State transitions are atomic, and
+ * only some of them are allowed (see `enum wimax_st`).
+ *
+ * Most API calls will set the state automatically; in most cases
+ * drivers have to only report state changes due to external
+ * conditions.
+ *
+ * All API operations are 'atomic', serialized through a mutex in the
+ * `struct wimax_dev`.
+ *
+ * EXPORTING TO USER SPACE THROUGH GENERIC NETLINK
+ *
+ * The API is exported to user space using generic netlink (other
+ * methods can be added as needed).
+ *
+ * There is a Generic Netlink Family named "WiMAX", where interfaces
+ * supporting the WiMAX interface receive commands and broadcast their
+ * signals over a multicast group named "msg".
+ *
+ * Mapping to the source/destination interface is done by an interface
+ * index attribute.
+ *
+ * For user-to-kernel traffic (commands) we use a function call
+ * marshalling mechanism, where a message X with attributes A, B, C
+ * sent from user space to kernel space means executing the WiMAX API
+ * call wimax_X(A, B, C), sending the results back as a message.
+ *
+ * Kernel-to-user (notifications or signals) communication is sent
+ * over multicast groups. This allows multiple applications to
+ * monitor them.
+ *
+ * Each command/signal gets assigned its own attribute policy. This
+ * way the validator will verify that all the attributes in there are
+ * only the ones that should be for each command/signal. Think of an
+ * attribute as mapping to a type+argument name for each
+ * command/signal.
+ *
+ * If we had a single policy for *all* commands/signals, after running
+ * the validator we'd have to check "does this attribute belong
+ * here?" for each one. It can be done manually, but it's just easier
+ * to have the validator do that job with multiple policies.
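+ *
+ * For illustration, a sketch of such a per-command policy (this is
+ * not code from this stack; attribute names as declared in
+ * linux-wimax.h):
+ *
+ *   static const struct nla_policy
+ *   wimax_gnl_reset_policy[WIMAX_GNL_ATTR_MAX + 1] = {
+ *           [WIMAX_GNL_RESET_IFIDX] = { .type = NLA_U32 },
+ *   };
+ *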
As well, + * it makes it easier to later expand each command/signal signature + * without affecting others and keeping the namespace more or less + * sane. Not that it is too complicated, but it makes it even easier. + * + * No state information is maintained in the kernel for each user + * space connection (the connection is stateless). + * + * TESTING FOR THE INTERFACE AND VERSIONING + * + * If network interface X is a WiMAX device, there will be a Generic + * Netlink family named "WiMAX X" and the device will present a + * "wimax" directory in it's network sysfs directory + * (/sys/class/net/DEVICE/wimax) [used by HAL]. + * + * The inexistence of any of these means the device does not support + * this WiMAX API. + * + * By querying the generic netlink controller, versioning information + * and the multicast groups available can be found. Applications using + * the interface can either rely on that or use the generic netlink + * controller to figure out which generic netlink commands/signals are + * supported. + * + * NOTE: this versioning is a last resort to avoid hard + * incompatibilities. It is the intention of the design of this + * stack not to introduce backward incompatible changes. + * + * The version code has to fit in one byte (restrictions imposed by + * generic netlink); we use `version / 10` for the major version and + * `version % 10` for the minor. This gives 9 minors for each major + * and 25 majors. + * + * The version change protocol is as follow: + * + * - Major versions: needs to be increased if an existing message/API + * call is changed or removed. Doesn't need to be changed if a new + * message is added. + * + * - Minor version: needs to be increased if new messages/API calls are + * being added or some other consideration that doesn't impact the + * user-kernel interface too much (like some kind of bug fix) and + * that is kind of left up in the air to common sense. + * + * User space code should not try to work if the major version it was + * compiled for differs from what the kernel offers. As well, if the + * minor version of the kernel interface is lower than the one user + * space is expecting (the one it was compiled for), the kernel + * might be missing API calls; user space shall be ready to handle + * said condition. Use the generic netlink controller operations to + * find which ones are supported and which not. + * + * libwimaxll:wimaxll_open() takes care of checking versions. + * + * THE OPERATIONS: + * + * Each operation is defined in its on file (drivers/net/wimax/op-*.c) + * for clarity. The parts needed for an operation are: + * + * - a function pointer in `struct wimax_dev`: optional, as the + * operation might be implemented by the stack and not by the + * driver. + * + * All function pointers are named wimax_dev->op_*(), and drivers + * must implement them except where noted otherwise. + * + * - When exported to user space, a `struct nla_policy` to define the + * attributes of the generic netlink command and a `struct genl_ops` + * to define the operation. + * + * All the declarations for the operation codes (WIMAX_GNL_OP_) + * and generic netlink attributes (WIMAX_GNL__*) are declared in + * include/linux/wimax.h; this file is intended to be cloned by user + * space to gain access to those declarations. + * + * A few caveats to remember: + * + * - Need to define attribute numbers starting in 1; otherwise it + * fails. 
+ * + * - the `struct genl_family` requires a maximum attribute id; when + * defining the `struct nla_policy` for each message, it has to have + * an array size of WIMAX_GNL_ATTR_MAX+1. + * + * The op_*() function pointers will not be called if the wimax_dev is + * in a state <= %WIMAX_ST_UNINITIALIZED. The exception is: + * + * - op_reset: can be called at any time after wimax_dev_add() has + * been called. + * + * THE PIPE INTERFACE: + * + * This interface is kept intentionally simple. The driver can send + * and receive free-form messages to/from user space through a + * pipe. See drivers/net/wimax/op-msg.c for details. + * + * The kernel-to-user messages are sent with + * wimax_msg(). user-to-kernel messages are delivered via + * wimax_dev->op_msg_from_user(). + * + * RFKILL: + * + * RFKILL support is built into the wimax_dev layer; the driver just + * needs to call wimax_report_rfkill_{hw,sw}() to inform of changes in + * the hardware or software RF kill switches. When the stack wants to + * turn the radio off, it will call wimax_dev->op_rfkill_sw_toggle(), + * which the driver implements. + * + * User space can set the software RF Kill switch by calling + * wimax_rfkill(). + * + * The code for now only supports devices that don't require polling; + * If the device needs to be polled, create a self-rearming delayed + * work struct for polling or look into adding polled support to the + * WiMAX stack. + * + * When initializing the hardware (_probe), after calling + * wimax_dev_add(), query the device for it's RF Kill switches status + * and feed it back to the WiMAX stack using + * wimax_report_rfkill_{hw,sw}(). If any switch is missing, always + * report it as ON. + * + * NOTE: the wimax stack uses an inverted terminology to that of the + * RFKILL subsystem: + * + * - ON: radio is ON, RFKILL is DISABLED or OFF. + * - OFF: radio is OFF, RFKILL is ENABLED or ON. + * + * MISCELLANEOUS OPS: + * + * wimax_reset() can be used to reset the device to power on state; by + * default it issues a warm reset that maintains the same device + * node. If that is not possible, it falls back to a cold reset + * (device reconnect). The driver implements the backend to this + * through wimax_dev->op_reset(). + */ + +#ifndef __NET__WIMAX_H__ +#define __NET__WIMAX_H__ + +#include "linux-wimax.h" +#include +#include + +struct net_device; +struct genl_info; +struct wimax_dev; + +/** + * struct wimax_dev - Generic WiMAX device + * + * @net_dev: [fill] Pointer to the &struct net_device this WiMAX + * device implements. + * + * @op_msg_from_user: [fill] Driver-specific operation to + * handle a raw message from user space to the driver. The + * driver can send messages to user space using with + * wimax_msg_to_user(). + * + * @op_rfkill_sw_toggle: [fill] Driver-specific operation to act on + * userspace (or any other agent) requesting the WiMAX device to + * change the RF Kill software switch (WIMAX_RF_ON or + * WIMAX_RF_OFF). + * If such hardware support is not present, it is assumed the + * radio cannot be switched off and it is always on (and the stack + * will error out when trying to switch it off). In such case, + * this function pointer can be left as NULL. + * + * @op_reset: [fill] Driver specific operation to reset the + * device. + * This operation should always attempt first a warm reset that + * does not disconnect the device from the bus and return 0. + * If that fails, it should resort to some sort of cold or bus + * reset (even if it implies a bus disconnection and device + * disappearance). 
In that case, -ENODEV should be returned to + * indicate the device is gone. + * This operation has to be synchronous, and return only when the + * reset is complete. In case of having had to resort to bus/cold + * reset implying a device disconnection, the call is allowed to + * return immediately. + * NOTE: wimax_dev->mutex is NOT locked when this op is being + * called; however, wimax_dev->mutex_reset IS locked to ensure + * serialization of calls to wimax_reset(). + * See wimax_reset()'s documentation. + * + * @name: [fill] A way to identify this device. We need to register a + * name with many subsystems (rfkill, workqueue creation, etc). + * We can't use the network device name as that + * might change and in some instances we don't know it yet (until + * we don't call register_netdev()). So we generate an unique one + * using the driver name and device bus id, place it here and use + * it across the board. Recommended naming: + * DRIVERNAME-BUSNAME:BUSID (dev->bus->name, dev->bus_id). + * + * @id_table_node: [private] link to the list of wimax devices kept by + * id-table.c. Protected by it's own spinlock. + * + * @mutex: [private] Serializes all concurrent access and execution of + * operations. + * + * @mutex_reset: [private] Serializes reset operations. Needs to be a + * different mutex because as part of the reset operation, the + * driver has to call back into the stack to do things such as + * state change, that require wimax_dev->mutex. + * + * @state: [private] Current state of the WiMAX device. + * + * @rfkill: [private] integration into the RF-Kill infrastructure. + * + * @rf_sw: [private] State of the software radio switch (OFF/ON) + * + * @rf_hw: [private] State of the hardware radio switch (OFF/ON) + * + * @debugfs_dentry: [private] Used to hook up a debugfs entry. This + * shows up in the debugfs root as wimax\:DEVICENAME. + * + * Description: + * This structure defines a common interface to access all WiMAX + * devices from different vendors and provides a common API as well as + * a free-form device-specific messaging channel. + * + * Usage: + * 1. Embed a &struct wimax_dev at *the beginning* the network + * device structure so that netdev_priv() points to it. + * + * 2. memset() it to zero + * + * 3. Initialize with wimax_dev_init(). This will leave the WiMAX + * device in the %__WIMAX_ST_NULL state. + * + * 4. Fill all the fields marked with [fill]; once called + * wimax_dev_add(), those fields CANNOT be modified. + * + * 5. Call wimax_dev_add() *after* registering the network + * device. This will leave the WiMAX device in the %WIMAX_ST_DOWN + * state. + * Protect the driver's net_device->open() against succeeding if + * the wimax device state is lower than %WIMAX_ST_DOWN. + * + * 6. Select when the device is going to be turned on/initialized; + * for example, it could be initialized on 'ifconfig up' (when the + * netdev op 'open()' is called on the driver). + * + * When the device is initialized (at `ifconfig up` time, or right + * after calling wimax_dev_add() from _probe(), make sure the + * following steps are taken + * + * a. Move the device to %WIMAX_ST_UNINITIALIZED. This is needed so + * some API calls that shouldn't work until the device is ready + * can be blocked. + * + * b. Initialize the device. Make sure to turn the SW radio switch + * off and move the device to state %WIMAX_ST_RADIO_OFF when + * done. When just initialized, a device should be left in RADIO + * OFF state until user space devices to turn it on. + * + * c. 
Query the device for the state of the hardware rfkill switch + * and call wimax_rfkill_report_hw() and wimax_rfkill_report_sw() + * as needed. See below. + * + * wimax_dev_rm() undoes before unregistering the network device. Once + * wimax_dev_add() is called, the driver can get called on the + * wimax_dev->op_* function pointers + * + * CONCURRENCY: + * + * The stack provides a mutex for each device that will disallow API + * calls happening concurrently; thus, op calls into the driver + * through the wimax_dev->op*() function pointers will always be + * serialized and *never* concurrent. + * + * For locking, take wimax_dev->mutex is taken; (most) operations in + * the API have to check for wimax_dev_is_ready() to return 0 before + * continuing (this is done internally). + * + * REFERENCE COUNTING: + * + * The WiMAX device is reference counted by the associated network + * device. The only operation that can be used to reference the device + * is wimax_dev_get_by_genl_info(), and the reference it acquires has + * to be released with dev_put(wimax_dev->net_dev). + * + * RFKILL: + * + * At startup, both HW and SW radio switchess are assumed to be off. + * + * At initialization time [after calling wimax_dev_add()], have the + * driver query the device for the status of the software and hardware + * RF kill switches and call wimax_report_rfkill_hw() and + * wimax_rfkill_report_sw() to indicate their state. If any is + * missing, just call it to indicate it is ON (radio always on). + * + * Whenever the driver detects a change in the state of the RF kill + * switches, it should call wimax_report_rfkill_hw() or + * wimax_report_rfkill_sw() to report it to the stack. + */ +struct wimax_dev { + struct net_device *net_dev; + struct list_head id_table_node; + struct mutex mutex; /* Protects all members and API calls */ + struct mutex mutex_reset; + enum wimax_st state; + + int (*op_msg_from_user)(struct wimax_dev *wimax_dev, + const char *, + const void *, size_t, + const struct genl_info *info); + int (*op_rfkill_sw_toggle)(struct wimax_dev *wimax_dev, + enum wimax_rf_state); + int (*op_reset)(struct wimax_dev *wimax_dev); + + struct rfkill *rfkill; + unsigned int rf_hw; + unsigned int rf_sw; + char name[32]; + + struct dentry *debugfs_dentry; +}; + + + +/* + * WiMAX stack public API for device drivers + * ----------------------------------------- + * + * These functions are not exported to user space. + */ +void wimax_dev_init(struct wimax_dev *); +int wimax_dev_add(struct wimax_dev *, struct net_device *); +void wimax_dev_rm(struct wimax_dev *); + +static inline +struct wimax_dev *net_dev_to_wimax(struct net_device *net_dev) +{ + return netdev_priv(net_dev); +} + +static inline +struct device *wimax_dev_to_dev(struct wimax_dev *wimax_dev) +{ + return wimax_dev->net_dev->dev.parent; +} + +void wimax_state_change(struct wimax_dev *, enum wimax_st); +enum wimax_st wimax_state_get(struct wimax_dev *); + +/* + * Radio Switch state reporting. + * + * enum wimax_rf_state is declared in linux/wimax.h so the exports + * to user space can use it. + */ +void wimax_report_rfkill_hw(struct wimax_dev *, enum wimax_rf_state); +void wimax_report_rfkill_sw(struct wimax_dev *, enum wimax_rf_state); + + +/* + * Free-form messaging to/from user space + * + * Sending a message: + * + * wimax_msg(wimax_dev, pipe_name, buf, buf_size, GFP_KERNEL); + * + * Broken up: + * + * skb = wimax_msg_alloc(wimax_dev, pipe_name, buf_size, GFP_KERNEL); + * ...fill up skb... 
+ * wimax_msg_send(wimax_dev, pipe_name, skb); + * + * Be sure not to modify skb->data in the middle (ie: don't use + * skb_push()/skb_pull()/skb_reserve() on the skb). + * + * "pipe_name" is any string, that can be interpreted as the name of + * the pipe or recipient; the interpretation of it is driver + * specific, so the recipient can multiplex it as wished. It can be + * NULL, it won't be used - an example is using a "diagnostics" tag to + * send diagnostics information that a device-specific diagnostics + * tool would be interested in. + */ +struct sk_buff *wimax_msg_alloc(struct wimax_dev *, const char *, const void *, + size_t, gfp_t); +int wimax_msg_send(struct wimax_dev *, struct sk_buff *); +int wimax_msg(struct wimax_dev *, const char *, const void *, size_t, gfp_t); + +const void *wimax_msg_data_len(struct sk_buff *, size_t *); +const void *wimax_msg_data(struct sk_buff *); +ssize_t wimax_msg_len(struct sk_buff *); + + +/* + * WiMAX stack user space API + * -------------------------- + * + * This API is what gets exported to user space for general + * operations. As well, they can be called from within the kernel, + * (with a properly referenced `struct wimax_dev`). + * + * Properly referenced means: the 'struct net_device' that embeds the + * device's control structure and (as such) the 'struct wimax_dev' is + * referenced by the caller. + */ +int wimax_rfkill(struct wimax_dev *, enum wimax_rf_state); +int wimax_reset(struct wimax_dev *); + +#endif /* #ifndef __NET__WIMAX_H__ */ diff --git a/drivers/staging/wimax/op-msg.c b/drivers/staging/wimax/op-msg.c new file mode 100644 index 000000000000..e20ac7d84e82 --- /dev/null +++ b/drivers/staging/wimax/op-msg.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Linux WiMAX + * Generic messaging interface between userspace and driver/device + * + * Copyright (C) 2007-2008 Intel Corporation + * Inaky Perez-Gonzalez + * + * This implements a direct communication channel between user space and + * the driver/device, by which free form messages can be sent back and + * forth. + * + * This is intended for device-specific features, vendor quirks, etc. + * + * See include/net/wimax.h + * + * GENERIC NETLINK ENCODING AND CAPACITY + * + * A destination "pipe name" is added to each message; it is up to the + * drivers to assign or use those names (if using them at all). + * + * Messages are encoded as a binary netlink attribute using nla_put() + * using type NLA_UNSPEC (as some versions of libnl still in + * deployment don't yet understand NLA_BINARY). + * + * The maximum capacity of this transport is PAGESIZE per message (so + * the actual payload will be bit smaller depending on the + * netlink/generic netlink attributes and headers). + * + * RECEPTION OF MESSAGES + * + * When a message is received from user space, it is passed verbatim + * to the driver calling wimax_dev->op_msg_from_user(). The return + * value from this function is passed back to user space as an ack + * over the generic netlink protocol. + * + * The stack doesn't do any processing or interpretation of these + * messages. + * + * SENDING MESSAGES + * + * Messages can be sent with wimax_msg(). + * + * If the message delivery needs to happen on a different context to + * that of its creation, wimax_msg_alloc() can be used to get a + * pointer to the message that can be delivered later on with + * wimax_msg_send(). 
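+ *
+ * For example (a sketch; "diag" is a hypothetical pipe name and
+ * buf/len a driver buffer), a driver deferring delivery could do:
+ *
+ *   skb = wimax_msg_alloc(wimax_dev, "diag", buf, len, GFP_KERNEL);
+ *   if (IS_ERR(skb))
+ *           return PTR_ERR(skb);
+ *   ...
+ *   return wimax_msg_send(wimax_dev, skb);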
+ * + * ROADMAP + * + * wimax_gnl_doit_msg_from_user() Process a message from user space + * wimax_dev_get_by_genl_info() + * wimax_dev->op_msg_from_user() Delivery of message to the driver + * + * wimax_msg() Send a message to user space + * wimax_msg_alloc() + * wimax_msg_send() + */ +#include +#include +#include +#include +#include "linux-wimax.h" +#include +#include +#include "wimax-internal.h" + + +#define D_SUBMODULE op_msg +#include "debug-levels.h" + + +/** + * wimax_msg_alloc - Create a new skb for sending a message to userspace + * + * @wimax_dev: WiMAX device descriptor + * @pipe_name: "named pipe" the message will be sent to + * @msg: pointer to the message data to send + * @size: size of the message to send (in bytes), including the header. + * @gfp_flags: flags for memory allocation. + * + * Returns: %0 if ok, negative errno code on error + * + * Description: + * + * Allocates an skb that will contain the message to send to user + * space over the messaging pipe and initializes it, copying the + * payload. + * + * Once this call is done, you can deliver it with + * wimax_msg_send(). + * + * IMPORTANT: + * + * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as + * wimax_msg_send() depends on skb->data being placed at the + * beginning of the user message. + * + * Unlike other WiMAX stack calls, this call can be used way early, + * even before wimax_dev_add() is called, as long as the + * wimax_dev->net_dev pointer is set to point to a proper + * net_dev. This is so that drivers can use it early in case they need + * to send stuff around or communicate with user space. + */ +struct sk_buff *wimax_msg_alloc(struct wimax_dev *wimax_dev, + const char *pipe_name, + const void *msg, size_t size, + gfp_t gfp_flags) +{ + int result; + struct device *dev = wimax_dev_to_dev(wimax_dev); + size_t msg_size; + void *genl_msg; + struct sk_buff *skb; + + msg_size = nla_total_size(size) + + nla_total_size(sizeof(u32)) + + (pipe_name ? nla_total_size(strlen(pipe_name)) : 0); + result = -ENOMEM; + skb = genlmsg_new(msg_size, gfp_flags); + if (skb == NULL) + goto error_new; + genl_msg = genlmsg_put(skb, 0, 0, &wimax_gnl_family, + 0, WIMAX_GNL_OP_MSG_TO_USER); + if (genl_msg == NULL) { + dev_err(dev, "no memory to create generic netlink message\n"); + goto error_genlmsg_put; + } + result = nla_put_u32(skb, WIMAX_GNL_MSG_IFIDX, + wimax_dev->net_dev->ifindex); + if (result < 0) { + dev_err(dev, "no memory to add ifindex attribute\n"); + goto error_nla_put; + } + if (pipe_name) { + result = nla_put_string(skb, WIMAX_GNL_MSG_PIPE_NAME, + pipe_name); + if (result < 0) { + dev_err(dev, "no memory to add pipe_name attribute\n"); + goto error_nla_put; + } + } + result = nla_put(skb, WIMAX_GNL_MSG_DATA, size, msg); + if (result < 0) { + dev_err(dev, "no memory to add payload (msg %p size %zu) in " + "attribute: %d\n", msg, size, result); + goto error_nla_put; + } + genlmsg_end(skb, genl_msg); + return skb; + +error_nla_put: +error_genlmsg_put: +error_new: + nlmsg_free(skb); + return ERR_PTR(result); +} +EXPORT_SYMBOL_GPL(wimax_msg_alloc); + + +/** + * wimax_msg_data_len - Return a pointer and size of a message's payload + * + * @msg: Pointer to a message created with wimax_msg_alloc() + * @size: Pointer to where to store the message's size + * + * Returns the pointer to the message data. 
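+ *
+ * Typical use, on a message built with wimax_msg_alloc() (sketch):
+ *
+ *   size_t size;
+ *   const void *data = wimax_msg_data_len(msg, &size);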
+ */ +const void *wimax_msg_data_len(struct sk_buff *msg, size_t *size) +{ + struct nlmsghdr *nlh = (void *) msg->head; + struct nlattr *nla; + + nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), + WIMAX_GNL_MSG_DATA); + if (nla == NULL) { + pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + return NULL; + } + *size = nla_len(nla); + return nla_data(nla); +} +EXPORT_SYMBOL_GPL(wimax_msg_data_len); + + +/** + * wimax_msg_data - Return a pointer to a message's payload + * + * @msg: Pointer to a message created with wimax_msg_alloc() + */ +const void *wimax_msg_data(struct sk_buff *msg) +{ + struct nlmsghdr *nlh = (void *) msg->head; + struct nlattr *nla; + + nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), + WIMAX_GNL_MSG_DATA); + if (nla == NULL) { + pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + return NULL; + } + return nla_data(nla); +} +EXPORT_SYMBOL_GPL(wimax_msg_data); + + +/** + * wimax_msg_len - Return a message's payload length + * + * @msg: Pointer to a message created with wimax_msg_alloc() + */ +ssize_t wimax_msg_len(struct sk_buff *msg) +{ + struct nlmsghdr *nlh = (void *) msg->head; + struct nlattr *nla; + + nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), + WIMAX_GNL_MSG_DATA); + if (nla == NULL) { + pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + return -EINVAL; + } + return nla_len(nla); +} +EXPORT_SYMBOL_GPL(wimax_msg_len); + + +/** + * wimax_msg_send - Send a pre-allocated message to user space + * + * @wimax_dev: WiMAX device descriptor + * + * @skb: &struct sk_buff returned by wimax_msg_alloc(). Note the + * ownership of @skb is transferred to this function. + * + * Returns: 0 if ok, < 0 errno code on error + * + * Description: + * + * Sends a free-form message that was preallocated with + * wimax_msg_alloc() and filled up. + * + * Assumes that once you pass an skb to this function for sending, it + * owns it and will release it when done (on success). + * + * IMPORTANT: + * + * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as + * wimax_msg_send() depends on skb->data being placed at the + * beginning of the user message. + * + * Unlike other WiMAX stack calls, this call can be used way early, + * even before wimax_dev_add() is called, as long as the + * wimax_dev->net_dev pointer is set to point to a proper + * net_dev. This is so that drivers can use it early in case they need + * to send stuff around or communicate with user space. + */ +int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb) +{ + struct device *dev = wimax_dev_to_dev(wimax_dev); + void *msg = skb->data; + size_t size = skb->len; + might_sleep(); + + d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size); + d_dump(2, dev, msg, size); + genlmsg_multicast(&wimax_gnl_family, skb, 0, 0, GFP_KERNEL); + d_printf(1, dev, "CTX: genl multicast done\n"); + return 0; +} +EXPORT_SYMBOL_GPL(wimax_msg_send); + + +/** + * wimax_msg - Send a message to user space + * + * @wimax_dev: WiMAX device descriptor (properly referenced) + * @pipe_name: "named pipe" the message will be sent to + * @buf: pointer to the message to send. + * @size: size of the buffer pointed to by @buf (in bytes). + * @gfp_flags: flags for memory allocation. + * + * Returns: %0 if ok, negative errno code on error. + * + * Description: + * + * Sends a free-form message to user space on the device @wimax_dev. + * + * NOTES: + * + * Once the @skb is given to this function, who will own it and will + * release it when done (unless it returns error). 
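+ *
+ * A single-shot send, for illustration (buf/len are a hypothetical
+ * driver buffer):
+ *
+ *   result = wimax_msg(wimax_dev, NULL, buf, len, GFP_KERNEL);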
+ */ +int wimax_msg(struct wimax_dev *wimax_dev, const char *pipe_name, + const void *buf, size_t size, gfp_t gfp_flags) +{ + int result = -ENOMEM; + struct sk_buff *skb; + + skb = wimax_msg_alloc(wimax_dev, pipe_name, buf, size, gfp_flags); + if (IS_ERR(skb)) + result = PTR_ERR(skb); + else + result = wimax_msg_send(wimax_dev, skb); + return result; +} +EXPORT_SYMBOL_GPL(wimax_msg); + +/* + * Relays a message from user space to the driver + * + * The skb is passed to the driver-specific function with the netlink + * and generic netlink headers already stripped. + * + * This call will block while handling/relaying the message. + */ +int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info) +{ + int result, ifindex; + struct wimax_dev *wimax_dev; + struct device *dev; + struct nlmsghdr *nlh = info->nlhdr; + char *pipe_name; + void *msg_buf; + size_t msg_len; + + might_sleep(); + d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); + result = -ENODEV; + if (info->attrs[WIMAX_GNL_MSG_IFIDX] == NULL) { + pr_err("WIMAX_GNL_MSG_FROM_USER: can't find IFIDX attribute\n"); + goto error_no_wimax_dev; + } + ifindex = nla_get_u32(info->attrs[WIMAX_GNL_MSG_IFIDX]); + wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); + if (wimax_dev == NULL) + goto error_no_wimax_dev; + dev = wimax_dev_to_dev(wimax_dev); + + /* Unpack arguments */ + result = -EINVAL; + if (info->attrs[WIMAX_GNL_MSG_DATA] == NULL) { + dev_err(dev, "WIMAX_GNL_MSG_FROM_USER: can't find MSG_DATA " + "attribute\n"); + goto error_no_data; + } + msg_buf = nla_data(info->attrs[WIMAX_GNL_MSG_DATA]); + msg_len = nla_len(info->attrs[WIMAX_GNL_MSG_DATA]); + + if (info->attrs[WIMAX_GNL_MSG_PIPE_NAME] == NULL) + pipe_name = NULL; + else { + struct nlattr *attr = info->attrs[WIMAX_GNL_MSG_PIPE_NAME]; + size_t attr_len = nla_len(attr); + /* libnl-1.1 does not yet support NLA_NUL_STRING */ + result = -ENOMEM; + pipe_name = kstrndup(nla_data(attr), attr_len + 1, GFP_KERNEL); + if (pipe_name == NULL) + goto error_alloc; + pipe_name[attr_len] = 0; + } + mutex_lock(&wimax_dev->mutex); + result = wimax_dev_is_ready(wimax_dev); + if (result == -ENOMEDIUM) + result = 0; + if (result < 0) + goto error_not_ready; + result = -ENOSYS; + if (wimax_dev->op_msg_from_user == NULL) + goto error_noop; + + d_printf(1, dev, + "CRX: nlmsghdr len %u type %u flags 0x%04x seq 0x%x pid %u\n", + nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags, + nlh->nlmsg_seq, nlh->nlmsg_pid); + d_printf(1, dev, "CRX: wimax message %zu bytes\n", msg_len); + d_dump(2, dev, msg_buf, msg_len); + + result = wimax_dev->op_msg_from_user(wimax_dev, pipe_name, + msg_buf, msg_len, info); +error_noop: +error_not_ready: + mutex_unlock(&wimax_dev->mutex); +error_alloc: + kfree(pipe_name); +error_no_data: + dev_put(wimax_dev->net_dev); +error_no_wimax_dev: + d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); + return result; +} diff --git a/drivers/staging/wimax/op-reset.c b/drivers/staging/wimax/op-reset.c new file mode 100644 index 000000000000..b3f000cbe112 --- /dev/null +++ b/drivers/staging/wimax/op-reset.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Linux WiMAX + * Implement and export a method for resetting a WiMAX device + * + * Copyright (C) 2008 Intel Corporation + * Inaky Perez-Gonzalez + * + * This implements a simple synchronous call to reset a WiMAX device. 
+ *
+ * Resets aim at being warm, keeping the device handles active;
+ * however, when that fails, it falls back to a cold reset (that will
+ * disconnect and reconnect the device).
+ */
+
+#include "net-wimax.h"
+#include
+#include "linux-wimax.h"
+#include
+#include
+#include "wimax-internal.h"
+
+#define D_SUBMODULE op_reset
+#include "debug-levels.h"
+
+
+/**
+ * wimax_reset - Reset a WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * Returns:
+ *
+ * %0 if ok and a warm reset was done (the device still exists in
+ * the system).
+ *
+ * -%ENODEV if a cold/bus reset had to be done (device has
+ * disconnected and reconnected, so current handle is not valid
+ * any more).
+ *
+ * -%EINVAL if the device is not even registered.
+ *
+ * Any other negative error code shall be considered as
+ * non-recoverable.
+ *
+ * Description:
+ *
+ * Called when wanting to reset the device for any reason. Device is
+ * taken back to power on status.
+ *
+ * This call blocks; on successful return, the device has completed the
+ * reset process and is ready to operate.
+ */
+int wimax_reset(struct wimax_dev *wimax_dev)
+{
+	int result = -EINVAL;
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	enum wimax_st state;
+
+	might_sleep();
+	d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev);
+	mutex_lock(&wimax_dev->mutex);
+	dev_hold(wimax_dev->net_dev);
+	state = wimax_dev->state;
+	mutex_unlock(&wimax_dev->mutex);
+
+	if (state >= WIMAX_ST_DOWN) {
+		mutex_lock(&wimax_dev->mutex_reset);
+		result = wimax_dev->op_reset(wimax_dev);
+		mutex_unlock(&wimax_dev->mutex_reset);
+	}
+	dev_put(wimax_dev->net_dev);
+
+	d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result);
+	return result;
+}
+EXPORT_SYMBOL(wimax_reset);
+
+
+/*
+ * Exporting to user space over generic netlink
+ *
+ * Parse the reset command from user space, return error code.
+ *
+ * No attributes.
+ */
+int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info)
+{
+	int result, ifindex;
+	struct wimax_dev *wimax_dev;
+
+	d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
+	result = -ENODEV;
+	if (info->attrs[WIMAX_GNL_RESET_IFIDX] == NULL) {
+		pr_err("WIMAX_GNL_OP_RESET: can't find IFIDX attribute\n");
+		goto error_no_wimax_dev;
+	}
+	ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RESET_IFIDX]);
+	wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
+	if (wimax_dev == NULL)
+		goto error_no_wimax_dev;
+	/* Execute the operation and send the result back to user space */
+	result = wimax_reset(wimax_dev);
+	dev_put(wimax_dev->net_dev);
+error_no_wimax_dev:
+	d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
+	return result;
+}
diff --git a/drivers/staging/wimax/op-rfkill.c b/drivers/staging/wimax/op-rfkill.c
new file mode 100644
index 000000000000..78b294481a59
--- /dev/null
+++ b/drivers/staging/wimax/op-rfkill.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Linux WiMAX
+ * RF-kill framework integration
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Inaky Perez-Gonzalez
+ *
+ * This integrates into the Linux Kernel rfkill subsystem so that the
+ * drivers just have to do the bare minimal work, which is providing a
+ * method to set the software RF-Kill switch and to report changes in
+ * the software and hardware switch status.
+ *
+ * A non-polled generic rfkill device is embedded into the WiMAX
+ * subsystem's representation of a device.
+ *
+ * FIXME: Need polled support? Let drivers provide a poll routine
+ * and hand it to rfkill ops then?
+ *
+ * All device drivers have to do is, after wimax_dev_init(), call
+ * wimax_report_rfkill_hw() and wimax_report_rfkill_sw() to update
+ * initial state and then every time it changes. See wimax.h:struct
+ * wimax_dev for more information.
+ *
+ * ROADMAP
+ *
+ * wimax_gnl_doit_rfkill()            User space calling wimax_rfkill()
+ *   wimax_rfkill()                   Kernel calling wimax_rfkill()
+ *     __wimax_rf_toggle_radio()
+ *
+ * wimax_rfkill_set_radio_block()     RF-Kill subsystem calling
+ *   __wimax_rf_toggle_radio()
+ *
+ * __wimax_rf_toggle_radio()
+ *   wimax_dev->op_rfkill_sw_toggle() Driver backend
+ *   __wimax_state_change()
+ *
+ * wimax_report_rfkill_sw()           Driver reports state change
+ *   __wimax_state_change()
+ *
+ * wimax_report_rfkill_hw()           Driver reports state change
+ *   __wimax_state_change()
+ *
+ * wimax_rfkill_add()                 Initialize/shutdown rfkill support
+ * wimax_rfkill_rm()                  [called by wimax_dev_add/rm()]
+ */
+
+#include "net-wimax.h"
+#include
+#include "linux-wimax.h"
+#include
+#include
+#include
+#include "wimax-internal.h"
+
+#define D_SUBMODULE op_rfkill
+#include "debug-levels.h"
+
+/**
+ * wimax_report_rfkill_hw - Reports changes in the hardware RF switch
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * @state: New state of the RF Kill switch. %WIMAX_RF_ON radio on,
+ *     %WIMAX_RF_OFF radio off.
+ *
+ * When the device detects a change in the state of the hardware RF
+ * switch, it must call this function to let the WiMAX kernel stack
+ * know that the state has changed so it can be properly propagated.
+ *
+ * The WiMAX stack caches the state (the driver doesn't need to). As
+ * well, as the change is propagated it will come back as a request to
+ * change the software state to mirror the hardware state.
+ *
+ * If the device doesn't have a hardware kill switch, just report
+ * it on initialization as always on (%WIMAX_RF_ON, radio on).
+ */
+void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev,
+			    enum wimax_rf_state state)
+{
+	int result;
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	enum wimax_st wimax_state;
+
+	d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
+	BUG_ON(state == WIMAX_RF_QUERY);
+	BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF);
+
+	mutex_lock(&wimax_dev->mutex);
+	result = wimax_dev_is_ready(wimax_dev);
+	if (result < 0)
+		goto error_not_ready;
+
+	if (state != wimax_dev->rf_hw) {
+		wimax_dev->rf_hw = state;
+		if (wimax_dev->rf_hw == WIMAX_RF_ON &&
+		    wimax_dev->rf_sw == WIMAX_RF_ON)
+			wimax_state = WIMAX_ST_READY;
+		else
+			wimax_state = WIMAX_ST_RADIO_OFF;
+
+		result = rfkill_set_hw_state(wimax_dev->rfkill,
+					     state == WIMAX_RF_OFF);
+
+		__wimax_state_change(wimax_dev, wimax_state);
+	}
+error_not_ready:
+	mutex_unlock(&wimax_dev->mutex);
+	d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n",
+		wimax_dev, state, result);
+}
+EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw);
+
+
+/**
+ * wimax_report_rfkill_sw - Reports changes in the software RF switch
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * @state: New state of the RF kill switch. %WIMAX_RF_ON radio on,
+ *     %WIMAX_RF_OFF radio off.
+ *
+ * Reports changes in the software RF switch state to the WiMAX stack.
+ *
+ * The main use is during initialization, so the driver can query the
+ * device for its current software radio kill switch state and feed it
+ * to the system.
+ *
+ * As a side note, the device is not supposed to change the software
+ * state by itself; in practice, though, this can happen, as the
+ * device might decide to switch (in software) the radio off for
+ * different reasons.
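+ *
+ * A sketch of the expected driver call during setup, once the device
+ * has been queried for its software switch state (illustrative only;
+ * assumes the device reported the radio as on):
+ *
+ *	wimax_report_rfkill_sw(wimax_dev, WIMAX_RF_ON);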
+ */ +void wimax_report_rfkill_sw(struct wimax_dev *wimax_dev, + enum wimax_rf_state state) +{ + int result; + struct device *dev = wimax_dev_to_dev(wimax_dev); + enum wimax_st wimax_state; + + d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); + BUG_ON(state == WIMAX_RF_QUERY); + BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF); + + mutex_lock(&wimax_dev->mutex); + result = wimax_dev_is_ready(wimax_dev); + if (result < 0) + goto error_not_ready; + + if (state != wimax_dev->rf_sw) { + wimax_dev->rf_sw = state; + if (wimax_dev->rf_hw == WIMAX_RF_ON && + wimax_dev->rf_sw == WIMAX_RF_ON) + wimax_state = WIMAX_ST_READY; + else + wimax_state = WIMAX_ST_RADIO_OFF; + __wimax_state_change(wimax_dev, wimax_state); + rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); + } +error_not_ready: + mutex_unlock(&wimax_dev->mutex); + d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n", + wimax_dev, state, result); +} +EXPORT_SYMBOL_GPL(wimax_report_rfkill_sw); + + +/* + * Callback for the RF Kill toggle operation + * + * This function is called by: + * + * - The rfkill subsystem when the RF-Kill key is pressed in the + * hardware and the driver notifies through + * wimax_report_rfkill_hw(). The rfkill subsystem ends up calling back + * here so the software RF Kill switch state is changed to reflect + * the hardware switch state. + * + * - When the user sets the state through sysfs' rfkill/state file + * + * - When the user calls wimax_rfkill(). + * + * This call blocks! + * + * WARNING! When we call rfkill_unregister(), this will be called with + * state 0! + * + * WARNING: wimax_dev must be locked + */ +static +int __wimax_rf_toggle_radio(struct wimax_dev *wimax_dev, + enum wimax_rf_state state) +{ + int result = 0; + struct device *dev = wimax_dev_to_dev(wimax_dev); + enum wimax_st wimax_state; + + might_sleep(); + d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); + if (wimax_dev->rf_sw == state) + goto out_no_change; + if (wimax_dev->op_rfkill_sw_toggle != NULL) + result = wimax_dev->op_rfkill_sw_toggle(wimax_dev, state); + else if (state == WIMAX_RF_OFF) /* No op? can't turn off */ + result = -ENXIO; + else /* No op? can turn on */ + result = 0; /* should never happen tho */ + if (result >= 0) { + result = 0; + wimax_dev->rf_sw = state; + wimax_state = state == WIMAX_RF_ON ? + WIMAX_ST_READY : WIMAX_ST_RADIO_OFF; + __wimax_state_change(wimax_dev, wimax_state); + } +out_no_change: + d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", + wimax_dev, state, result); + return result; +} + + +/* + * Translate from rfkill state to wimax state + * + * NOTE: Special state handling rules here + * + * Just pretend the call didn't happen if we are in a state where + * we know for sure it cannot be handled (WIMAX_ST_DOWN or + * __WIMAX_ST_QUIESCING). rfkill() needs it to register and + * unregister, as it will run this path. + * + * NOTE: This call will block until the operation is completed. 
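+ *
+ * For reference, this callback is wired into the rfkill core through
+ * the struct rfkill_ops defined below, roughly as done in
+ * wimax_rfkill_add():
+ *
+ *	rfkill_alloc(wimax_dev->name, dev, RFKILL_TYPE_WIMAX,
+ *		     &wimax_rfkill_ops, wimax_dev);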
+ */ +static int wimax_rfkill_set_radio_block(void *data, bool blocked) +{ + int result; + struct wimax_dev *wimax_dev = data; + struct device *dev = wimax_dev_to_dev(wimax_dev); + enum wimax_rf_state rf_state; + + d_fnstart(3, dev, "(wimax_dev %p blocked %u)\n", wimax_dev, blocked); + rf_state = WIMAX_RF_ON; + if (blocked) + rf_state = WIMAX_RF_OFF; + mutex_lock(&wimax_dev->mutex); + if (wimax_dev->state <= __WIMAX_ST_QUIESCING) + result = 0; + else + result = __wimax_rf_toggle_radio(wimax_dev, rf_state); + mutex_unlock(&wimax_dev->mutex); + d_fnend(3, dev, "(wimax_dev %p blocked %u) = %d\n", + wimax_dev, blocked, result); + return result; +} + +static const struct rfkill_ops wimax_rfkill_ops = { + .set_block = wimax_rfkill_set_radio_block, +}; + +/** + * wimax_rfkill - Set the software RF switch state for a WiMAX device + * + * @wimax_dev: WiMAX device descriptor + * + * @state: New RF state. + * + * Returns: + * + * >= 0 toggle state if ok, < 0 errno code on error. The toggle state + * is returned as a bitmap, bit 0 being the hardware RF state, bit 1 + * the software RF state. + * + * 0 means disabled (%WIMAX_RF_ON, radio on), 1 means enabled radio + * off (%WIMAX_RF_OFF). + * + * Description: + * + * Called by the user when he wants to request the WiMAX radio to be + * switched on (%WIMAX_RF_ON) or off (%WIMAX_RF_OFF). With + * %WIMAX_RF_QUERY, just the current state is returned. + * + * NOTE: + * + * This call will block until the operation is complete. + */ +int wimax_rfkill(struct wimax_dev *wimax_dev, enum wimax_rf_state state) +{ + int result; + struct device *dev = wimax_dev_to_dev(wimax_dev); + + d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); + mutex_lock(&wimax_dev->mutex); + result = wimax_dev_is_ready(wimax_dev); + if (result < 0) { + /* While initializing, < 1.4.3 wimax-tools versions use + * this call to check if the device is a valid WiMAX + * device; so we allow it to proceed always, + * considering the radios are all off. 
*/ + if (result == -ENOMEDIUM && state == WIMAX_RF_QUERY) + result = WIMAX_RF_OFF << 1 | WIMAX_RF_OFF; + goto error_not_ready; + } + switch (state) { + case WIMAX_RF_ON: + case WIMAX_RF_OFF: + result = __wimax_rf_toggle_radio(wimax_dev, state); + if (result < 0) + goto error; + rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); + break; + case WIMAX_RF_QUERY: + break; + default: + result = -EINVAL; + goto error; + } + result = wimax_dev->rf_sw << 1 | wimax_dev->rf_hw; +error: +error_not_ready: + mutex_unlock(&wimax_dev->mutex); + d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", + wimax_dev, state, result); + return result; +} +EXPORT_SYMBOL(wimax_rfkill); + + +/* + * Register a new WiMAX device's RF Kill support + * + * WARNING: wimax_dev->mutex must be unlocked + */ +int wimax_rfkill_add(struct wimax_dev *wimax_dev) +{ + int result; + struct rfkill *rfkill; + struct device *dev = wimax_dev_to_dev(wimax_dev); + + d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); + /* Initialize RF Kill */ + result = -ENOMEM; + rfkill = rfkill_alloc(wimax_dev->name, dev, RFKILL_TYPE_WIMAX, + &wimax_rfkill_ops, wimax_dev); + if (rfkill == NULL) + goto error_rfkill_allocate; + + d_printf(1, dev, "rfkill %p\n", rfkill); + + wimax_dev->rfkill = rfkill; + + rfkill_init_sw_state(rfkill, 1); + result = rfkill_register(wimax_dev->rfkill); + if (result < 0) + goto error_rfkill_register; + + /* If there is no SW toggle op, SW RFKill is always on */ + if (wimax_dev->op_rfkill_sw_toggle == NULL) + wimax_dev->rf_sw = WIMAX_RF_ON; + + d_fnend(3, dev, "(wimax_dev %p) = 0\n", wimax_dev); + return 0; + +error_rfkill_register: + rfkill_destroy(wimax_dev->rfkill); +error_rfkill_allocate: + d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); + return result; +} + + +/* + * Deregister a WiMAX device's RF Kill support + * + * Ick, we can't call rfkill_free() after rfkill_unregister()...oh + * well. + * + * WARNING: wimax_dev->mutex must be unlocked + */ +void wimax_rfkill_rm(struct wimax_dev *wimax_dev) +{ + struct device *dev = wimax_dev_to_dev(wimax_dev); + d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); + rfkill_unregister(wimax_dev->rfkill); + rfkill_destroy(wimax_dev->rfkill); + d_fnend(3, dev, "(wimax_dev %p)\n", wimax_dev); +} + + +/* + * Exporting to user space over generic netlink + * + * Parse the rfkill command from user space, return a combination + * value that describe the states of the different toggles. + * + * Only one attribute: the new state requested (on, off or no change, + * just query). 
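+ *
+ * For illustration, a request would carry (hypothetical values):
+ *
+ *	WIMAX_GNL_RFKILL_IFIDX = 2		(u32, interface index)
+ *	WIMAX_GNL_RFKILL_STATE = WIMAX_RF_OFF	(u32, requested state)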
+ */ + +int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info) +{ + int result, ifindex; + struct wimax_dev *wimax_dev; + struct device *dev; + enum wimax_rf_state new_state; + + d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); + result = -ENODEV; + if (info->attrs[WIMAX_GNL_RFKILL_IFIDX] == NULL) { + pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n"); + goto error_no_wimax_dev; + } + ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]); + wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); + if (wimax_dev == NULL) + goto error_no_wimax_dev; + dev = wimax_dev_to_dev(wimax_dev); + result = -EINVAL; + if (info->attrs[WIMAX_GNL_RFKILL_STATE] == NULL) { + dev_err(dev, "WIMAX_GNL_RFKILL: can't find RFKILL_STATE " + "attribute\n"); + goto error_no_pid; + } + new_state = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_STATE]); + + /* Execute the operation and send the result back to user space */ + result = wimax_rfkill(wimax_dev, new_state); +error_no_pid: + dev_put(wimax_dev->net_dev); +error_no_wimax_dev: + d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); + return result; +} diff --git a/drivers/staging/wimax/op-state-get.c b/drivers/staging/wimax/op-state-get.c new file mode 100644 index 000000000000..c5bfbed505f5 --- /dev/null +++ b/drivers/staging/wimax/op-state-get.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Linux WiMAX + * Implement and export a method for getting a WiMAX device current state + * + * Copyright (C) 2009 Paulius Zaleckas + * + * Based on previous WiMAX core work by: + * Copyright (C) 2008 Intel Corporation + * Inaky Perez-Gonzalez + */ + +#include "net-wimax.h" +#include +#include "linux-wimax.h" +#include +#include "wimax-internal.h" + +#define D_SUBMODULE op_state_get +#include "debug-levels.h" + + +/* + * Exporting to user space over generic netlink + * + * Parse the state get command from user space, return a combination + * value that describe the current state. + * + * No attributes. + */ +int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info) +{ + int result, ifindex; + struct wimax_dev *wimax_dev; + + d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); + result = -ENODEV; + if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) { + pr_err("WIMAX_GNL_OP_STATE_GET: can't find IFIDX attribute\n"); + goto error_no_wimax_dev; + } + ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]); + wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); + if (wimax_dev == NULL) + goto error_no_wimax_dev; + /* Execute the operation and send the result back to user space */ + result = wimax_state_get(wimax_dev); + dev_put(wimax_dev->net_dev); +error_no_wimax_dev: + d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); + return result; +} diff --git a/drivers/staging/wimax/stack.c b/drivers/staging/wimax/stack.c new file mode 100644 index 000000000000..ace24a6dfd2d --- /dev/null +++ b/drivers/staging/wimax/stack.c @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Linux WiMAX + * Initialization, addition and removal of wimax devices + * + * Copyright (C) 2005-2006 Intel Corporation + * Inaky Perez-Gonzalez + * + * This implements: + * + * - basic life cycle of 'struct wimax_dev' [wimax_dev_*()]; on + * addition/registration initialize all subfields and allocate + * generic netlink resources for user space communication. On + * removal/unregistration, undo all that. 
+ * + * - device state machine [wimax_state_change()] and support to send + * reports to user space when the state changes + * [wimax_gnl_re_state_change*()]. + * + * See include/net/wimax.h for rationales and design. + * + * ROADMAP + * + * [__]wimax_state_change() Called by drivers to update device's state + * wimax_gnl_re_state_change_alloc() + * wimax_gnl_re_state_change_send() + * + * wimax_dev_init() Init a device + * wimax_dev_add() Register + * wimax_rfkill_add() + * wimax_gnl_add() Register all the generic netlink resources. + * wimax_id_table_add() + * wimax_dev_rm() Unregister + * wimax_id_table_rm() + * wimax_gnl_rm() + * wimax_rfkill_rm() + */ +#include +#include +#include +#include +#include "linux-wimax.h" +#include +#include "wimax-internal.h" + + +#define D_SUBMODULE stack +#include "debug-levels.h" + +static char wimax_debug_params[128]; +module_param_string(debug, wimax_debug_params, sizeof(wimax_debug_params), + 0644); +MODULE_PARM_DESC(debug, + "String of space-separated NAME:VALUE pairs, where NAMEs " + "are the different debug submodules and VALUE are the " + "initial debug value to set."); + +/* + * Authoritative source for the RE_STATE_CHANGE attribute policy + * + * We don't really use it here, but /me likes to keep the definition + * close to where the data is generated. + */ +/* +static const struct nla_policy wimax_gnl_re_status_change[WIMAX_GNL_ATTR_MAX + 1] = { + [WIMAX_GNL_STCH_STATE_OLD] = { .type = NLA_U8 }, + [WIMAX_GNL_STCH_STATE_NEW] = { .type = NLA_U8 }, +}; +*/ + + +/* + * Allocate a Report State Change message + * + * @header: save it, you need it for _send() + * + * Creates and fills a basic state change message; different code + * paths can then add more attributes to the message as needed. + * + * Use wimax_gnl_re_state_change_send() to send the returned skb. + * + * Returns: skb with the genl message if ok, IS_ERR() ptr on error + * with an errno code. 
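+ *
+ * Expected pairing, as done below in __wimax_state_change() (sketch):
+ *
+ *	void *header;
+ *	struct sk_buff *skb;
+ *
+ *	skb = wimax_gnl_re_state_change_alloc(wimax_dev, new, old, &header);
+ *	if (!IS_ERR(skb))
+ *		wimax_gnl_re_state_change_send(wimax_dev, skb, header);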
+ */ +static +struct sk_buff *wimax_gnl_re_state_change_alloc( + struct wimax_dev *wimax_dev, + enum wimax_st new_state, enum wimax_st old_state, + void **header) +{ + int result; + struct device *dev = wimax_dev_to_dev(wimax_dev); + void *data; + struct sk_buff *report_skb; + + d_fnstart(3, dev, "(wimax_dev %p new_state %u old_state %u)\n", + wimax_dev, new_state, old_state); + result = -ENOMEM; + report_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (report_skb == NULL) { + dev_err(dev, "RE_STCH: can't create message\n"); + goto error_new; + } + /* FIXME: sending a group ID as the seq is wrong */ + data = genlmsg_put(report_skb, 0, wimax_gnl_family.mcgrp_offset, + &wimax_gnl_family, 0, WIMAX_GNL_RE_STATE_CHANGE); + if (data == NULL) { + dev_err(dev, "RE_STCH: can't put data into message\n"); + goto error_put; + } + *header = data; + + result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_OLD, old_state); + if (result < 0) { + dev_err(dev, "RE_STCH: Error adding OLD attr: %d\n", result); + goto error_put; + } + result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_NEW, new_state); + if (result < 0) { + dev_err(dev, "RE_STCH: Error adding NEW attr: %d\n", result); + goto error_put; + } + result = nla_put_u32(report_skb, WIMAX_GNL_STCH_IFIDX, + wimax_dev->net_dev->ifindex); + if (result < 0) { + dev_err(dev, "RE_STCH: Error adding IFINDEX attribute\n"); + goto error_put; + } + d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %p\n", + wimax_dev, new_state, old_state, report_skb); + return report_skb; + +error_put: + nlmsg_free(report_skb); +error_new: + d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %d\n", + wimax_dev, new_state, old_state, result); + return ERR_PTR(result); +} + + +/* + * Send a Report State Change message (as created with _alloc). + * + * @report_skb: as returned by wimax_gnl_re_state_change_alloc() + * @header: as returned by wimax_gnl_re_state_change_alloc() + * + * Returns: 0 if ok, < 0 errno code on error. + * + * If the message is NULL, pretend it didn't happen. + */ +static +int wimax_gnl_re_state_change_send( + struct wimax_dev *wimax_dev, struct sk_buff *report_skb, + void *header) +{ + int result = 0; + struct device *dev = wimax_dev_to_dev(wimax_dev); + d_fnstart(3, dev, "(wimax_dev %p report_skb %p)\n", + wimax_dev, report_skb); + if (report_skb == NULL) { + result = -ENOMEM; + goto out; + } + genlmsg_end(report_skb, header); + genlmsg_multicast(&wimax_gnl_family, report_skb, 0, 0, GFP_KERNEL); +out: + d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n", + wimax_dev, report_skb, result); + return result; +} + + +static +void __check_new_state(enum wimax_st old_state, enum wimax_st new_state, + unsigned int allowed_states_bm) +{ + if (WARN_ON(((1 << new_state) & allowed_states_bm) == 0)) { + pr_err("SW BUG! Forbidden state change %u -> %u\n", + old_state, new_state); + } +} + + +/* + * Set the current state of a WiMAX device [unlocking version of + * wimax_state_change(). + */ +void __wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state) +{ + struct device *dev = wimax_dev_to_dev(wimax_dev); + enum wimax_st old_state = wimax_dev->state; + struct sk_buff *stch_skb; + void *header; + + d_fnstart(3, dev, "(wimax_dev %p new_state %u [old %u])\n", + wimax_dev, new_state, old_state); + + if (WARN_ON(new_state >= __WIMAX_ST_INVALID)) { + dev_err(dev, "SW BUG: requesting invalid state %u\n", + new_state); + goto out; + } + if (old_state == new_state) + goto out; + header = NULL; /* gcc complains? 
can't grok why */ + stch_skb = wimax_gnl_re_state_change_alloc( + wimax_dev, new_state, old_state, &header); + + /* Verify the state transition and do exit-from-state actions */ + switch (old_state) { + case __WIMAX_ST_NULL: + __check_new_state(old_state, new_state, + 1 << WIMAX_ST_DOWN); + break; + case WIMAX_ST_DOWN: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_UNINITIALIZED + | 1 << WIMAX_ST_RADIO_OFF); + break; + case __WIMAX_ST_QUIESCING: + __check_new_state(old_state, new_state, 1 << WIMAX_ST_DOWN); + break; + case WIMAX_ST_UNINITIALIZED: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_RADIO_OFF); + break; + case WIMAX_ST_RADIO_OFF: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_READY); + break; + case WIMAX_ST_READY: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_RADIO_OFF + | 1 << WIMAX_ST_SCANNING + | 1 << WIMAX_ST_CONNECTING + | 1 << WIMAX_ST_CONNECTED); + break; + case WIMAX_ST_SCANNING: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_RADIO_OFF + | 1 << WIMAX_ST_READY + | 1 << WIMAX_ST_CONNECTING + | 1 << WIMAX_ST_CONNECTED); + break; + case WIMAX_ST_CONNECTING: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_RADIO_OFF + | 1 << WIMAX_ST_READY + | 1 << WIMAX_ST_SCANNING + | 1 << WIMAX_ST_CONNECTED); + break; + case WIMAX_ST_CONNECTED: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_RADIO_OFF + | 1 << WIMAX_ST_READY); + netif_tx_disable(wimax_dev->net_dev); + netif_carrier_off(wimax_dev->net_dev); + break; + case __WIMAX_ST_INVALID: + default: + dev_err(dev, "SW BUG: wimax_dev %p is in unknown state %u\n", + wimax_dev, wimax_dev->state); + WARN_ON(1); + goto out; + } + + /* Execute the actions of entry to the new state */ + switch (new_state) { + case __WIMAX_ST_NULL: + dev_err(dev, "SW BUG: wimax_dev %p entering NULL state " + "from %u\n", wimax_dev, wimax_dev->state); + WARN_ON(1); /* Nobody can enter this state */ + break; + case WIMAX_ST_DOWN: + break; + case __WIMAX_ST_QUIESCING: + break; + case WIMAX_ST_UNINITIALIZED: + break; + case WIMAX_ST_RADIO_OFF: + break; + case WIMAX_ST_READY: + break; + case WIMAX_ST_SCANNING: + break; + case WIMAX_ST_CONNECTING: + break; + case WIMAX_ST_CONNECTED: + netif_carrier_on(wimax_dev->net_dev); + netif_wake_queue(wimax_dev->net_dev); + break; + case __WIMAX_ST_INVALID: + default: + BUG(); + } + __wimax_state_set(wimax_dev, new_state); + if (!IS_ERR(stch_skb)) + wimax_gnl_re_state_change_send(wimax_dev, stch_skb, header); +out: + d_fnend(3, dev, "(wimax_dev %p new_state %u [old %u]) = void\n", + wimax_dev, new_state, old_state); +} + + +/** + * wimax_state_change - Set the current state of a WiMAX device + * + * @wimax_dev: WiMAX device descriptor (properly referenced) + * @new_state: New state to switch to + * + * This implements the state changes for the wimax devices. It will + * + * - verify that the state transition is legal (for now it'll just + * print a warning if not) according to the table in + * linux/wimax.h's documentation for 'enum wimax_st'. + * + * - perform the actions needed for leaving the current state and + * whichever are needed for entering the new state. + * + * - issue a report to user space indicating the new state (and an + * optional payload with information about the new state). 
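+ *
+ * A typical driver call once, say, a connection has been established
+ * (illustrative only):
+ *
+ *	wimax_state_change(wimax_dev, WIMAX_ST_CONNECTED);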
+ * + * NOTE: @wimax_dev must be locked + */ +void wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state) +{ + /* + * A driver cannot take the wimax_dev out of the + * __WIMAX_ST_NULL state unless by calling wimax_dev_add(). If + * the wimax_dev's state is still NULL, we ignore any request + * to change its state because it means it hasn't been yet + * registered. + * + * There is no need to complain about it, as routines that + * call this might be shared from different code paths that + * are called before or after wimax_dev_add() has done its + * job. + */ + mutex_lock(&wimax_dev->mutex); + if (wimax_dev->state > __WIMAX_ST_NULL) + __wimax_state_change(wimax_dev, new_state); + mutex_unlock(&wimax_dev->mutex); +} +EXPORT_SYMBOL_GPL(wimax_state_change); + + +/** + * wimax_state_get() - Return the current state of a WiMAX device + * + * @wimax_dev: WiMAX device descriptor + * + * Returns: Current state of the device according to its driver. + */ +enum wimax_st wimax_state_get(struct wimax_dev *wimax_dev) +{ + enum wimax_st state; + mutex_lock(&wimax_dev->mutex); + state = wimax_dev->state; + mutex_unlock(&wimax_dev->mutex); + return state; +} +EXPORT_SYMBOL_GPL(wimax_state_get); + + +/** + * wimax_dev_init - initialize a newly allocated instance + * + * @wimax_dev: WiMAX device descriptor to initialize. + * + * Initializes fields of a freshly allocated @wimax_dev instance. This + * function assumes that after allocation, the memory occupied by + * @wimax_dev was zeroed. + */ +void wimax_dev_init(struct wimax_dev *wimax_dev) +{ + INIT_LIST_HEAD(&wimax_dev->id_table_node); + __wimax_state_set(wimax_dev, __WIMAX_ST_NULL); + mutex_init(&wimax_dev->mutex); + mutex_init(&wimax_dev->mutex_reset); +} +EXPORT_SYMBOL_GPL(wimax_dev_init); + +/* + * There are multiple enums reusing the same values, adding + * others is only possible if they use a compatible policy. + */ +static const struct nla_policy wimax_gnl_policy[WIMAX_GNL_ATTR_MAX + 1] = { + /* + * WIMAX_GNL_RESET_IFIDX, WIMAX_GNL_RFKILL_IFIDX, + * WIMAX_GNL_STGET_IFIDX, WIMAX_GNL_MSG_IFIDX + */ + [1] = { .type = NLA_U32, }, + /* + * WIMAX_GNL_RFKILL_STATE, WIMAX_GNL_MSG_PIPE_NAME + */ + [2] = { .type = NLA_U32, }, /* enum wimax_rf_state */ + /* + * WIMAX_GNL_MSG_DATA + */ + [3] = { .type = NLA_UNSPEC, }, /* libnl doesn't grok BINARY yet */ +}; + +static const struct genl_small_ops wimax_gnl_ops[] = { + { + .cmd = WIMAX_GNL_OP_MSG_FROM_USER, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = wimax_gnl_doit_msg_from_user, + }, + { + .cmd = WIMAX_GNL_OP_RESET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = wimax_gnl_doit_reset, + }, + { + .cmd = WIMAX_GNL_OP_RFKILL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = wimax_gnl_doit_rfkill, + }, + { + .cmd = WIMAX_GNL_OP_STATE_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = wimax_gnl_doit_state_get, + }, +}; + + +static +size_t wimax_addr_scnprint(char *addr_str, size_t addr_str_size, + unsigned char *addr, size_t addr_len) +{ + unsigned int cnt, total; + + for (total = cnt = 0; cnt < addr_len; cnt++) + total += scnprintf(addr_str + total, addr_str_size - total, + "%02x%c", addr[cnt], + cnt == addr_len - 1 ? 
'\0' : ':');
+	return total;
+}
+
+
+/**
+ * wimax_dev_add - Register a new WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor (as embedded in your @net_dev's
+ *     priv data). You must have called wimax_dev_init() on it before.
+ *
+ * @net_dev: net device the @wimax_dev is associated with. The
+ *     function expects SET_NETDEV_DEV() and register_netdev() were
+ *     already called on it.
+ *
+ * Registers the new WiMAX device, sets up the user-kernel control
+ * interface (generic netlink) and common WiMAX infrastructure.
+ *
+ * Note that the parts that will allow interaction with user space are
+ * setup at the very end, when the rest is in place, as once that
+ * happens, the driver might get user space control requests via
+ * netlink or from debugfs that might translate into calls into
+ * wimax_dev->op_*().
+ */
+int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev)
+{
+	int result;
+	struct device *dev = net_dev->dev.parent;
+	char addr_str[32];
+
+	d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev);
+
+	/* Do the RFKILL setup before locking, as RFKILL will call
+	 * into our functions.
+	 */
+	wimax_dev->net_dev = net_dev;
+	result = wimax_rfkill_add(wimax_dev);
+	if (result < 0)
+		goto error_rfkill_add;
+
+	/* Set up user-space interaction */
+	mutex_lock(&wimax_dev->mutex);
+	wimax_id_table_add(wimax_dev);
+	wimax_debugfs_add(wimax_dev);
+
+	__wimax_state_set(wimax_dev, WIMAX_ST_DOWN);
+	mutex_unlock(&wimax_dev->mutex);
+
+	wimax_addr_scnprint(addr_str, sizeof(addr_str),
+			    net_dev->dev_addr, net_dev->addr_len);
+	dev_err(dev, "WiMAX interface %s (%s) ready\n",
+		net_dev->name, addr_str);
+	d_fnend(3, dev, "(wimax_dev %p net_dev %p) = 0\n", wimax_dev, net_dev);
+	return 0;

+error_rfkill_add:
+	d_fnend(3, dev, "(wimax_dev %p net_dev %p) = %d\n",
+		wimax_dev, net_dev, result);
+	return result;
+}
+EXPORT_SYMBOL_GPL(wimax_dev_add);
+
+
+/**
+ * wimax_dev_rm - Unregister an existing WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * Unregisters a WiMAX device previously registered with
+ * wimax_dev_add().
+ *
+ * IMPORTANT! Must call before calling unregister_netdev().
+ *
+ * After this function returns, you will not get any more user space
+ * control requests (via netlink or debugfs) and thus no more calls
+ * into wimax_dev->op_*().
+ *
+ * Reentrancy control is ensured by setting the state to
+ * %__WIMAX_ST_QUIESCING. rfkill operations coming through
+ * wimax_*rfkill*() will be stopped by the quiescing state; ops coming
+ * from the rfkill subsystem will be stopped by the support being
+ * removed by wimax_rfkill_rm().
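+ *
+ * A sketch of the expected teardown order in a driver (bus-specific
+ * cleanup omitted; illustrative only):
+ *
+ *	wimax_dev_rm(wimax_dev);
+ *	unregister_netdev(net_dev);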
+ */
+void wimax_dev_rm(struct wimax_dev *wimax_dev)
+{
+	d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev);
+
+	mutex_lock(&wimax_dev->mutex);
+	__wimax_state_change(wimax_dev, __WIMAX_ST_QUIESCING);
+	wimax_debugfs_rm(wimax_dev);
+	wimax_id_table_rm(wimax_dev);
+	__wimax_state_change(wimax_dev, WIMAX_ST_DOWN);
+	mutex_unlock(&wimax_dev->mutex);
+	wimax_rfkill_rm(wimax_dev);
+	d_fnend(3, NULL, "(wimax_dev %p) = void\n", wimax_dev);
+}
+EXPORT_SYMBOL_GPL(wimax_dev_rm);
+
+
+/* Debug framework control of debug levels */
+struct d_level D_LEVEL[] = {
+	D_SUBMODULE_DEFINE(debugfs),
+	D_SUBMODULE_DEFINE(id_table),
+	D_SUBMODULE_DEFINE(op_msg),
+	D_SUBMODULE_DEFINE(op_reset),
+	D_SUBMODULE_DEFINE(op_rfkill),
+	D_SUBMODULE_DEFINE(op_state_get),
+	D_SUBMODULE_DEFINE(stack),
+};
+size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL);
+
+
+static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
+	{ .name = "msg", },
+};
+
+struct genl_family wimax_gnl_family __ro_after_init = {
+	.name = "WiMAX",
+	.version = WIMAX_GNL_VERSION,
+	.hdrsize = 0,
+	.maxattr = WIMAX_GNL_ATTR_MAX,
+	.policy = wimax_gnl_policy,
+	.module = THIS_MODULE,
+	.small_ops = wimax_gnl_ops,
+	.n_small_ops = ARRAY_SIZE(wimax_gnl_ops),
+	.mcgrps = wimax_gnl_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps),
+};
+
+
+
+/* Initialize the wimax stack */
+static
+int __init wimax_subsys_init(void)
+{
+	int result;
+
+	d_fnstart(4, NULL, "()\n");
+	d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params,
+		       "wimax.debug");
+
+	result = genl_register_family(&wimax_gnl_family);
+	if (unlikely(result < 0)) {
+		pr_err("cannot register generic netlink family: %d\n", result);
+		goto error_register_family;
+	}
+
+	d_fnend(4, NULL, "() = 0\n");
+	return 0;
+
+error_register_family:
+	d_fnend(4, NULL, "() = %d\n", result);
+	return result;
+}
+module_init(wimax_subsys_init);
+
+
+/* Shutdown the wimax stack */
+static
+void __exit wimax_subsys_exit(void)
+{
+	wimax_id_table_release();
+	genl_unregister_family(&wimax_gnl_family);
+}
+module_exit(wimax_subsys_exit);
+
+MODULE_AUTHOR("Intel Corporation ");
+MODULE_DESCRIPTION("Linux WiMAX stack");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/wimax/wimax-internal.h b/drivers/staging/wimax/wimax-internal.h
new file mode 100644
index 000000000000..a6b6990642a1
--- /dev/null
+++ b/drivers/staging/wimax/wimax-internal.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Linux WiMAX
+ * Internal API for kernel space WiMAX stack
+ *
+ * Copyright (C) 2007 Intel Corporation
+ * Inaky Perez-Gonzalez
+ *
+ * This header file is for declarations and definitions internal to
+ * the WiMAX stack. For public APIs and documentation, see
+ * include/net/wimax.h and include/linux/wimax.h.
+ */
+
+#ifndef __WIMAX_INTERNAL_H__
+#define __WIMAX_INTERNAL_H__
+#ifdef __KERNEL__
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include
+#include "net-wimax.h"
+
+
+/*
+ * Decide if a (locked) device is ready for use
+ *
+ * Before using the device structure, it must be locked
+ * (wimax_dev->mutex). As well, most operations need to call this
+ * function to check if the state is the right one.
+ *
+ * An error value will be returned if the state is not the right
+ * one. In that case, the caller should not attempt to use the device
+ * and just unlock it.
+ */
+static inline __must_check
+int wimax_dev_is_ready(struct wimax_dev *wimax_dev)
+{
+	if (wimax_dev->state == __WIMAX_ST_NULL)
+		return -EINVAL; /* Device is not even registered!
*/ + if (wimax_dev->state == WIMAX_ST_DOWN) + return -ENOMEDIUM; + if (wimax_dev->state == __WIMAX_ST_QUIESCING) + return -ESHUTDOWN; + return 0; +} + + +static inline +void __wimax_state_set(struct wimax_dev *wimax_dev, enum wimax_st state) +{ + wimax_dev->state = state; +} +void __wimax_state_change(struct wimax_dev *, enum wimax_st); + +#ifdef CONFIG_DEBUG_FS +void wimax_debugfs_add(struct wimax_dev *); +void wimax_debugfs_rm(struct wimax_dev *); +#else +static inline void wimax_debugfs_add(struct wimax_dev *wimax_dev) {} +static inline void wimax_debugfs_rm(struct wimax_dev *wimax_dev) {} +#endif + +void wimax_id_table_add(struct wimax_dev *); +struct wimax_dev *wimax_dev_get_by_genl_info(struct genl_info *, int); +void wimax_id_table_rm(struct wimax_dev *); +void wimax_id_table_release(void); + +int wimax_rfkill_add(struct wimax_dev *); +void wimax_rfkill_rm(struct wimax_dev *); + +/* generic netlink */ +extern struct genl_family wimax_gnl_family; + +/* ops */ +int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info); +int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info); +int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info); +int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info); + +#endif /* #ifdef __KERNEL__ */ +#endif /* #ifndef __WIMAX_INTERNAL_H__ */ diff --git a/include/linux/wimax/debug.h b/include/linux/wimax/debug.h deleted file mode 100644 index cdae052bcdcd..000000000000 --- a/include/linux/wimax/debug.h +++ /dev/null @@ -1,491 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Linux WiMAX - * Collection of tools to manage debug operations. - * - * Copyright (C) 2005-2007 Intel Corporation - * Inaky Perez-Gonzalez - * - * Don't #include this file directly, read on! - * - * EXECUTING DEBUGGING ACTIONS OR NOT - * - * The main thing this framework provides is decission power to take a - * debug action (like printing a message) if the current debug level - * allows it. - * - * The decission power is at two levels: at compile-time (what does - * not make it is compiled out) and at run-time. The run-time - * selection is done per-submodule (as they are declared by the user - * of the framework). - * - * A call to d_test(L) (L being the target debug level) returns true - * if the action should be taken because the current debug levels - * allow it (both compile and run time). - * - * It follows that a call to d_test() that can be determined to be - * always false at compile time will get the code depending on it - * compiled out by optimization. - * - * DEBUG LEVELS - * - * It is up to the caller to define how much a debugging level is. - * - * Convention sets 0 as "no debug" (so an action marked as debug level 0 - * will always be taken). The increasing debug levels are used for - * increased verbosity. - * - * USAGE - * - * Group the code in modules and submodules inside each module [which - * in most cases maps to Linux modules and .c files that compose - * those]. - * - * For each module, there is: - * - * - a MODULENAME (single word, legal C identifier) - * - * - a debug-levels.h header file that declares the list of - * submodules and that is included by all .c files that use - * the debugging tools. The file name can be anything. - * - * - some (optional) .c code to manipulate the runtime debug levels - * through debugfs. 
- * - * The debug-levels.h file would look like: - * - * #ifndef __debug_levels__h__ - * #define __debug_levels__h__ - * - * #define D_MODULENAME modulename - * #define D_MASTER 10 - * - * #include - * - * enum d_module { - * D_SUBMODULE_DECLARE(submodule_1), - * D_SUBMODULE_DECLARE(submodule_2), - * ... - * D_SUBMODULE_DECLARE(submodule_N) - * }; - * - * #endif - * - * D_MASTER is the maximum compile-time debug level; any debug actions - * above this will be out. D_MODULENAME is the module name (legal C - * identifier), which has to be unique for each module (to avoid - * namespace collisions during linkage). Note those #defines need to - * be done before #including debug.h - * - * We declare N different submodules whose debug level can be - * independently controlled during runtime. - * - * In a .c file of the module (and only in one of them), define the - * following code: - * - * struct d_level D_LEVEL[] = { - * D_SUBMODULE_DEFINE(submodule_1), - * D_SUBMODULE_DEFINE(submodule_2), - * ... - * D_SUBMODULE_DEFINE(submodule_N), - * }; - * size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); - * - * Externs for d_level_MODULENAME and d_level_size_MODULENAME are used - * and declared in this file using the D_LEVEL and D_LEVEL_SIZE macros - * #defined also in this file. - * - * To manipulate from user space the levels, create a debugfs dentry - * and then register each submodule with: - * - * d_level_register_debugfs("PREFIX_", submodule_X, parent); - * - * Where PREFIX_ is a name of your chosing. This will create debugfs - * file with a single numeric value that can be use to tweak it. To - * remove the entires, just use debugfs_remove_recursive() on 'parent'. - * - * NOTE: remember that even if this will show attached to some - * particular instance of a device, the settings are *global*. - * - * On each submodule (for example, .c files), the debug infrastructure - * should be included like this: - * - * #define D_SUBMODULE submodule_x // matches one in debug-levels.h - * #include "debug-levels.h" - * - * after #including all your include files. - * - * Now you can use the d_*() macros below [d_test(), d_fnstart(), - * d_fnend(), d_printf(), d_dump()]. - * - * If their debug level is greater than D_MASTER, they will be - * compiled out. - * - * If their debug level is lower or equal than D_MASTER but greater - * than the current debug level of their submodule, they'll be - * ignored. - * - * Otherwise, the action will be performed. - */ -#ifndef __debug__h__ -#define __debug__h__ - -#include -#include - -struct device; - -/* Backend stuff */ - -/* - * Debug backend: generate a message header from a 'struct device' - * - * @head: buffer where to place the header - * @head_size: length of @head - * @dev: pointer to device used to generate a header from. If NULL, - * an empty ("") header is generated. - */ -static inline -void __d_head(char *head, size_t head_size, - struct device *dev) -{ - if (dev == NULL) - head[0] = 0; - else if ((unsigned long)dev < 4096) { - printk(KERN_ERR "E: Corrupt dev %p\n", dev); - WARN_ON(1); - } else - snprintf(head, head_size, "%s %s: ", - dev_driver_string(dev), dev_name(dev)); -} - - -/* - * Debug backend: log some message if debugging is enabled - * - * @l: intended debug level - * @tag: tag to prefix the message with - * @dev: 'struct device' associated to this message - * @f: printf-like format and arguments - * - * Note this is optimized out if it doesn't pass the compile-time - * check; however, it is *always* compiled. 
This is useful to make - * sure the printf-like formats and variables are always checked and - * they don't get bit rot if you have all the debugging disabled. - */ -#define _d_printf(l, tag, dev, f, a...) \ -do { \ - char head[64]; \ - if (!d_test(l)) \ - break; \ - __d_head(head, sizeof(head), dev); \ - printk(KERN_ERR "%s%s%s: " f, head, __func__, tag, ##a); \ -} while (0) - - -/* - * CPP syntactic sugar to generate A_B like symbol names when one of - * the arguments is a preprocessor #define. - */ -#define __D_PASTE__(varname, modulename) varname##_##modulename -#define __D_PASTE(varname, modulename) (__D_PASTE__(varname, modulename)) -#define _D_SUBMODULE_INDEX(_name) (D_SUBMODULE_DECLARE(_name)) - - -/* - * Store a submodule's runtime debug level and name - */ -struct d_level { - u8 level; - const char *name; -}; - - -/* - * List of available submodules and their debug levels - * - * We call them d_level_MODULENAME and d_level_size_MODULENAME; the - * macros D_LEVEL and D_LEVEL_SIZE contain the name already for - * convenience. - * - * This array and the size are defined on some .c file that is part of - * the current module. - */ -#define D_LEVEL __D_PASTE(d_level, D_MODULENAME) -#define D_LEVEL_SIZE __D_PASTE(d_level_size, D_MODULENAME) - -extern struct d_level D_LEVEL[]; -extern size_t D_LEVEL_SIZE; - - -/* - * Frontend stuff - * - * - * Stuff you need to declare prior to using the actual "debug" actions - * (defined below). - */ - -#ifndef D_MODULENAME -#error D_MODULENAME is not defined in your debug-levels.h file -/** - * D_MODULE - Name of the current module - * - * #define in your module's debug-levels.h, making sure it is - * unique. This has to be a legal C identifier. - */ -#define D_MODULENAME undefined_modulename -#endif - - -#ifndef D_MASTER -#warning D_MASTER not defined, but debug.h included! [see docs] -/** - * D_MASTER - Compile time maximum debug level - * - * #define in your debug-levels.h file to the maximum debug level the - * runtime code will be allowed to have. This allows you to provide a - * main knob. - * - * Anything above that level will be optimized out of the compile. - * - * Defaults to zero (no debug code compiled in). - * - * Maximum one definition per module (at the debug-levels.h file). - */ -#define D_MASTER 0 -#endif - -#ifndef D_SUBMODULE -#error D_SUBMODULE not defined, but debug.h included! [see docs] -/** - * D_SUBMODULE - Name of the current submodule - * - * #define in your submodule .c file before #including debug-levels.h - * to the name of the current submodule as previously declared and - * defined with D_SUBMODULE_DECLARE() (in your module's - * debug-levels.h) and D_SUBMODULE_DEFINE(). - * - * This is used to provide runtime-control over the debug levels. - * - * Maximum one per .c file! Can be shared among different .c files - * (meaning they belong to the same submodule categorization). - */ -#define D_SUBMODULE undefined_module -#endif - - -/** - * D_SUBMODULE_DECLARE - Declare a submodule for runtime debug level control - * - * @_name: name of the submodule, restricted to the chars that make up a - * valid C identifier ([a-zA-Z0-9_]). - * - * Declare in the module's debug-levels.h header file as: - * - * enum d_module { - * D_SUBMODULE_DECLARE(submodule_1), - * D_SUBMODULE_DECLARE(submodule_2), - * D_SUBMODULE_DECLARE(submodule_3), - * }; - * - * Some corresponding .c file needs to have a matching - * D_SUBMODULE_DEFINE(). 
- */ -#define D_SUBMODULE_DECLARE(_name) __D_SUBMODULE_##_name - - -/** - * D_SUBMODULE_DEFINE - Define a submodule for runtime debug level control - * - * @_name: name of the submodule, restricted to the chars that make up a - * valid C identifier ([a-zA-Z0-9_]). - * - * Use once per module (in some .c file) as: - * - * static - * struct d_level d_level_SUBMODULENAME[] = { - * D_SUBMODULE_DEFINE(submodule_1), - * D_SUBMODULE_DEFINE(submodule_2), - * D_SUBMODULE_DEFINE(submodule_3), - * }; - * size_t d_level_size_SUBDMODULENAME = ARRAY_SIZE(d_level_SUBDMODULENAME); - * - * Matching D_SUBMODULE_DECLARE()s have to be present in a - * debug-levels.h header file. - */ -#define D_SUBMODULE_DEFINE(_name) \ -[__D_SUBMODULE_##_name] = { \ - .level = 0, \ - .name = #_name \ -} - - - -/* The actual "debug" operations */ - - -/** - * d_test - Returns true if debugging should be enabled - * - * @l: intended debug level (unsigned) - * - * If the master debug switch is enabled and the current settings are - * higher or equal to the requested level, then debugging - * output/actions should be enabled. - * - * NOTE: - * - * This needs to be coded so that it can be evaluated in compile - * time; this is why the ugly BUG_ON() is placed in there, so the - * D_MASTER evaluation compiles all out if it is compile-time false. - */ -#define d_test(l) \ -({ \ - unsigned __l = l; /* type enforcer */ \ - (D_MASTER) >= __l \ - && ({ \ - BUG_ON(_D_SUBMODULE_INDEX(D_SUBMODULE) >= D_LEVEL_SIZE);\ - D_LEVEL[_D_SUBMODULE_INDEX(D_SUBMODULE)].level >= __l; \ - }); \ -}) - - -/** - * d_fnstart - log message at function start if debugging enabled - * - * @l: intended debug level - * @_dev: 'struct device' pointer, NULL if none (for context) - * @f: printf-like format and arguments - */ -#define d_fnstart(l, _dev, f, a...) _d_printf(l, " FNSTART", _dev, f, ## a) - - -/** - * d_fnend - log message at function end if debugging enabled - * - * @l: intended debug level - * @_dev: 'struct device' pointer, NULL if none (for context) - * @f: printf-like format and arguments - */ -#define d_fnend(l, _dev, f, a...) _d_printf(l, " FNEND", _dev, f, ## a) - - -/** - * d_printf - log message if debugging enabled - * - * @l: intended debug level - * @_dev: 'struct device' pointer, NULL if none (for context) - * @f: printf-like format and arguments - */ -#define d_printf(l, _dev, f, a...) _d_printf(l, "", _dev, f, ## a) - - -/** - * d_dump - log buffer hex dump if debugging enabled - * - * @l: intended debug level - * @_dev: 'struct device' pointer, NULL if none (for context) - * @f: printf-like format and arguments - */ -#define d_dump(l, dev, ptr, size) \ -do { \ - char head[64]; \ - if (!d_test(l)) \ - break; \ - __d_head(head, sizeof(head), dev); \ - print_hex_dump(KERN_ERR, head, 0, 16, 1, \ - ((void *) ptr), (size), 0); \ -} while (0) - - -/** - * Export a submodule's debug level over debugfs as PREFIXSUBMODULE - * - * @prefix: string to prefix the name with - * @submodule: name of submodule (not a string, just the name) - * @dentry: debugfs parent dentry - * - * For removing, just use debugfs_remove_recursive() on the parent. 
- */ -#define d_level_register_debugfs(prefix, name, parent) \ -({ \ - debugfs_create_u8( \ - prefix #name, 0600, parent, \ - &(D_LEVEL[__D_SUBMODULE_ ## name].level)); \ -}) - - -static inline -void d_submodule_set(struct d_level *d_level, size_t d_level_size, - const char *submodule, u8 level, const char *tag) -{ - struct d_level *itr, *top; - int index = -1; - - for (itr = d_level, top = itr + d_level_size; itr < top; itr++) { - index++; - if (itr->name == NULL) { - printk(KERN_ERR "%s: itr->name NULL?? (%p, #%d)\n", - tag, itr, index); - continue; - } - if (!strcmp(itr->name, submodule)) { - itr->level = level; - return; - } - } - printk(KERN_ERR "%s: unknown submodule %s\n", tag, submodule); -} - - -/** - * d_parse_params - Parse a string with debug parameters from the - * command line - * - * @d_level: level structure (D_LEVEL) - * @d_level_size: number of items in the level structure - * (D_LEVEL_SIZE). - * @_params: string with the parameters; this is a space (not tab!) - * separated list of NAME:VALUE, where value is the debug level - * and NAME is the name of the submodule. - * @tag: string for error messages (example: MODULE.ARGNAME). - */ -static inline -void d_parse_params(struct d_level *d_level, size_t d_level_size, - const char *_params, const char *tag) -{ - char submodule[130], *params, *params_orig, *token, *colon; - unsigned level, tokens; - - if (_params == NULL) - return; - params_orig = kstrdup(_params, GFP_KERNEL); - params = params_orig; - while (1) { - token = strsep(¶ms, " "); - if (token == NULL) - break; - if (*token == '\0') /* eat joint spaces */ - continue; - /* kernel's sscanf %s eats until whitespace, so we - * replace : by \n so it doesn't get eaten later by - * strsep */ - colon = strchr(token, ':'); - if (colon != NULL) - *colon = '\n'; - tokens = sscanf(token, "%s\n%u", submodule, &level); - if (colon != NULL) - *colon = ':'; /* set back, for error messages */ - if (tokens == 2) - d_submodule_set(d_level, d_level_size, - submodule, level, tag); - else - printk(KERN_ERR "%s: can't parse '%s' as a " - "SUBMODULE:LEVEL (%d tokens)\n", - tag, token, tokens); - } - kfree(params_orig); -} - -#endif /* #ifndef __debug__h__ */ diff --git a/include/net/wimax.h b/include/net/wimax.h deleted file mode 100644 index f6e31d2f47aa..000000000000 --- a/include/net/wimax.h +++ /dev/null @@ -1,503 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Linux WiMAX - * Kernel space API for accessing WiMAX devices - * - * Copyright (C) 2007-2008 Intel Corporation - * Inaky Perez-Gonzalez - * - * The WiMAX stack provides an API for controlling and managing the - * system's WiMAX devices. This API affects the control plane; the - * data plane is accessed via the network stack (netdev). - * - * Parts of the WiMAX stack API and notifications are exported to - * user space via Generic Netlink. In user space, libwimax (part of - * the wimax-tools package) provides a shim layer for accessing those - * calls. - * - * The API is standarized for all WiMAX devices and different drivers - * implement the backend support for it. However, device-specific - * messaging pipes are provided that can be used to issue commands and - * receive notifications in free form. - * - * Currently the messaging pipes are the only means of control as it - * is not known (due to the lack of more devices in the market) what - * will be a good abstraction layer. Expect this to change as more - * devices show in the market. This API is designed to be growable in - * order to address this problem. 
- *
- * USAGE
- *
- * Embed a `struct wimax_dev` at the beginning of the device's
- * private structure, initialize and register it. For details, see
- * `struct wimax_dev`'s documentation.
- *
- * Once this is done, wimax-tools's libwimaxll can be used to
- * communicate with the driver from user space. Your user space
- * application does not have to forcibly use libwimaxll and can talk
- * the generic netlink protocol directly if desired.
- *
- * Remember this is a very low level API that is used to provide all
- * of the WiMAX features. Other daemons and services running in user
- * space are the expected clients of it. They offer a higher level
- * API that applications should use (an example of this is Intel's
- * WiMAX Network Service for the i2400m).
- *
- * DESIGN
- *
- * Although not set in stone, this very basic interface is mostly
- * completed. Remember this is meant to grow as new common operations
- * are decided upon. New operations will be added to the interface,
- * the intent being to keep backwards compatibility as much as
- * possible.
- *
- * This layer implements a set of calls to control a WiMAX device,
- * exposing a frontend to the rest of the kernel and user space (via
- * generic netlink) and a backend implementation in the driver through
- * function pointers.
- *
- * WiMAX devices have a state, and a kernel-only API allows the
- * drivers to manipulate that state. State transitions are atomic, and
- * only some of them are allowed (see `enum wimax_st`).
- *
- * Most API calls will set the state automatically; in most cases
- * drivers have to only report state changes due to external
- * conditions.
- *
- * All API operations are 'atomic', serialized through a mutex in the
- * `struct wimax_dev`.
- *
- * EXPORTING TO USER SPACE THROUGH GENERIC NETLINK
- *
- * The API is exported to user space using generic netlink (other
- * methods can be added as needed).
- *
- * There is a Generic Netlink Family named "WiMAX", where interfaces
- * supporting the WiMAX interface receive commands and broadcast their
- * signals over a multicast group named "msg".
- *
- * Mapping to the source/destination interface is done by an interface
- * index attribute.
- *
- * For user-to-kernel traffic (commands) we use a function call
- * marshalling mechanism, where a message X with attributes A, B, C
- * sent from user space to kernel space means executing the WiMAX API
- * call wimax_X(A, B, C), sending the results back as a message.
- *
- * Kernel-to-user (notifications or signals) communication is sent
- * over multicast groups. This allows multiple applications to
- * monitor them.
- *
- * Each command/signal gets assigned its own attribute policy. This
- * way the validator will verify that all the attributes in there are
- * only the ones that should be for each command/signal. Think of an
- * attribute as mapping to a type+argumentname for each command/signal.
- *
- * If we had a single policy for *all* commands/signals, after running
- * the validator we'd have to check "does this attribute belong in
- * here?" for each one. It can be done manually, but it's just easier
- * to have the validator do that job with multiple policies. As well,
- * it makes it easier to later expand each command/signal signature
- * without affecting others and keeping the namespace more or less
- * sane. Not that it is too complicated, but it makes it even easier.
- *
- * No state information is maintained in the kernel for each user
- * space connection (the connection is stateless).
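As a concrete illustration of the function-call marshalling described above, here is a minimal, hypothetical doit handler in the style the stack uses; the attribute names come from the uapi header further down, and the handler name is invented:

	/*
	 * Sketch: unpack the attributes of a "WiMAX" generic netlink
	 * command and turn them into a plain API call; the return code
	 * travels back to user space as the netlink ack.
	 */
	static int wimax_gnl_doit_rfkill_sketch(struct sk_buff *skb,
						struct genl_info *info)
	{
		struct wimax_dev *wimax_dev;
		int ifindex, result;

		if (!info->attrs[WIMAX_GNL_RFKILL_IFIDX]
		    || !info->attrs[WIMAX_GNL_RFKILL_STATE])
			return -EINVAL;
		ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]);
		wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
		if (wimax_dev == NULL)
			return -ENODEV;
		/* message RFKILL(ifindex, state) becomes wimax_rfkill() */
		result = wimax_rfkill(wimax_dev,
				      nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_STATE]));
		dev_put(wimax_dev->net_dev);	/* drop the lookup reference */
		return result;
	}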
- *
- * TESTING FOR THE INTERFACE AND VERSIONING
- *
- * If network interface X is a WiMAX device, there will be a Generic
- * Netlink family named "WiMAX X" and the device will present a
- * "wimax" directory in its network sysfs directory
- * (/sys/class/net/DEVICE/wimax) [used by HAL].
- *
- * The absence of either of these means the device does not support
- * this WiMAX API.
- *
- * By querying the generic netlink controller, versioning information
- * and the multicast groups available can be found. Applications using
- * the interface can either rely on that or use the generic netlink
- * controller to figure out which generic netlink commands/signals are
- * supported.
- *
- * NOTE: this versioning is a last resort to avoid hard
- *    incompatibilities. It is the intention of the design of this
- *    stack not to introduce backward incompatible changes.
- *
- * The version code has to fit in one byte (restrictions imposed by
- * generic netlink); we use `version / 10` for the major version and
- * `version % 10` for the minor. This gives ten minors (0-9) for each
- * major and 25 majors.
- *
- * The version change protocol is as follows:
- *
- * - Major versions: needs to be increased if an existing message/API
- *   call is changed or removed. Doesn't need to be changed if a new
- *   message is added.
- *
- * - Minor version: needs to be increased if new messages/API calls are
- *   being added or some other consideration that doesn't impact the
- *   user-kernel interface too much (like some kind of bug fix) and
- *   that is kind of left up in the air to common sense.
- *
- * User space code should not try to work if the major version it was
- * compiled for differs from what the kernel offers. As well, if the
- * minor version of the kernel interface is lower than the one user
- * space is expecting (the one it was compiled for), the kernel
- * might be missing API calls; user space shall be ready to handle
- * said condition. Use the generic netlink controller operations to
- * find which ones are supported and which are not.
- *
- * libwimaxll:wimaxll_open() takes care of checking versions.
- *
- * THE OPERATIONS:
- *
- * Each operation is defined in its own file (drivers/net/wimax/op-*.c)
- * for clarity. The parts needed for an operation are:
- *
- *  - a function pointer in `struct wimax_dev`: optional, as the
- *    operation might be implemented by the stack and not by the
- *    driver.
- *
- *    All function pointers are named wimax_dev->op_*(), and drivers
- *    must implement them except where noted otherwise.
- *
- *  - When exported to user space, a `struct nla_policy` to define the
- *    attributes of the generic netlink command and a `struct genl_ops`
- *    to define the operation.
- *
- * All the declarations for the operation codes (WIMAX_GNL_OP_)
- * and generic netlink attributes (WIMAX_GNL__*) are declared in
- * include/linux/wimax.h; this file is intended to be cloned by user
- * space to gain access to those declarations.
- *
- * A few caveats to remember:
- *
- *  - Need to define attribute numbers starting at 1; otherwise it
- *    fails.
- *
- *  - the `struct genl_family` requires a maximum attribute id; when
- *    defining the `struct nla_policy` for each message, it has to have
- *    an array size of WIMAX_GNL_ATTR_MAX+1.
- *
- * The op_*() function pointers will not be called if the wimax_dev is
- * in a state <= %WIMAX_ST_UNINITIALIZED. The exception is:
- *
- * - op_reset: can be called at any time after wimax_dev_add() has
- *   been called.
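A small sketch of the byte-packed version check described above; the helper names are invented, and the "degrade gracefully" branch is the policy the comment text recommends:

	/* Split the one-byte version code into its decimal major/minor parts. */
	static inline unsigned wimax_ver_major(unsigned version)
	{
		return version / 10;
	}

	static inline unsigned wimax_ver_minor(unsigned version)
	{
		return version % 10;
	}

	/*
	 * Sketch of the user space policy: refuse a major mismatch
	 * outright; a lower kernel minor means some calls may be
	 * missing, so the caller must probe before using them.
	 */
	static int wimax_check_version(unsigned kernel_ver, unsigned compiled_ver)
	{
		if (wimax_ver_major(kernel_ver) != wimax_ver_major(compiled_ver))
			return -EPROTO;		/* hard incompatibility */
		if (wimax_ver_minor(kernel_ver) < wimax_ver_minor(compiled_ver))
			return 1;		/* usable, but probe for missing calls */
		return 0;
	}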
- *
- * THE PIPE INTERFACE:
- *
- * This interface is kept intentionally simple. The driver can send
- * and receive free-form messages to/from user space through a
- * pipe. See drivers/net/wimax/op-msg.c for details.
- *
- * The kernel-to-user messages are sent with wimax_msg();
- * user-to-kernel messages are delivered via
- * wimax_dev->op_msg_from_user().
- *
- * RFKILL:
- *
- * RFKILL support is built into the wimax_dev layer; the driver just
- * needs to call wimax_report_rfkill_{hw,sw}() to inform of changes in
- * the hardware or software RF kill switches. When the stack wants to
- * turn the radio off, it will call wimax_dev->op_rfkill_sw_toggle(),
- * which the driver implements.
- *
- * User space can set the software RF kill switch by calling
- * wimax_rfkill().
- *
- * The code for now only supports devices that don't require polling;
- * if the device needs to be polled, create a self-rearming delayed
- * work struct for polling or look into adding polled support to the
- * WiMAX stack.
- *
- * When initializing the hardware (_probe), after calling
- * wimax_dev_add(), query the device for the status of its RF kill
- * switches and feed it back to the WiMAX stack using
- * wimax_report_rfkill_{hw,sw}(). If any switch is missing, always
- * report it as ON.
- *
- * NOTE: the wimax stack uses an inverted terminology to that of the
- * RFKILL subsystem:
- *
- *  - ON: radio is ON, RFKILL is DISABLED or OFF.
- *  - OFF: radio is OFF, RFKILL is ENABLED or ON.
- *
- * MISCELLANEOUS OPS:
- *
- * wimax_reset() can be used to reset the device to power on state; by
- * default it issues a warm reset that maintains the same device
- * node. If that is not possible, it falls back to a cold reset
- * (device reconnect). The driver implements the backend to this
- * through wimax_dev->op_reset().
- */
-
-#ifndef __NET__WIMAX_H__
-#define __NET__WIMAX_H__
-
-#include <linux/wimax.h>
-#include <net/genetlink.h>
-#include <linux/netdevice.h>
-
-struct net_device;
-struct genl_info;
-struct wimax_dev;
-
-/**
- * struct wimax_dev - Generic WiMAX device
- *
- * @net_dev: [fill] Pointer to the &struct net_device this WiMAX
- *     device implements.
- *
- * @op_msg_from_user: [fill] Driver-specific operation to
- *     handle a raw message from user space to the driver. The
- *     driver can send messages to user space using
- *     wimax_msg().
- *
- * @op_rfkill_sw_toggle: [fill] Driver-specific operation to act on
- *     userspace (or any other agent) requesting the WiMAX device to
- *     change the RF Kill software switch (WIMAX_RF_ON or
- *     WIMAX_RF_OFF).
- *     If such hardware support is not present, it is assumed the
- *     radio cannot be switched off and it is always on (and the stack
- *     will error out when trying to switch it off). In such case,
- *     this function pointer can be left as NULL.
- *
- * @op_reset: [fill] Driver specific operation to reset the
- *     device.
- *     This operation should always attempt first a warm reset that
- *     does not disconnect the device from the bus and return 0.
- *     If that fails, it should resort to some sort of cold or bus
- *     reset (even if it implies a bus disconnection and device
- *     disappearance). In that case, -ENODEV should be returned to
- *     indicate the device is gone.
- *     This operation has to be synchronous, and return only when the
- *     reset is complete. In case of having had to resort to bus/cold
- *     reset implying a device disconnection, the call is allowed to
- *     return immediately.
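A minimal driver-side sketch of the op_reset contract just described; the warm/cold reset helpers are placeholders for whatever the bus driver actually provides:

	/*
	 * Sketch: try a warm reset first; if the device does not come
	 * back, fall back to a cold/bus reset and report the
	 * disconnect per the contract above.
	 */
	static int my_op_reset(struct wimax_dev *wimax_dev)
	{
		if (my_device_warm_reset(wimax_dev) == 0)	/* placeholder helper */
			return 0;		/* same device handle still valid */
		my_device_cold_reset(wimax_dev);		/* implies bus disconnect */
		return -ENODEV;			/* device handle is gone */
	}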
- *     NOTE: wimax_dev->mutex is NOT locked when this op is being
- *     called; however, wimax_dev->mutex_reset IS locked to ensure
- *     serialization of calls to wimax_reset().
- *     See wimax_reset()'s documentation.
- *
- * @name: [fill] A way to identify this device. We need to register a
- *     name with many subsystems (rfkill, workqueue creation, etc).
- *     We can't use the network device name as that
- *     might change and in some instances we don't know it yet (until
- *     we call register_netdev()). So we generate a unique one
- *     using the driver name and device bus id, place it here and use
- *     it across the board. Recommended naming:
- *     DRIVERNAME-BUSNAME:BUSID (dev->bus->name, dev->bus_id).
- *
- * @id_table_node: [private] link to the list of wimax devices kept by
- *     id-table.c. Protected by its own spinlock.
- *
- * @mutex: [private] Serializes all concurrent access and execution of
- *     operations.
- *
- * @mutex_reset: [private] Serializes reset operations. Needs to be a
- *     different mutex because as part of the reset operation, the
- *     driver has to call back into the stack to do things such as
- *     state change, that require wimax_dev->mutex.
- *
- * @state: [private] Current state of the WiMAX device.
- *
- * @rfkill: [private] integration into the RF-Kill infrastructure.
- *
- * @rf_sw: [private] State of the software radio switch (OFF/ON)
- *
- * @rf_hw: [private] State of the hardware radio switch (OFF/ON)
- *
- * @debugfs_dentry: [private] Used to hook up a debugfs entry. This
- *     shows up in the debugfs root as wimax\:DEVICENAME.
- *
- * Description:
- * This structure defines a common interface to access all WiMAX
- * devices from different vendors and provides a common API as well as
- * a free-form device-specific messaging channel.
- *
- * Usage:
- *  1. Embed a &struct wimax_dev at *the beginning* of the network
- *     device structure so that netdev_priv() points to it.
- *
- *  2. memset() it to zero
- *
- *  3. Initialize with wimax_dev_init(). This will leave the WiMAX
- *     device in the %__WIMAX_ST_NULL state.
- *
- *  4. Fill all the fields marked with [fill]; once wimax_dev_add()
- *     has been called, those fields CANNOT be modified.
- *
- *  5. Call wimax_dev_add() *after* registering the network
- *     device. This will leave the WiMAX device in the %WIMAX_ST_DOWN
- *     state.
- *     Protect the driver's net_device->open() against succeeding if
- *     the wimax device state is lower than %WIMAX_ST_DOWN.
- *
- *  6. Select when the device is going to be turned on/initialized;
- *     for example, it could be initialized on 'ifconfig up' (when the
- *     netdev op 'open()' is called on the driver).
- *
- * When the device is initialized (at `ifconfig up` time, or right
- * after calling wimax_dev_add() from _probe()), make sure the
- * following steps are taken:
- *
- *  a. Move the device to %WIMAX_ST_UNINITIALIZED. This is needed so
- *     some API calls that shouldn't work until the device is ready
- *     can be blocked.
- *
- *  b. Initialize the device. Make sure to turn the SW radio switch
- *     off and move the device to state %WIMAX_ST_RADIO_OFF when
- *     done. When just initialized, a device should be left in RADIO
- *     OFF state until user space decides to turn it on.
- *
- *  c. Query the device for the state of the hardware rfkill switch
- *     and call wimax_report_rfkill_hw() and wimax_report_rfkill_sw()
- *     as needed. See below.
- *
- * wimax_dev_rm() undoes all this; call it before unregistering the
- * network device.
- * Once wimax_dev_add() is called, the driver can get called on the
- * wimax_dev->op_* function pointers.
- *
- * CONCURRENCY:
- *
- * The stack provides a mutex for each device that will disallow API
- * calls happening concurrently; thus, op calls into the driver
- * through the wimax_dev->op*() function pointers will always be
- * serialized and *never* concurrent.
- *
- * For locking, wimax_dev->mutex is taken; (most) operations in
- * the API have to check for wimax_dev_is_ready() to return 0 before
- * continuing (this is done internally).
- *
- * REFERENCE COUNTING:
- *
- * The WiMAX device is reference counted by the associated network
- * device. The only operation that can be used to reference the device
- * is wimax_dev_get_by_genl_info(), and the reference it acquires has
- * to be released with dev_put(wimax_dev->net_dev).
- *
- * RFKILL:
- *
- * At startup, both HW and SW radio switches are assumed to be off.
- *
- * At initialization time [after calling wimax_dev_add()], have the
- * driver query the device for the status of the software and hardware
- * RF kill switches and call wimax_report_rfkill_hw() and
- * wimax_report_rfkill_sw() to indicate their state. If any is
- * missing, just call it to indicate it is ON (radio always on).
- *
- * Whenever the driver detects a change in the state of the RF kill
- * switches, it should call wimax_report_rfkill_hw() or
- * wimax_report_rfkill_sw() to report it to the stack.
- */
-struct wimax_dev {
-	struct net_device *net_dev;
-	struct list_head id_table_node;
-	struct mutex mutex;		/* Protects all members and API calls */
-	struct mutex mutex_reset;
-	enum wimax_st state;
-
-	int (*op_msg_from_user)(struct wimax_dev *wimax_dev,
-				const char *,
-				const void *, size_t,
-				const struct genl_info *info);
-	int (*op_rfkill_sw_toggle)(struct wimax_dev *wimax_dev,
-				   enum wimax_rf_state);
-	int (*op_reset)(struct wimax_dev *wimax_dev);
-
-	struct rfkill *rfkill;
-	unsigned int rf_hw;
-	unsigned int rf_sw;
-	char name[32];
-
-	struct dentry *debugfs_dentry;
-};
-
-
-
-/*
- * WiMAX stack public API for device drivers
- * -----------------------------------------
- *
- * These functions are not exported to user space.
- */
-void wimax_dev_init(struct wimax_dev *);
-int wimax_dev_add(struct wimax_dev *, struct net_device *);
-void wimax_dev_rm(struct wimax_dev *);
-
-static inline
-struct wimax_dev *net_dev_to_wimax(struct net_device *net_dev)
-{
-	return netdev_priv(net_dev);
-}
-
-static inline
-struct device *wimax_dev_to_dev(struct wimax_dev *wimax_dev)
-{
-	return wimax_dev->net_dev->dev.parent;
-}
-
-void wimax_state_change(struct wimax_dev *, enum wimax_st);
-enum wimax_st wimax_state_get(struct wimax_dev *);
-
-/*
- * Radio Switch state reporting.
- *
- * enum wimax_rf_state is declared in linux/wimax.h so the exports
- * to user space can use it.
- */
-void wimax_report_rfkill_hw(struct wimax_dev *, enum wimax_rf_state);
-void wimax_report_rfkill_sw(struct wimax_dev *, enum wimax_rf_state);
-
-
-/*
- * Free-form messaging to/from user space
- *
- * Sending a message:
- *
- *   wimax_msg(wimax_dev, pipe_name, buf, buf_size, GFP_KERNEL);
- *
- * Broken up:
- *
- *   skb = wimax_msg_alloc(wimax_dev, pipe_name, buf, buf_size, GFP_KERNEL);
- *   ...fill up skb...
- *   wimax_msg_send(wimax_dev, skb);
- *
- * Be sure not to modify skb->data in the middle (ie: don't use
- * skb_push()/skb_pull()/skb_reserve() on the skb).
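Putting the registration steps above together, a compressed _probe() sketch; the bus type, setup callback, and the op_* handlers (see the nearby sketches) are all hypothetical, and the modern four-argument alloc_netdev() signature is assumed:

	struct my_wimax_priv {
		struct wimax_dev wimax_dev;	/* must be first: netdev_priv() */
		/* ... driver-private state ... */
	};

	static int my_probe(struct my_bus_device *bus_dev)	/* hypothetical bus */
	{
		struct net_device *net_dev;
		struct my_wimax_priv *priv;
		int result;

		net_dev = alloc_netdev(sizeof(*priv), "wmx%d", NET_NAME_UNKNOWN,
				       my_netdev_setup);	/* hypothetical setup */
		if (net_dev == NULL)
			return -ENOMEM;
		priv = netdev_priv(net_dev);
		memset(priv, 0, sizeof(*priv));
		wimax_dev_init(&priv->wimax_dev);	/* -> __WIMAX_ST_NULL */
		priv->wimax_dev.op_msg_from_user = my_op_msg_from_user;
		priv->wimax_dev.op_reset = my_op_reset;
		result = register_netdev(net_dev);	/* network device first */
		if (result < 0)
			goto error_register;
		result = wimax_dev_add(&priv->wimax_dev, net_dev); /* -> WIMAX_ST_DOWN */
		if (result < 0)
			goto error_add;
		return 0;
	error_add:
		unregister_netdev(net_dev);
	error_register:
		free_netdev(net_dev);
		return result;
	}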
- * - * "pipe_name" is any string, that can be interpreted as the name of - * the pipe or recipient; the interpretation of it is driver - * specific, so the recipient can multiplex it as wished. It can be - * NULL, it won't be used - an example is using a "diagnostics" tag to - * send diagnostics information that a device-specific diagnostics - * tool would be interested in. - */ -struct sk_buff *wimax_msg_alloc(struct wimax_dev *, const char *, const void *, - size_t, gfp_t); -int wimax_msg_send(struct wimax_dev *, struct sk_buff *); -int wimax_msg(struct wimax_dev *, const char *, const void *, size_t, gfp_t); - -const void *wimax_msg_data_len(struct sk_buff *, size_t *); -const void *wimax_msg_data(struct sk_buff *); -ssize_t wimax_msg_len(struct sk_buff *); - - -/* - * WiMAX stack user space API - * -------------------------- - * - * This API is what gets exported to user space for general - * operations. As well, they can be called from within the kernel, - * (with a properly referenced `struct wimax_dev`). - * - * Properly referenced means: the 'struct net_device' that embeds the - * device's control structure and (as such) the 'struct wimax_dev' is - * referenced by the caller. - */ -int wimax_rfkill(struct wimax_dev *, enum wimax_rf_state); -int wimax_reset(struct wimax_dev *); - -#endif /* #ifndef __NET__WIMAX_H__ */ diff --git a/include/uapi/linux/wimax.h b/include/uapi/linux/wimax.h deleted file mode 100644 index 9f6b77af2f6d..000000000000 --- a/include/uapi/linux/wimax.h +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Linux WiMax - * API for user space - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * Intel Corporation - * Inaky Perez-Gonzalez - * - Initial implementation - * - * - * This file declares the user/kernel protocol that is spoken over - * Generic Netlink, as well as any type declaration that is to be used - * by kernel and user space. - * - * It is intended for user space to clone it verbatim to use it as a - * primary reference for definitions. 
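For the driver side of the messaging pipe declared above, a sketch of an op_msg_from_user handler that answers with a free-form reply; the command decoding and the reply payload are invented:

	/*
	 * Sketch: called by the stack with wimax_dev->mutex held and
	 * the netlink headers already stripped; the return value
	 * becomes the generic netlink ack seen by user space.
	 */
	static int my_op_msg_from_user(struct wimax_dev *wimax_dev,
				       const char *pipe_name,
				       const void *buf, size_t size,
				       const struct genl_info *info)
	{
		u32 reply = 0;			/* invented payload */

		/* ... decode the driver-specific command in buf/size ... */
		return wimax_msg(wimax_dev, "diagnostics", &reply, sizeof(reply),
				 GFP_KERNEL);
	}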
- *
- * Stuff intended for kernel usage as well as full protocol and stack
- * documentation is rooted in include/net/wimax.h.
- */
-
-#ifndef __LINUX__WIMAX_H__
-#define __LINUX__WIMAX_H__
-
-#include <linux/types.h>
-
-enum {
-	/**
-	 * Version of the interface (unsigned decimal, MMm, max 25.5)
-	 * M - Major: change if removing or modifying an existing call.
-	 * m - minor: change when adding a new call
-	 */
-	WIMAX_GNL_VERSION = 01,
-	/* Generic NetLink attributes */
-	WIMAX_GNL_ATTR_INVALID = 0x00,
-	WIMAX_GNL_ATTR_MAX = 10,
-};
-
-
-/*
- * Generic NetLink operations
- *
- * Most of these map to an API call; _OP_ stands for operation, _RP_
- * for reply and _RE_ for report (aka: signal).
- */
-enum {
-	WIMAX_GNL_OP_MSG_FROM_USER,	/* User to kernel message */
-	WIMAX_GNL_OP_MSG_TO_USER,	/* Kernel to user message */
-	WIMAX_GNL_OP_RFKILL,		/* Run wimax_rfkill() */
-	WIMAX_GNL_OP_RESET,		/* Run wimax_reset() */
-	WIMAX_GNL_RE_STATE_CHANGE,	/* Report: status change */
-	WIMAX_GNL_OP_STATE_GET,		/* Request for current state */
-};
-
-
-/* Message from user / to user */
-enum {
-	WIMAX_GNL_MSG_IFIDX = 1,
-	WIMAX_GNL_MSG_PIPE_NAME,
-	WIMAX_GNL_MSG_DATA,
-};
-
-
-/*
- * wimax_rfkill()
- *
- * The state of the radio (ON/OFF) is mapped to the rfkill subsystem's
- * switch state (DISABLED/ENABLED).
- */
-enum wimax_rf_state {
-	WIMAX_RF_OFF = 0,	/* Radio is off, rfkill on/enabled */
-	WIMAX_RF_ON = 1,	/* Radio is on, rfkill off/disabled */
-	WIMAX_RF_QUERY = 2,
-};
-
-/* Attributes */
-enum {
-	WIMAX_GNL_RFKILL_IFIDX = 1,
-	WIMAX_GNL_RFKILL_STATE,
-};
-
-
-/* Attributes for wimax_reset() */
-enum {
-	WIMAX_GNL_RESET_IFIDX = 1,
-};
-
-/* Attributes for wimax_state_get() */
-enum {
-	WIMAX_GNL_STGET_IFIDX = 1,
-};
-
-/*
- * Attributes for the Report State Change
- *
- * For now we just have the old and new states; new attributes might
- * be added later on.
- */
-enum {
-	WIMAX_GNL_STCH_IFIDX = 1,
-	WIMAX_GNL_STCH_STATE_OLD,
-	WIMAX_GNL_STCH_STATE_NEW,
-};
-
-
-/**
- * enum wimax_st - The different states of a WiMAX device
- * @__WIMAX_ST_NULL: The device structure has been allocated and zeroed,
- *     but wimax_dev_add() hasn't been called yet. There is no state.
- *
- * @WIMAX_ST_DOWN: The device has been registered with the WiMAX and
- *     networking stacks, but it is not initialized (normally that is
- *     done with 'ifconfig DEV up' [or equivalent], which can upload
- *     firmware and enable communications with the device).
- *     In this state, the device is powered down and using as little
- *     power as possible.
- *     This state is the default after a call to wimax_dev_add(). It
- *     is ok to have drivers move directly to %WIMAX_ST_UNINITIALIZED
- *     or %WIMAX_ST_RADIO_OFF in _probe() after the call to
- *     wimax_dev_add().
- *     It is recommended that the driver leaves this state when
- *     calling 'ifconfig DEV up' and enters it back on 'ifconfig DEV
- *     down'.
- *
- * @__WIMAX_ST_QUIESCING: The device is being torn down, so no API
- *     operations are allowed to proceed except the ones needed to
- *     complete the device clean up process.
- *
- * @WIMAX_ST_UNINITIALIZED: [optional] Communication with the device
- *     is set up, but the device still requires some configuration
- *     before being operational.
- *     Some WiMAX API calls might work.
- *
- * @WIMAX_ST_RADIO_OFF: The device is fully up; radio is off (whether
- *     by hardware or software switches).
- *     It is recommended to always leave the device in this state
- *     after initialization.
- *
- * @WIMAX_ST_READY: The device is fully up and radio is on.
- * - * @WIMAX_ST_SCANNING: [optional] The device has been instructed to - * scan. In this state, the device cannot be actively connected to - * a network. - * - * @WIMAX_ST_CONNECTING: The device is connecting to a network. This - * state exists because in some devices, the connect process can - * include a number of negotiations between user space, kernel - * space and the device. User space needs to know what the device - * is doing. If the connect sequence in a device is atomic and - * fast, the device can transition directly to CONNECTED - * - * @WIMAX_ST_CONNECTED: The device is connected to a network. - * - * @__WIMAX_ST_INVALID: This is an invalid state used to mark the - * maximum numeric value of states. - * - * Description: - * - * Transitions from one state to another one are atomic and can only - * be caused in kernel space with wimax_state_change(). To read the - * state, use wimax_state_get(). - * - * States starting with __ are internal and shall not be used or - * referred to by drivers or userspace. They look ugly, but that's the - * point -- if any use is made non-internal to the stack, it is easier - * to catch on review. - * - * All API operations [with well defined exceptions] will take the - * device mutex before starting and then check the state. If the state - * is %__WIMAX_ST_NULL, %WIMAX_ST_DOWN, %WIMAX_ST_UNINITIALIZED or - * %__WIMAX_ST_QUIESCING, it will drop the lock and quit with - * -%EINVAL, -%ENOMEDIUM, -%ENOTCONN or -%ESHUTDOWN. - * - * The order of the definitions is important, so we can do numerical - * comparisons (eg: < %WIMAX_ST_RADIO_OFF means the device is not ready - * to operate). - */ -/* - * The allowed state transitions are described in the table below - * (states in rows can go to states in columns where there is an X): - * - * UNINI RADIO READY SCAN CONNEC CONNEC - * NULL DOWN QUIESCING TIALIZED OFF NING TING TED - * NULL - x - * DOWN - x x x - * QUIESCING x - - * UNINITIALIZED x - x - * RADIO_OFF x - x - * READY x x - x x x - * SCANNING x x x - x x - * CONNECTING x x x x - x - * CONNECTED x x x - - * - * This table not available in kernel-doc because the formatting messes it up. - */ - enum wimax_st { - __WIMAX_ST_NULL = 0, - WIMAX_ST_DOWN, - __WIMAX_ST_QUIESCING, - WIMAX_ST_UNINITIALIZED, - WIMAX_ST_RADIO_OFF, - WIMAX_ST_READY, - WIMAX_ST_SCANNING, - WIMAX_ST_CONNECTING, - WIMAX_ST_CONNECTED, - __WIMAX_ST_INVALID /* Always keep last */ -}; - - -#endif /* #ifndef __LINUX__WIMAX_H__ */ diff --git a/include/uapi/linux/wimax/i2400m.h b/include/uapi/linux/wimax/i2400m.h deleted file mode 100644 index fd198bc24a3c..000000000000 --- a/include/uapi/linux/wimax/i2400m.h +++ /dev/null @@ -1,572 +0,0 @@ -/* - * Intel Wireless WiMax Connection 2400m - * Host-Device protocol interface definitions - * - * - * Copyright (C) 2007-2008 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * Intel Corporation
- * Inaky Perez-Gonzalez
- *  - Initial implementation
- *
- *
- * This header defines the data structures and constants used to
- * communicate with the device.
- *
- * BOOTMODE/BOOTROM/FIRMWARE UPLOAD PROTOCOL
- *
- * The firmware upload protocol is quite simple and only requires a
- * handful of commands. See drivers/net/wimax/i2400m/fw.c for more
- * details.
- *
- * The BCF data structure is for the firmware file header.
- *
- *
- * THE DATA / CONTROL PROTOCOL
- *
- * This is the normal protocol spoken with the device once the
- * firmware is uploaded. It transports data payloads and control
- * messages back and forth. 
- *
- * It consists of 'messages' that pack one or more payloads each. The
- * format is described in detail in drivers/net/wimax/i2400m/rx.c and
- * tx.c.
- *
- *
- * THE L3L4 PROTOCOL
- *
- * The term L3L4 refers to Layer 3 (the device), Layer 4 (the
- * driver/host software).
- *
- * This is the control protocol used by the host to control the i2400m
- * device (scan, connect, disconnect...). This is sent to / received
- * as control frames. These frames consist of a header and zero or
- * more TLVs with information. We call each control frame a "message".
- *
- * Each message is composed of:
- *
- * HEADER
- * [TLV0 + PAYLOAD0]
- * [TLV1 + PAYLOAD1]
- * [...]
- * [TLVN + PAYLOADN]
- *
- * The HEADER is defined by 'struct i2400m_l3l4_hdr'. The payloads are
- * defined by a TLV structure (Type Length Value) which is a 'header'
- * (struct i2400m_tlv_hdr) and then the payload.
- *
- * All integers are represented as little endian.
- *
- * - REQUESTS AND EVENTS
- *
- * The requests can be classified as follows:
- *
- *   COMMAND:  implies a request from the host to the device requesting
- *             an action being performed. The device will reply with a
- *             message (with the same type as the command), status and
- *             no (TLV) payload.
- *             Execution of a command might cause events (of different
- *             type) to be sent later on as the device's state changes.
- *
- *   GET/SET:  similar to COMMAND, but will not cause other
- *             EVENTs. The reply, in the case of GET, will contain
- *             TLVs with the requested information.
- *
- *   EVENT:    asynchronous messages sent from the device, maybe as a
- *             consequence of previous COMMANDs but disassociated from
- *             them.
- *
- * Only one request might be pending at the same time (ie: don't
- * parallelize nor post another GET request before the previous
- * COMMAND has been acknowledged with its corresponding reply by the
- * device).
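To make the shape of such a request concrete, a sketch of a bare GET using the structures declared further down in this header; the device type and the send helper are placeholders for whatever the bus transport provides:

	/*
	 * Sketch: a bare GET_DEVICE_INFO request is just an l3l4
	 * header with no TLVs; the device answers with the same type
	 * plus a status code and the requested TLVs.
	 */
	static int my_l3l4_get_device_info(struct my_dev *my_dev)	/* hypothetical */
	{
		struct i2400m_l3l4_hdr req = {
			.type = cpu_to_le16(I2400M_MT_GET_DEVICE_INFO),
			.length = cpu_to_le16(0),	/* no TLV payload */
			.version = cpu_to_le16(I2400M_L3L4_VERSION),
		};

		return my_send_ctrl_frame(my_dev, &req, sizeof(req)); /* placeholder */
	}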
- *
- * The different requests and their formats are described below:
- *
- *   I2400M_MT_*   Message types
- *   I2400M_MS_*   Message status (for replies, events)
- *   i2400m_tlv_*  TLVs
- *
- * Data types are named 'struct i2400m_msg_OPNAME', OPNAME matching the
- * operation.
- */
-
-#ifndef __LINUX__WIMAX__I2400M_H__
-#define __LINUX__WIMAX__I2400M_H__
-
-#include <linux/types.h>
-#include <linux/if_ether.h>
-
-/*
- * Host Device Interface (HDI) common to all busses
- */
-
-/* Boot-mode (firmware upload mode) commands */
-
-/* Header for the firmware file */
-struct i2400m_bcf_hdr {
-	__le32 module_type;
-	__le32 header_len;
-	__le32 header_version;
-	__le32 module_id;
-	__le32 module_vendor;
-	__le32 date;		/* BCD YYYYMMDD */
-	__le32 size;		/* in dwords */
-	__le32 key_size;	/* in dwords */
-	__le32 modulus_size;	/* in dwords */
-	__le32 exponent_size;	/* in dwords */
-	__u8 reserved[88];
-} __attribute__ ((packed));
-
-/* Boot mode opcodes */
-enum i2400m_brh_opcode {
-	I2400M_BRH_READ = 1,
-	I2400M_BRH_WRITE = 2,
-	I2400M_BRH_JUMP = 3,
-	I2400M_BRH_SIGNED_JUMP = 8,
-	I2400M_BRH_HASH_PAYLOAD_ONLY = 9,
-};
-
-/* Boot mode command masks and stuff */
-enum i2400m_brh {
-	I2400M_BRH_SIGNATURE = 0xcbbc0000,
-	I2400M_BRH_SIGNATURE_MASK = 0xffff0000,
-	I2400M_BRH_SIGNATURE_SHIFT = 16,
-	I2400M_BRH_OPCODE_MASK = 0x0000000f,
-	I2400M_BRH_RESPONSE_MASK = 0x000000f0,
-	I2400M_BRH_RESPONSE_SHIFT = 4,
-	I2400M_BRH_DIRECT_ACCESS = 0x00000400,
-	I2400M_BRH_RESPONSE_REQUIRED = 0x00000200,
-	I2400M_BRH_USE_CHECKSUM = 0x00000100,
-};
-
-
-/**
- * struct i2400m_bootrom_header - Header for a boot-mode command
- *
- * @command: the command descriptor (compose with enum i2400m_brh)
- * @target_addr: where on the device memory should the action be performed.
- * @data_size: for read/write, amount of data to be read/written
- * @block_checksum: checksum value (if applicable)
- * @payload: the beginning of data attached to this header
- */
-struct i2400m_bootrom_header {
-	__le32 command;		/* Compose with enum i2400m_brh */
-	__le32 target_addr;
-	__le32 data_size;
-	__le32 block_checksum;
-	char payload[0];
-} __attribute__ ((packed));
-
-
-/*
- * Data / control protocol
- */
-
-/* Packet types for the host-device interface */
-enum i2400m_pt {
-	I2400M_PT_DATA = 0,
-	I2400M_PT_CTRL,
-	I2400M_PT_TRACE,	/* For device debug */
-	I2400M_PT_RESET_WARM,	/* device reset */
-	I2400M_PT_RESET_COLD,	/* USB[transport] reset, like reconnect */
-	I2400M_PT_EDATA,	/* Extended RX data */
-	I2400M_PT_ILLEGAL
-};
-
-
-/*
- * Payload for a data packet
- *
- * This is prefixed to each and every outgoing DATA type.
- */
-struct i2400m_pl_data_hdr {
-	__le32 reserved;
-} __attribute__((packed));
-
-
-/*
- * Payload for an extended data packet
- *
- * New in fw v1.4
- *
- * @reorder: if this payload has to be reordered or not (and how)
- * @cs: the type of data in the packet, as defined per (802.16e
- *     T11.13.19.1). Currently only 2 (IPv4 packet) is supported.
- *
- * This is prefixed to each and every INCOMING DATA packet.
- */ -struct i2400m_pl_edata_hdr { - __le32 reorder; /* bits defined in i2400m_ro */ - __u8 cs; - __u8 reserved[11]; -} __attribute__((packed)); - -enum i2400m_cs { - I2400M_CS_IPV4_0 = 0, - I2400M_CS_IPV4 = 2, -}; - -enum i2400m_ro { - I2400M_RO_NEEDED = 0x01, - I2400M_RO_TYPE = 0x03, - I2400M_RO_TYPE_SHIFT = 1, - I2400M_RO_CIN = 0x0f, - I2400M_RO_CIN_SHIFT = 4, - I2400M_RO_FBN = 0x07ff, - I2400M_RO_FBN_SHIFT = 8, - I2400M_RO_SN = 0x07ff, - I2400M_RO_SN_SHIFT = 21, -}; - -enum i2400m_ro_type { - I2400M_RO_TYPE_RESET = 0, - I2400M_RO_TYPE_PACKET, - I2400M_RO_TYPE_WS, - I2400M_RO_TYPE_PACKET_WS, -}; - - -/* Misc constants */ -enum { - I2400M_PL_ALIGN = 16, /* Payload data size alignment */ - I2400M_PL_SIZE_MAX = 0x3EFF, - I2400M_MAX_PLS_IN_MSG = 60, - /* protocol barkers: sync sequences; for notifications they - * are sent in groups of four. */ - I2400M_H2D_PREVIEW_BARKER = 0xcafe900d, - I2400M_COLD_RESET_BARKER = 0xc01dc01d, - I2400M_WARM_RESET_BARKER = 0x50f750f7, - I2400M_NBOOT_BARKER = 0xdeadbeef, - I2400M_SBOOT_BARKER = 0x0ff1c1a1, - I2400M_SBOOT_BARKER_6050 = 0x80000001, - I2400M_ACK_BARKER = 0xfeedbabe, - I2400M_D2H_MSG_BARKER = 0xbeefbabe, -}; - - -/* - * Hardware payload descriptor - * - * Bitfields encoded in a struct to enforce typing semantics. - * - * Look in rx.c and tx.c for a full description of the format. - */ -struct i2400m_pld { - __le32 val; -} __attribute__ ((packed)); - -#define I2400M_PLD_SIZE_MASK 0x00003fff -#define I2400M_PLD_TYPE_SHIFT 16 -#define I2400M_PLD_TYPE_MASK 0x000f0000 - -/* - * Header for a TX message or RX message - * - * @barker: preamble - * @size: used for management of the FIFO queue buffer; before - * sending, this is converted to be a real preamble. This - * indicates the real size of the TX message that starts at this - * point. If the highest bit is set, then this message is to be - * skipped. - * @sequence: sequence number of this message - * @offset: offset where the message itself starts -- see the comments - * in the file header about message header and payload descriptor - * alignment. - * @num_pls: number of payloads in this message - * @padding: amount of padding bytes at the end of the message to make - * it be of block-size aligned - * - * Look in rx.c and tx.c for a full description of the format. - */ -struct i2400m_msg_hdr { - union { - __le32 barker; - __u32 size; /* same size type as barker!! */ - }; - union { - __le32 sequence; - __u32 offset; /* same size type as barker!! 
*/ - }; - __le16 num_pls; - __le16 rsv1; - __le16 padding; - __le16 rsv2; - struct i2400m_pld pld[0]; -} __attribute__ ((packed)); - - - -/* - * L3/L4 control protocol - */ - -enum { - /* Interface version */ - I2400M_L3L4_VERSION = 0x0100, -}; - -/* Message types */ -enum i2400m_mt { - I2400M_MT_RESERVED = 0x0000, - I2400M_MT_INVALID = 0xffff, - I2400M_MT_REPORT_MASK = 0x8000, - - I2400M_MT_GET_SCAN_RESULT = 0x4202, - I2400M_MT_SET_SCAN_PARAM = 0x4402, - I2400M_MT_CMD_RF_CONTROL = 0x4602, - I2400M_MT_CMD_SCAN = 0x4603, - I2400M_MT_CMD_CONNECT = 0x4604, - I2400M_MT_CMD_DISCONNECT = 0x4605, - I2400M_MT_CMD_EXIT_IDLE = 0x4606, - I2400M_MT_GET_LM_VERSION = 0x5201, - I2400M_MT_GET_DEVICE_INFO = 0x5202, - I2400M_MT_GET_LINK_STATUS = 0x5203, - I2400M_MT_GET_STATISTICS = 0x5204, - I2400M_MT_GET_STATE = 0x5205, - I2400M_MT_GET_MEDIA_STATUS = 0x5206, - I2400M_MT_SET_INIT_CONFIG = 0x5404, - I2400M_MT_CMD_INIT = 0x5601, - I2400M_MT_CMD_TERMINATE = 0x5602, - I2400M_MT_CMD_MODE_OF_OP = 0x5603, - I2400M_MT_CMD_RESET_DEVICE = 0x5604, - I2400M_MT_CMD_MONITOR_CONTROL = 0x5605, - I2400M_MT_CMD_ENTER_POWERSAVE = 0x5606, - I2400M_MT_GET_TLS_OPERATION_RESULT = 0x6201, - I2400M_MT_SET_EAP_SUCCESS = 0x6402, - I2400M_MT_SET_EAP_FAIL = 0x6403, - I2400M_MT_SET_EAP_KEY = 0x6404, - I2400M_MT_CMD_SEND_EAP_RESPONSE = 0x6602, - I2400M_MT_REPORT_SCAN_RESULT = 0xc002, - I2400M_MT_REPORT_STATE = 0xd002, - I2400M_MT_REPORT_POWERSAVE_READY = 0xd005, - I2400M_MT_REPORT_EAP_REQUEST = 0xe002, - I2400M_MT_REPORT_EAP_RESTART = 0xe003, - I2400M_MT_REPORT_ALT_ACCEPT = 0xe004, - I2400M_MT_REPORT_KEY_REQUEST = 0xe005, -}; - - -/* - * Message Ack Status codes - * - * When a message is replied-to, this status is reported. - */ -enum i2400m_ms { - I2400M_MS_DONE_OK = 0, - I2400M_MS_DONE_IN_PROGRESS = 1, - I2400M_MS_INVALID_OP = 2, - I2400M_MS_BAD_STATE = 3, - I2400M_MS_ILLEGAL_VALUE = 4, - I2400M_MS_MISSING_PARAMS = 5, - I2400M_MS_VERSION_ERROR = 6, - I2400M_MS_ACCESSIBILITY_ERROR = 7, - I2400M_MS_BUSY = 8, - I2400M_MS_CORRUPTED_TLV = 9, - I2400M_MS_UNINITIALIZED = 10, - I2400M_MS_UNKNOWN_ERROR = 11, - I2400M_MS_PRODUCTION_ERROR = 12, - I2400M_MS_NO_RF = 13, - I2400M_MS_NOT_READY_FOR_POWERSAVE = 14, - I2400M_MS_THERMAL_CRITICAL = 15, - I2400M_MS_MAX -}; - - -/** - * i2400m_tlv - enumeration of the different types of TLVs - * - * TLVs stand for type-length-value and are the header for a payload - * composed of almost anything. Each payload has a type assigned - * and a length. 
- */ -enum i2400m_tlv { - I2400M_TLV_L4_MESSAGE_VERSIONS = 129, - I2400M_TLV_SYSTEM_STATE = 141, - I2400M_TLV_MEDIA_STATUS = 161, - I2400M_TLV_RF_OPERATION = 162, - I2400M_TLV_RF_STATUS = 163, - I2400M_TLV_DEVICE_RESET_TYPE = 132, - I2400M_TLV_CONFIG_IDLE_PARAMETERS = 601, - I2400M_TLV_CONFIG_IDLE_TIMEOUT = 611, - I2400M_TLV_CONFIG_D2H_DATA_FORMAT = 614, - I2400M_TLV_CONFIG_DL_HOST_REORDER = 615, -}; - - -struct i2400m_tlv_hdr { - __le16 type; - __le16 length; /* payload's */ - __u8 pl[0]; -} __attribute__((packed)); - - -struct i2400m_l3l4_hdr { - __le16 type; - __le16 length; /* payload's */ - __le16 version; - __le16 resv1; - __le16 status; - __le16 resv2; - struct i2400m_tlv_hdr pl[0]; -} __attribute__((packed)); - - -/** - * i2400m_system_state - different states of the device - */ -enum i2400m_system_state { - I2400M_SS_UNINITIALIZED = 1, - I2400M_SS_INIT, - I2400M_SS_READY, - I2400M_SS_SCAN, - I2400M_SS_STANDBY, - I2400M_SS_CONNECTING, - I2400M_SS_WIMAX_CONNECTED, - I2400M_SS_DATA_PATH_CONNECTED, - I2400M_SS_IDLE, - I2400M_SS_DISCONNECTING, - I2400M_SS_OUT_OF_ZONE, - I2400M_SS_SLEEPACTIVE, - I2400M_SS_PRODUCTION, - I2400M_SS_CONFIG, - I2400M_SS_RF_OFF, - I2400M_SS_RF_SHUTDOWN, - I2400M_SS_DEVICE_DISCONNECT, - I2400M_SS_MAX, -}; - - -/** - * i2400m_tlv_system_state - report on the state of the system - * - * @state: see enum i2400m_system_state - */ -struct i2400m_tlv_system_state { - struct i2400m_tlv_hdr hdr; - __le32 state; -} __attribute__((packed)); - - -struct i2400m_tlv_l4_message_versions { - struct i2400m_tlv_hdr hdr; - __le16 major; - __le16 minor; - __le16 branch; - __le16 reserved; -} __attribute__((packed)); - - -struct i2400m_tlv_detailed_device_info { - struct i2400m_tlv_hdr hdr; - __u8 reserved1[400]; - __u8 mac_address[ETH_ALEN]; - __u8 reserved2[2]; -} __attribute__((packed)); - - -enum i2400m_rf_switch_status { - I2400M_RF_SWITCH_ON = 1, - I2400M_RF_SWITCH_OFF = 2, -}; - -struct i2400m_tlv_rf_switches_status { - struct i2400m_tlv_hdr hdr; - __u8 sw_rf_switch; /* 1 ON, 2 OFF */ - __u8 hw_rf_switch; /* 1 ON, 2 OFF */ - __u8 reserved[2]; -} __attribute__((packed)); - - -enum { - i2400m_rf_operation_on = 1, - i2400m_rf_operation_off = 2 -}; - -struct i2400m_tlv_rf_operation { - struct i2400m_tlv_hdr hdr; - __le32 status; /* 1 ON, 2 OFF */ -} __attribute__((packed)); - - -enum i2400m_tlv_reset_type { - I2400M_RESET_TYPE_COLD = 1, - I2400M_RESET_TYPE_WARM -}; - -struct i2400m_tlv_device_reset_type { - struct i2400m_tlv_hdr hdr; - __le32 reset_type; -} __attribute__((packed)); - - -struct i2400m_tlv_config_idle_parameters { - struct i2400m_tlv_hdr hdr; - __le32 idle_timeout; /* 100 to 300000 ms [5min], 100 increments - * 0 disabled */ - __le32 idle_paging_interval; /* frames */ -} __attribute__((packed)); - - -enum i2400m_media_status { - I2400M_MEDIA_STATUS_LINK_UP = 1, - I2400M_MEDIA_STATUS_LINK_DOWN, - I2400M_MEDIA_STATUS_LINK_RENEW, -}; - -struct i2400m_tlv_media_status { - struct i2400m_tlv_hdr hdr; - __le32 media_status; -} __attribute__((packed)); - - -/* New in v1.4 */ -struct i2400m_tlv_config_idle_timeout { - struct i2400m_tlv_hdr hdr; - __le32 timeout; /* 100 to 300000 ms [5min], 100 increments - * 0 disabled */ -} __attribute__((packed)); - -/* New in v1.4 -- for backward compat, will be removed */ -struct i2400m_tlv_config_d2h_data_format { - struct i2400m_tlv_hdr hdr; - __u8 format; /* 0 old format, 1 enhanced */ - __u8 reserved[3]; -} __attribute__((packed)); - -/* New in v1.4 */ -struct i2400m_tlv_config_dl_host_reorder { - struct i2400m_tlv_hdr hdr; - 
__u8 reorder; /* 0 disabled, 1 enabled */ - __u8 reserved[3]; -} __attribute__((packed)); - - -#endif /* #ifndef __LINUX__WIMAX__I2400M_H__ */ diff --git a/net/Kconfig b/net/Kconfig index d6567162c1cf..f4c32d982af6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -386,8 +386,6 @@ source "net/mac80211/Kconfig" endif # WIRELESS -source "net/wimax/Kconfig" - source "net/rfkill/Kconfig" source "net/9p/Kconfig" source "net/caif/Kconfig" diff --git a/net/Makefile b/net/Makefile index 5744bf1997fd..d96b0aa8f39f 100644 --- a/net/Makefile +++ b/net/Makefile @@ -66,7 +66,6 @@ obj-$(CONFIG_MAC802154) += mac802154/ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_SYSCTL) += sysctl_net.o endif -obj-$(CONFIG_WIMAX) += wimax/ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ obj-$(CONFIG_CEPH_LIB) += ceph/ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ diff --git a/net/wimax/Kconfig b/net/wimax/Kconfig deleted file mode 100644 index d13762bc4abc..000000000000 --- a/net/wimax/Kconfig +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# WiMAX LAN device configuration -# - -menuconfig WIMAX - tristate "WiMAX Wireless Broadband support" - depends on RFKILL || !RFKILL - help - - Select to configure support for devices that provide - wireless broadband connectivity using the WiMAX protocol - (IEEE 802.16). - - Please note that most of these devices require signing up - for a service plan with a provider. - - The different WiMAX drivers can be enabled in the menu entry - - Device Drivers > Network device support > WiMAX Wireless - Broadband devices - - If unsure, it is safe to select M (module). - -config WIMAX_DEBUG_LEVEL - int "WiMAX debug level" - depends on WIMAX - default 8 - help - - Select the maximum debug verbosity level to be compiled into - the WiMAX stack code. - - By default, debug messages are disabled at runtime and can - be selectively enabled for different parts of the code using - the sysfs debug-levels file. - - If set at zero, this will compile out all the debug code. - - It is recommended that it is left at 8. 
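The knob above feeds the debug framework whose header was deleted earlier in this patch. For context, a sketch of the per-driver boilerplate that framework expected; 'mydrv' and its submodules are hypothetical, and the D_LEVEL/D_LEVEL_SIZE wiring that d_test() performs through D_MODULENAME is elided:

	/* debug-levels.h style declaration for a hypothetical 'mydrv' module */
	#define D_MODULENAME mydrv
	#define D_MASTER CONFIG_WIMAX_DEBUG_LEVEL

	enum d_module {
		D_SUBMODULE_DECLARE(rx),
		D_SUBMODULE_DECLARE(tx),
	};

	/* in some .c file: the runtime level table and a module parameter */
	static struct d_level d_level_mydrv[] = {
		D_SUBMODULE_DEFINE(rx),
		D_SUBMODULE_DEFINE(tx),
	};
	size_t d_level_size_mydrv = ARRAY_SIZE(d_level_mydrv);

	static char *debug_params;	/* e.g. "rx:2 tx:0" */
	module_param(debug_params, charp, 0);

	static int __init mydrv_init(void)
	{
		d_parse_params(d_level_mydrv, d_level_size_mydrv,
			       debug_params, "mydrv.debug_params");
		return 0;
	}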
diff --git a/net/wimax/Makefile b/net/wimax/Makefile
deleted file mode 100644
index c2a71ae487ac..000000000000
--- a/net/wimax/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-obj-$(CONFIG_WIMAX) += wimax.o
-
-wimax-y :=		\
-	id-table.o	\
-	op-msg.o	\
-	op-reset.o	\
-	op-rfkill.o	\
-	op-state-get.o	\
-	stack.o
-
-wimax-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/net/wimax/debug-levels.h b/net/wimax/debug-levels.h
deleted file mode 100644
index ebc287cde336..000000000000
--- a/net/wimax/debug-levels.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Linux WiMAX Stack
- * Debug levels control file for the wimax module
- *
- * Copyright (C) 2007-2008 Intel Corporation
- * Inaky Perez-Gonzalez
- */
-#ifndef __debug_levels__h__
-#define __debug_levels__h__
-
-/* Maximum compile and run time debug level for all submodules */
-#define D_MODULENAME wimax
-#define D_MASTER CONFIG_WIMAX_DEBUG_LEVEL
-
-#include <linux/wimax/debug.h>
-
-/* List of all the enabled modules */
-enum d_module {
-	D_SUBMODULE_DECLARE(debugfs),
-	D_SUBMODULE_DECLARE(id_table),
-	D_SUBMODULE_DECLARE(op_msg),
-	D_SUBMODULE_DECLARE(op_reset),
-	D_SUBMODULE_DECLARE(op_rfkill),
-	D_SUBMODULE_DECLARE(op_state_get),
-	D_SUBMODULE_DECLARE(stack),
-};
-
-#endif /* #ifndef __debug_levels__h__ */
diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c
deleted file mode 100644
index 3c54bb6b925a..000000000000
--- a/net/wimax/debugfs.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Linux WiMAX
- * Debugfs support
- *
- * Copyright (C) 2005-2006 Intel Corporation
- * Inaky Perez-Gonzalez
- */
-#include <linux/debugfs.h>
-#include <linux/wimax.h>
-#include "wimax-internal.h"
-
-#define D_SUBMODULE debugfs
-#include "debug-levels.h"
-
-void wimax_debugfs_add(struct wimax_dev *wimax_dev)
-{
-	struct net_device *net_dev = wimax_dev->net_dev;
-	struct dentry *dentry;
-	char buf[128];
-
-	snprintf(buf, sizeof(buf), "wimax:%s", net_dev->name);
-	dentry = debugfs_create_dir(buf, NULL);
-	wimax_dev->debugfs_dentry = dentry;
-
-	d_level_register_debugfs("wimax_dl_", debugfs, dentry);
-	d_level_register_debugfs("wimax_dl_", id_table, dentry);
-	d_level_register_debugfs("wimax_dl_", op_msg, dentry);
-	d_level_register_debugfs("wimax_dl_", op_reset, dentry);
-	d_level_register_debugfs("wimax_dl_", op_rfkill, dentry);
-	d_level_register_debugfs("wimax_dl_", op_state_get, dentry);
-	d_level_register_debugfs("wimax_dl_", stack, dentry);
-}
-
-void wimax_debugfs_rm(struct wimax_dev *wimax_dev)
-{
-	debugfs_remove_recursive(wimax_dev->debugfs_dentry);
-}
diff --git a/net/wimax/id-table.c b/net/wimax/id-table.c
deleted file mode 100644
index 02eee37b7e31..000000000000
--- a/net/wimax/id-table.c
+++ /dev/null
@@ -1,130 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Linux WiMAX
- * Mapping of generic netlink family IDs to net devices
- *
- * Copyright (C) 2005-2006 Intel Corporation
- * Inaky Perez-Gonzalez
- *
- * We assign a single generic netlink family ID to each device (to
- * simplify lookup).
- *
- * We need a way to map family ID to a wimax_dev pointer.
- *
- * The idea is to use a very simple lookup. Using a netlink attribute
- * with (for example) the interface name implies a heavier search over
- * all the network devices; seemed kind of a waste given that we know
- * we are looking for a WiMAX device and that most systems will have
- * just a single WiMAX adapter.
- *
- * We put all the WiMAX devices in the system in a linked list and
- * match the generic netlink family ID against the list.
- *
- * By using a linked list, the case of a single adapter in the system
- * becomes (almost) no overhead, while still working for many more. If
- * it ever goes beyond two, I'll be surprised.
- */
-#include <linux/device.h>
-#include <net/genetlink.h>
-#include <linux/netdevice.h>
-#include <linux/list.h>
-#include <linux/wimax.h>
-#include "wimax-internal.h"
-
-
-#define D_SUBMODULE id_table
-#include "debug-levels.h"
-
-
-static DEFINE_SPINLOCK(wimax_id_table_lock);
-static struct list_head wimax_id_table = LIST_HEAD_INIT(wimax_id_table);
-
-
-/*
- * wimax_id_table_add - add a generic netlink family ID / wimax_dev mapping
- *
- * @wimax_dev: WiMAX device descriptor to associate to the Generic
- *     Netlink family ID.
- *
- * Adds the device to the list of registered WiMAX devices, against
- * which incoming generic netlink requests will be matched.
- */
-void wimax_id_table_add(struct wimax_dev *wimax_dev)
-{
-	d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev);
-	spin_lock(&wimax_id_table_lock);
-	list_add(&wimax_dev->id_table_node, &wimax_id_table);
-	spin_unlock(&wimax_id_table_lock);
-	d_fnend(3, NULL, "(wimax_dev %p)\n", wimax_dev);
-}
-
-
-/*
- * wimax_dev_get_by_genl_info - look up a wimax_dev from the genl info
- *
- * The interface index has been passed as a netlink attribute, so we
- * match it against the registered devices' list and reference the
- * wimax_dev.
- *
- * When done, the reference should be dropped with
- * 'dev_put(wimax_dev->net_dev)'.
- */
-struct wimax_dev *wimax_dev_get_by_genl_info(
-	struct genl_info *info, int ifindex)
-{
-	struct wimax_dev *wimax_dev = NULL;
-
-	d_fnstart(3, NULL, "(info %p ifindex %d)\n", info, ifindex);
-	spin_lock(&wimax_id_table_lock);
-	list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) {
-		if (wimax_dev->net_dev->ifindex == ifindex) {
-			dev_hold(wimax_dev->net_dev);
-			goto found;
-		}
-	}
-	wimax_dev = NULL;
-	d_printf(1, NULL, "wimax: no devices found with ifindex %d\n",
-		 ifindex);
-found:
-	spin_unlock(&wimax_id_table_lock);
-	d_fnend(3, NULL, "(info %p ifindex %d) = %p\n",
-		info, ifindex, wimax_dev);
-	return wimax_dev;
-}
-
-
-/*
- * wimax_id_table_rm - remove a generic netlink family ID / wimax_dev mapping
- *
- * @wimax_dev: WiMAX device descriptor whose mapping is to be removed
- *     from the table.
- */
-void wimax_id_table_rm(struct wimax_dev *wimax_dev)
-{
-	spin_lock(&wimax_id_table_lock);
-	list_del_init(&wimax_dev->id_table_node);
-	spin_unlock(&wimax_id_table_lock);
-}
-
-
-/*
- * Release the generic netlink family id / mapping table
- *
- * On debug, verify that the table is empty upon removal. We want the
- * code always compiled, to ensure it doesn't bit rot. It will be
- * compiled out if CONFIG_BUG is disabled.
- */
-void wimax_id_table_release(void)
-{
-	struct wimax_dev *wimax_dev;
-
-#ifndef CONFIG_BUG
-	return;
-#endif
-	spin_lock(&wimax_id_table_lock);
-	list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) {
-		pr_err("BUG: %s wimax_dev %p ifindex %d not cleared\n",
-		       __func__, wimax_dev, wimax_dev->net_dev->ifindex);
-		WARN_ON(1);
-	}
-	spin_unlock(&wimax_id_table_lock);
-}
diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c
deleted file mode 100644
index 6460b5785758..000000000000
--- a/net/wimax/op-msg.c
+++ /dev/null
@@ -1,391 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Linux WiMAX
- * Generic messaging interface between userspace and driver/device
- *
- * Copyright (C) 2007-2008 Intel Corporation
- * Inaky Perez-Gonzalez
- *
- * This implements a direct communication channel between user space and
- * the driver/device, by which free form messages can be sent back and
- * forth.
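Before the details below, a sketch of how a driver might use this channel from a context where it cannot sleep, splitting the build and the delivery of a message; the work-struct plumbing and names are invented:

	struct my_msg_work {			/* invented helper */
		struct work_struct ws;
		struct wimax_dev *wimax_dev;
		struct sk_buff *skb;
	};

	static void my_deliver_work(struct work_struct *ws)
	{
		struct my_msg_work *w = container_of(ws, struct my_msg_work, ws);

		wimax_msg_send(w->wimax_dev, w->skb);	/* consumes the skb */
		kfree(w);
	}

	static int my_notify_later(struct wimax_dev *wimax_dev,
				   const void *buf, size_t size)
	{
		struct my_msg_work *w;
		struct sk_buff *skb;

		/* build the message now, deliver it later from a workqueue */
		skb = wimax_msg_alloc(wimax_dev, "diagnostics", buf, size,
				      GFP_ATOMIC);
		if (IS_ERR(skb))
			return PTR_ERR(skb);
		w = kzalloc(sizeof(*w), GFP_ATOMIC);
		if (w == NULL) {
			nlmsg_free(skb);
			return -ENOMEM;
		}
		w->wimax_dev = wimax_dev;
		w->skb = skb;
		INIT_WORK(&w->ws, my_deliver_work);
		schedule_work(&w->ws);
		return 0;
	}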
- * - * This is intended for device-specific features, vendor quirks, etc. - * - * See include/net/wimax.h - * - * GENERIC NETLINK ENCODING AND CAPACITY - * - * A destination "pipe name" is added to each message; it is up to the - * drivers to assign or use those names (if using them at all). - * - * Messages are encoded as a binary netlink attribute using nla_put() - * using type NLA_UNSPEC (as some versions of libnl still in - * deployment don't yet understand NLA_BINARY). - * - * The maximum capacity of this transport is PAGESIZE per message (so - * the actual payload will be bit smaller depending on the - * netlink/generic netlink attributes and headers). - * - * RECEPTION OF MESSAGES - * - * When a message is received from user space, it is passed verbatim - * to the driver calling wimax_dev->op_msg_from_user(). The return - * value from this function is passed back to user space as an ack - * over the generic netlink protocol. - * - * The stack doesn't do any processing or interpretation of these - * messages. - * - * SENDING MESSAGES - * - * Messages can be sent with wimax_msg(). - * - * If the message delivery needs to happen on a different context to - * that of its creation, wimax_msg_alloc() can be used to get a - * pointer to the message that can be delivered later on with - * wimax_msg_send(). - * - * ROADMAP - * - * wimax_gnl_doit_msg_from_user() Process a message from user space - * wimax_dev_get_by_genl_info() - * wimax_dev->op_msg_from_user() Delivery of message to the driver - * - * wimax_msg() Send a message to user space - * wimax_msg_alloc() - * wimax_msg_send() - */ -#include -#include -#include -#include -#include -#include -#include -#include "wimax-internal.h" - - -#define D_SUBMODULE op_msg -#include "debug-levels.h" - - -/** - * wimax_msg_alloc - Create a new skb for sending a message to userspace - * - * @wimax_dev: WiMAX device descriptor - * @pipe_name: "named pipe" the message will be sent to - * @msg: pointer to the message data to send - * @size: size of the message to send (in bytes), including the header. - * @gfp_flags: flags for memory allocation. - * - * Returns: %0 if ok, negative errno code on error - * - * Description: - * - * Allocates an skb that will contain the message to send to user - * space over the messaging pipe and initializes it, copying the - * payload. - * - * Once this call is done, you can deliver it with - * wimax_msg_send(). - * - * IMPORTANT: - * - * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as - * wimax_msg_send() depends on skb->data being placed at the - * beginning of the user message. - * - * Unlike other WiMAX stack calls, this call can be used way early, - * even before wimax_dev_add() is called, as long as the - * wimax_dev->net_dev pointer is set to point to a proper - * net_dev. This is so that drivers can use it early in case they need - * to send stuff around or communicate with user space. - */ -struct sk_buff *wimax_msg_alloc(struct wimax_dev *wimax_dev, - const char *pipe_name, - const void *msg, size_t size, - gfp_t gfp_flags) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - size_t msg_size; - void *genl_msg; - struct sk_buff *skb; - - msg_size = nla_total_size(size) - + nla_total_size(sizeof(u32)) - + (pipe_name ? 
nla_total_size(strlen(pipe_name)) : 0); - result = -ENOMEM; - skb = genlmsg_new(msg_size, gfp_flags); - if (skb == NULL) - goto error_new; - genl_msg = genlmsg_put(skb, 0, 0, &wimax_gnl_family, - 0, WIMAX_GNL_OP_MSG_TO_USER); - if (genl_msg == NULL) { - dev_err(dev, "no memory to create generic netlink message\n"); - goto error_genlmsg_put; - } - result = nla_put_u32(skb, WIMAX_GNL_MSG_IFIDX, - wimax_dev->net_dev->ifindex); - if (result < 0) { - dev_err(dev, "no memory to add ifindex attribute\n"); - goto error_nla_put; - } - if (pipe_name) { - result = nla_put_string(skb, WIMAX_GNL_MSG_PIPE_NAME, - pipe_name); - if (result < 0) { - dev_err(dev, "no memory to add pipe_name attribute\n"); - goto error_nla_put; - } - } - result = nla_put(skb, WIMAX_GNL_MSG_DATA, size, msg); - if (result < 0) { - dev_err(dev, "no memory to add payload (msg %p size %zu) in " - "attribute: %d\n", msg, size, result); - goto error_nla_put; - } - genlmsg_end(skb, genl_msg); - return skb; - -error_nla_put: -error_genlmsg_put: -error_new: - nlmsg_free(skb); - return ERR_PTR(result); -} -EXPORT_SYMBOL_GPL(wimax_msg_alloc); - - -/** - * wimax_msg_data_len - Return a pointer and size of a message's payload - * - * @msg: Pointer to a message created with wimax_msg_alloc() - * @size: Pointer to where to store the message's size - * - * Returns the pointer to the message data. - */ -const void *wimax_msg_data_len(struct sk_buff *msg, size_t *size) -{ - struct nlmsghdr *nlh = (void *) msg->head; - struct nlattr *nla; - - nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), - WIMAX_GNL_MSG_DATA); - if (nla == NULL) { - pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); - return NULL; - } - *size = nla_len(nla); - return nla_data(nla); -} -EXPORT_SYMBOL_GPL(wimax_msg_data_len); - - -/** - * wimax_msg_data - Return a pointer to a message's payload - * - * @msg: Pointer to a message created with wimax_msg_alloc() - */ -const void *wimax_msg_data(struct sk_buff *msg) -{ - struct nlmsghdr *nlh = (void *) msg->head; - struct nlattr *nla; - - nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), - WIMAX_GNL_MSG_DATA); - if (nla == NULL) { - pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); - return NULL; - } - return nla_data(nla); -} -EXPORT_SYMBOL_GPL(wimax_msg_data); - - -/** - * wimax_msg_len - Return a message's payload length - * - * @msg: Pointer to a message created with wimax_msg_alloc() - */ -ssize_t wimax_msg_len(struct sk_buff *msg) -{ - struct nlmsghdr *nlh = (void *) msg->head; - struct nlattr *nla; - - nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), - WIMAX_GNL_MSG_DATA); - if (nla == NULL) { - pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); - return -EINVAL; - } - return nla_len(nla); -} -EXPORT_SYMBOL_GPL(wimax_msg_len); - - -/** - * wimax_msg_send - Send a pre-allocated message to user space - * - * @wimax_dev: WiMAX device descriptor - * - * @skb: &struct sk_buff returned by wimax_msg_alloc(). Note the - * ownership of @skb is transferred to this function. - * - * Returns: 0 if ok, < 0 errno code on error - * - * Description: - * - * Sends a free-form message that was preallocated with - * wimax_msg_alloc() and filled up. - * - * Assumes that once you pass an skb to this function for sending, it - * owns it and will release it when done (on success). - * - * IMPORTANT: - * - * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as - * wimax_msg_send() depends on skb->data being placed at the - * beginning of the user message. 
- * - * Unlike other WiMAX stack calls, this call can be used way early, - * even before wimax_dev_add() is called, as long as the - * wimax_dev->net_dev pointer is set to point to a proper - * net_dev. This is so that drivers can use it early in case they need - * to send stuff around or communicate with user space. - */ -int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb) -{ - struct device *dev = wimax_dev_to_dev(wimax_dev); - void *msg = skb->data; - size_t size = skb->len; - might_sleep(); - - d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size); - d_dump(2, dev, msg, size); - genlmsg_multicast(&wimax_gnl_family, skb, 0, 0, GFP_KERNEL); - d_printf(1, dev, "CTX: genl multicast done\n"); - return 0; -} -EXPORT_SYMBOL_GPL(wimax_msg_send); - - -/** - * wimax_msg - Send a message to user space - * - * @wimax_dev: WiMAX device descriptor (properly referenced) - * @pipe_name: "named pipe" the message will be sent to - * @buf: pointer to the message to send. - * @size: size of the buffer pointed to by @buf (in bytes). - * @gfp_flags: flags for memory allocation. - * - * Returns: %0 if ok, negative errno code on error. - * - * Description: - * - * Sends a free-form message to user space on the device @wimax_dev. - * - * NOTES: - * - * Once the @skb is given to this function, who will own it and will - * release it when done (unless it returns error). - */ -int wimax_msg(struct wimax_dev *wimax_dev, const char *pipe_name, - const void *buf, size_t size, gfp_t gfp_flags) -{ - int result = -ENOMEM; - struct sk_buff *skb; - - skb = wimax_msg_alloc(wimax_dev, pipe_name, buf, size, gfp_flags); - if (IS_ERR(skb)) - result = PTR_ERR(skb); - else - result = wimax_msg_send(wimax_dev, skb); - return result; -} -EXPORT_SYMBOL_GPL(wimax_msg); - -/* - * Relays a message from user space to the driver - * - * The skb is passed to the driver-specific function with the netlink - * and generic netlink headers already stripped. - * - * This call will block while handling/relaying the message. 
- */ -int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - struct device *dev; - struct nlmsghdr *nlh = info->nlhdr; - char *pipe_name; - void *msg_buf; - size_t msg_len; - - might_sleep(); - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_MSG_IFIDX] == NULL) { - pr_err("WIMAX_GNL_MSG_FROM_USER: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_MSG_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - dev = wimax_dev_to_dev(wimax_dev); - - /* Unpack arguments */ - result = -EINVAL; - if (info->attrs[WIMAX_GNL_MSG_DATA] == NULL) { - dev_err(dev, "WIMAX_GNL_MSG_FROM_USER: can't find MSG_DATA " - "attribute\n"); - goto error_no_data; - } - msg_buf = nla_data(info->attrs[WIMAX_GNL_MSG_DATA]); - msg_len = nla_len(info->attrs[WIMAX_GNL_MSG_DATA]); - - if (info->attrs[WIMAX_GNL_MSG_PIPE_NAME] == NULL) - pipe_name = NULL; - else { - struct nlattr *attr = info->attrs[WIMAX_GNL_MSG_PIPE_NAME]; - size_t attr_len = nla_len(attr); - /* libnl-1.1 does not yet support NLA_NUL_STRING */ - result = -ENOMEM; - pipe_name = kstrndup(nla_data(attr), attr_len + 1, GFP_KERNEL); - if (pipe_name == NULL) - goto error_alloc; - pipe_name[attr_len] = 0; - } - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result == -ENOMEDIUM) - result = 0; - if (result < 0) - goto error_not_ready; - result = -ENOSYS; - if (wimax_dev->op_msg_from_user == NULL) - goto error_noop; - - d_printf(1, dev, - "CRX: nlmsghdr len %u type %u flags 0x%04x seq 0x%x pid %u\n", - nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags, - nlh->nlmsg_seq, nlh->nlmsg_pid); - d_printf(1, dev, "CRX: wimax message %zu bytes\n", msg_len); - d_dump(2, dev, msg_buf, msg_len); - - result = wimax_dev->op_msg_from_user(wimax_dev, pipe_name, - msg_buf, msg_len, info); -error_noop: -error_not_ready: - mutex_unlock(&wimax_dev->mutex); -error_alloc: - kfree(pipe_name); -error_no_data: - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/op-reset.c b/net/wimax/op-reset.c deleted file mode 100644 index 9899b2e56721..000000000000 --- a/net/wimax/op-reset.c +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Implement and export a method for resetting a WiMAX device - * - * Copyright (C) 2008 Intel Corporation - * Inaky Perez-Gonzalez - * - * This implements a simple synchronous call to reset a WiMAX device. - * - * Resets aim at being warm, keeping the device handles active; - * however, when that fails, it falls back to a cold reset (that will - * disconnect and reconnect the device). - */ - -#include -#include -#include -#include -#include -#include "wimax-internal.h" - -#define D_SUBMODULE op_reset -#include "debug-levels.h" - - -/** - * wimax_reset - Reset a WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * Returns: - * - * %0 if ok and a warm reset was done (the device still exists in - * the system). - * - * -%ENODEV if a cold/bus reset had to be done (device has - * disconnected and reconnected, so current handle is not valid - * any more). - * - * -%EINVAL if the device is not even registered. - * - * Any other negative error code shall be considered as - * non-recoverable. 
- * - * Description: - * - * Called when wanting to reset the device for any reason. Device is - * taken back to power on status. - * - * This call blocks; on successful return, the device has completed the - * reset process and is ready to operate. - */ -int wimax_reset(struct wimax_dev *wimax_dev) -{ - int result = -EINVAL; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st state; - - might_sleep(); - d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); - mutex_lock(&wimax_dev->mutex); - dev_hold(wimax_dev->net_dev); - state = wimax_dev->state; - mutex_unlock(&wimax_dev->mutex); - - if (state >= WIMAX_ST_DOWN) { - mutex_lock(&wimax_dev->mutex_reset); - result = wimax_dev->op_reset(wimax_dev); - mutex_unlock(&wimax_dev->mutex_reset); - } - dev_put(wimax_dev->net_dev); - - d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); - return result; -} -EXPORT_SYMBOL(wimax_reset); - - -/* - * Exporting to user space over generic netlink - * - * Parse the reset command from user space, return error code. - * - * No attributes. - */ -int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_RESET_IFIDX] == NULL) { - pr_err("WIMAX_GNL_OP_RESET: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RESET_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - /* Execute the operation and send the result back to user space */ - result = wimax_reset(wimax_dev); - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c deleted file mode 100644 index 248d10b60b05..000000000000 --- a/net/wimax/op-rfkill.c +++ /dev/null @@ -1,431 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * RF-kill framework integration - * - * Copyright (C) 2008 Intel Corporation - * Inaky Perez-Gonzalez - * - * This integrates into the Linux Kernel rfkill subsystem so that the - * drivers just have to do the bare minimal work, which is providing a - * method to set the software RF-Kill switch and to report changes in - * the software and hardware switch status. - * - * A non-polled generic rfkill device is embedded into the WiMAX - * subsystem's representation of a device. - * - * FIXME: Need polled support? Let drivers provide a poll routine - * and hand it to rfkill ops then? - * - * All device drivers have to do is, after wimax_dev_init(), call - * wimax_report_rfkill_hw() and wimax_report_rfkill_sw() to update - * initial state and then every time it changes. See wimax.h:struct - * wimax_dev for more information. 
- * - * ROADMAP - * - * wimax_gnl_doit_rfkill() User space calling wimax_rfkill() - * wimax_rfkill() Kernel calling wimax_rfkill() - * __wimax_rf_toggle_radio() - * - * wimax_rfkill_set_radio_block() RF-Kill subsystem calling - * __wimax_rf_toggle_radio() - * - * __wimax_rf_toggle_radio() - * wimax_dev->op_rfkill_sw_toggle() Driver backend - * __wimax_state_change() - * - * wimax_report_rfkill_sw() Driver reports state change - * __wimax_state_change() - * - * wimax_report_rfkill_hw() Driver reports state change - * __wimax_state_change() - * - * wimax_rfkill_add() Initialize/shutdown rfkill support - * wimax_rfkill_rm() [called by wimax_dev_add/rm()] - */ - -#include -#include -#include -#include -#include -#include -#include "wimax-internal.h" - -#define D_SUBMODULE op_rfkill -#include "debug-levels.h" - -/** - * wimax_report_rfkill_hw - Reports changes in the hardware RF switch - * - * @wimax_dev: WiMAX device descriptor - * - * @state: New state of the RF Kill switch. %WIMAX_RF_ON radio on, - * %WIMAX_RF_OFF radio off. - * - * When the device detects a change in the state of the hardware RF - * switch, it must call this function to let the WiMAX kernel stack - * know that the state has changed so it can be properly propagated. - * - * The WiMAX stack caches the state (the driver doesn't need to). As - * well, as the change is propagated, it will come back as a request to - * change the software state to mirror the hardware state. - * - * If the device doesn't have a hardware kill switch, just report - * it on initialization as always on (%WIMAX_RF_ON, radio on). - */ -void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st wimax_state; - - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - BUG_ON(state == WIMAX_RF_QUERY); - BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF); - - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result < 0) - goto error_not_ready; - - if (state != wimax_dev->rf_hw) { - wimax_dev->rf_hw = state; - if (wimax_dev->rf_hw == WIMAX_RF_ON && - wimax_dev->rf_sw == WIMAX_RF_ON) - wimax_state = WIMAX_ST_READY; - else - wimax_state = WIMAX_ST_RADIO_OFF; - - result = rfkill_set_hw_state(wimax_dev->rfkill, - state == WIMAX_RF_OFF); - - __wimax_state_change(wimax_dev, wimax_state); - } -error_not_ready: - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n", - wimax_dev, state, result); -} -EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw); - - -/** - * wimax_report_rfkill_sw - Reports changes in the software RF switch - * - * @wimax_dev: WiMAX device descriptor - * - * @state: New state of the RF kill switch. %WIMAX_RF_ON radio on, - * %WIMAX_RF_OFF radio off. - * - * Reports changes in the software RF switch state to the WiMAX stack. - * - * The main use is during initialization, so the driver can query the - * device for its current software radio kill switch state and feed it - * to the system. - * - * As an aside: in theory the device does not change the software state - * by itself; in practice, this can happen, as the device might decide to - * switch (in software) the radio off for different reasons. 
- */ -void wimax_report_rfkill_sw(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st wimax_state; - - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - BUG_ON(state == WIMAX_RF_QUERY); - BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF); - - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result < 0) - goto error_not_ready; - - if (state != wimax_dev->rf_sw) { - wimax_dev->rf_sw = state; - if (wimax_dev->rf_hw == WIMAX_RF_ON && - wimax_dev->rf_sw == WIMAX_RF_ON) - wimax_state = WIMAX_ST_READY; - else - wimax_state = WIMAX_ST_RADIO_OFF; - __wimax_state_change(wimax_dev, wimax_state); - rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); - } -error_not_ready: - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n", - wimax_dev, state, result); -} -EXPORT_SYMBOL_GPL(wimax_report_rfkill_sw); - - -/* - * Callback for the RF Kill toggle operation - * - * This function is called by: - * - * - The rfkill subsystem when the RF-Kill key is pressed in the - * hardware and the driver notifies through - * wimax_report_rfkill_hw(). The rfkill subsystem ends up calling back - * here so the software RF Kill switch state is changed to reflect - * the hardware switch state. - * - * - When the user sets the state through sysfs' rfkill/state file - * - * - When the user calls wimax_rfkill(). - * - * This call blocks! - * - * WARNING! When we call rfkill_unregister(), this will be called with - * state 0! - * - * WARNING: wimax_dev must be locked - */ -static -int __wimax_rf_toggle_radio(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - int result = 0; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st wimax_state; - - might_sleep(); - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - if (wimax_dev->rf_sw == state) - goto out_no_change; - if (wimax_dev->op_rfkill_sw_toggle != NULL) - result = wimax_dev->op_rfkill_sw_toggle(wimax_dev, state); - else if (state == WIMAX_RF_OFF) /* No op? can't turn off */ - result = -ENXIO; - else /* No op? can turn on */ - result = 0; /* should never happen tho */ - if (result >= 0) { - result = 0; - wimax_dev->rf_sw = state; - wimax_state = state == WIMAX_RF_ON ? - WIMAX_ST_READY : WIMAX_ST_RADIO_OFF; - __wimax_state_change(wimax_dev, wimax_state); - } -out_no_change: - d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", - wimax_dev, state, result); - return result; -} - - -/* - * Translate from rfkill state to wimax state - * - * NOTE: Special state handling rules here - * - * Just pretend the call didn't happen if we are in a state where - * we know for sure it cannot be handled (WIMAX_ST_DOWN or - * __WIMAX_ST_QUIESCING). rfkill() needs it to register and - * unregister, as it will run this path. - * - * NOTE: This call will block until the operation is completed. 
- */ -static int wimax_rfkill_set_radio_block(void *data, bool blocked) -{ - int result; - struct wimax_dev *wimax_dev = data; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_rf_state rf_state; - - d_fnstart(3, dev, "(wimax_dev %p blocked %u)\n", wimax_dev, blocked); - rf_state = WIMAX_RF_ON; - if (blocked) - rf_state = WIMAX_RF_OFF; - mutex_lock(&wimax_dev->mutex); - if (wimax_dev->state <= __WIMAX_ST_QUIESCING) - result = 0; - else - result = __wimax_rf_toggle_radio(wimax_dev, rf_state); - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p blocked %u) = %d\n", - wimax_dev, blocked, result); - return result; -} - -static const struct rfkill_ops wimax_rfkill_ops = { - .set_block = wimax_rfkill_set_radio_block, -}; - -/** - * wimax_rfkill - Set the software RF switch state for a WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * @state: New RF state. - * - * Returns: - * - * >= 0 toggle state if ok, < 0 errno code on error. The toggle state - * is returned as a bitmap, bit 0 being the hardware RF state, bit 1 - * the software RF state. - * - * 0 means disabled (%WIMAX_RF_ON, radio on), 1 means enabled radio - * off (%WIMAX_RF_OFF). - * - * Description: - * - * Called by the user when he wants to request the WiMAX radio to be - * switched on (%WIMAX_RF_ON) or off (%WIMAX_RF_OFF). With - * %WIMAX_RF_QUERY, just the current state is returned. - * - * NOTE: - * - * This call will block until the operation is complete. - */ -int wimax_rfkill(struct wimax_dev *wimax_dev, enum wimax_rf_state state) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result < 0) { - /* While initializing, < 1.4.3 wimax-tools versions use - * this call to check if the device is a valid WiMAX - * device; so we allow it to proceed always, - * considering the radios are all off. 
*/ - if (result == -ENOMEDIUM && state == WIMAX_RF_QUERY) - result = WIMAX_RF_OFF << 1 | WIMAX_RF_OFF; - goto error_not_ready; - } - switch (state) { - case WIMAX_RF_ON: - case WIMAX_RF_OFF: - result = __wimax_rf_toggle_radio(wimax_dev, state); - if (result < 0) - goto error; - rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); - break; - case WIMAX_RF_QUERY: - break; - default: - result = -EINVAL; - goto error; - } - result = wimax_dev->rf_sw << 1 | wimax_dev->rf_hw; -error: -error_not_ready: - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", - wimax_dev, state, result); - return result; -} -EXPORT_SYMBOL(wimax_rfkill); - - -/* - * Register a new WiMAX device's RF Kill support - * - * WARNING: wimax_dev->mutex must be unlocked - */ -int wimax_rfkill_add(struct wimax_dev *wimax_dev) -{ - int result; - struct rfkill *rfkill; - struct device *dev = wimax_dev_to_dev(wimax_dev); - - d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); - /* Initialize RF Kill */ - result = -ENOMEM; - rfkill = rfkill_alloc(wimax_dev->name, dev, RFKILL_TYPE_WIMAX, - &wimax_rfkill_ops, wimax_dev); - if (rfkill == NULL) - goto error_rfkill_allocate; - - d_printf(1, dev, "rfkill %p\n", rfkill); - - wimax_dev->rfkill = rfkill; - - rfkill_init_sw_state(rfkill, 1); - result = rfkill_register(wimax_dev->rfkill); - if (result < 0) - goto error_rfkill_register; - - /* If there is no SW toggle op, SW RFKill is always on */ - if (wimax_dev->op_rfkill_sw_toggle == NULL) - wimax_dev->rf_sw = WIMAX_RF_ON; - - d_fnend(3, dev, "(wimax_dev %p) = 0\n", wimax_dev); - return 0; - -error_rfkill_register: - rfkill_destroy(wimax_dev->rfkill); -error_rfkill_allocate: - d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); - return result; -} - - -/* - * Deregister a WiMAX device's RF Kill support - * - * Ick, we can't call rfkill_free() after rfkill_unregister()...oh - * well. - * - * WARNING: wimax_dev->mutex must be unlocked - */ -void wimax_rfkill_rm(struct wimax_dev *wimax_dev) -{ - struct device *dev = wimax_dev_to_dev(wimax_dev); - d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); - rfkill_unregister(wimax_dev->rfkill); - rfkill_destroy(wimax_dev->rfkill); - d_fnend(3, dev, "(wimax_dev %p)\n", wimax_dev); -} - - -/* - * Exporting to user space over generic netlink - * - * Parse the rfkill command from user space, return a combination - * value that describe the states of the different toggles. - * - * Only one attribute: the new state requested (on, off or no change, - * just query). 
- */ - -int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - struct device *dev; - enum wimax_rf_state new_state; - - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_RFKILL_IFIDX] == NULL) { - pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - dev = wimax_dev_to_dev(wimax_dev); - result = -EINVAL; - if (info->attrs[WIMAX_GNL_RFKILL_STATE] == NULL) { - dev_err(dev, "WIMAX_GNL_RFKILL: can't find RFKILL_STATE " - "attribute\n"); - goto error_no_pid; - } - new_state = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_STATE]); - - /* Execute the operation and send the result back to user space */ - result = wimax_rfkill(wimax_dev, new_state); -error_no_pid: - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/op-state-get.c b/net/wimax/op-state-get.c deleted file mode 100644 index 5bc712de1563..000000000000 --- a/net/wimax/op-state-get.c +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Implement and export a method for getting a WiMAX device current state - * - * Copyright (C) 2009 Paulius Zaleckas - * - * Based on previous WiMAX core work by: - * Copyright (C) 2008 Intel Corporation - * Inaky Perez-Gonzalez - */ - -#include -#include -#include -#include -#include "wimax-internal.h" - -#define D_SUBMODULE op_state_get -#include "debug-levels.h" - - -/* - * Exporting to user space over generic netlink - * - * Parse the state get command from user space, return a combination - * value that describe the current state. - * - * No attributes. - */ -int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) { - pr_err("WIMAX_GNL_OP_STATE_GET: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - /* Execute the operation and send the result back to user space */ - result = wimax_state_get(wimax_dev); - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/stack.c b/net/wimax/stack.c deleted file mode 100644 index 3a62af3f80bf..000000000000 --- a/net/wimax/stack.c +++ /dev/null @@ -1,616 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Initialization, addition and removal of wimax devices - * - * Copyright (C) 2005-2006 Intel Corporation - * Inaky Perez-Gonzalez - * - * This implements: - * - * - basic life cycle of 'struct wimax_dev' [wimax_dev_*()]; on - * addition/registration initialize all subfields and allocate - * generic netlink resources for user space communication. On - * removal/unregistration, undo all that. - * - * - device state machine [wimax_state_change()] and support to send - * reports to user space when the state changes - * [wimax_gnl_re_state_change*()]. 
- * - * See include/net/wimax.h for rationales and design. - * - * ROADMAP - * - * [__]wimax_state_change() Called by drivers to update device's state - * wimax_gnl_re_state_change_alloc() - * wimax_gnl_re_state_change_send() - * - * wimax_dev_init() Init a device - * wimax_dev_add() Register - * wimax_rfkill_add() - * wimax_gnl_add() Register all the generic netlink resources. - * wimax_id_table_add() - * wimax_dev_rm() Unregister - * wimax_id_table_rm() - * wimax_gnl_rm() - * wimax_rfkill_rm() - */ -#include -#include -#include -#include -#include -#include -#include "wimax-internal.h" - - -#define D_SUBMODULE stack -#include "debug-levels.h" - -static char wimax_debug_params[128]; -module_param_string(debug, wimax_debug_params, sizeof(wimax_debug_params), - 0644); -MODULE_PARM_DESC(debug, - "String of space-separated NAME:VALUE pairs, where NAMEs " - "are the different debug submodules and VALUE are the " - "initial debug value to set."); - -/* - * Authoritative source for the RE_STATE_CHANGE attribute policy - * - * We don't really use it here, but /me likes to keep the definition - * close to where the data is generated. - */ -/* -static const struct nla_policy wimax_gnl_re_status_change[WIMAX_GNL_ATTR_MAX + 1] = { - [WIMAX_GNL_STCH_STATE_OLD] = { .type = NLA_U8 }, - [WIMAX_GNL_STCH_STATE_NEW] = { .type = NLA_U8 }, -}; -*/ - - -/* - * Allocate a Report State Change message - * - * @header: save it, you need it for _send() - * - * Creates and fills a basic state change message; different code - * paths can then add more attributes to the message as needed. - * - * Use wimax_gnl_re_state_change_send() to send the returned skb. - * - * Returns: skb with the genl message if ok, IS_ERR() ptr on error - * with an errno code. - */ -static -struct sk_buff *wimax_gnl_re_state_change_alloc( - struct wimax_dev *wimax_dev, - enum wimax_st new_state, enum wimax_st old_state, - void **header) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - void *data; - struct sk_buff *report_skb; - - d_fnstart(3, dev, "(wimax_dev %p new_state %u old_state %u)\n", - wimax_dev, new_state, old_state); - result = -ENOMEM; - report_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (report_skb == NULL) { - dev_err(dev, "RE_STCH: can't create message\n"); - goto error_new; - } - /* FIXME: sending a group ID as the seq is wrong */ - data = genlmsg_put(report_skb, 0, wimax_gnl_family.mcgrp_offset, - &wimax_gnl_family, 0, WIMAX_GNL_RE_STATE_CHANGE); - if (data == NULL) { - dev_err(dev, "RE_STCH: can't put data into message\n"); - goto error_put; - } - *header = data; - - result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_OLD, old_state); - if (result < 0) { - dev_err(dev, "RE_STCH: Error adding OLD attr: %d\n", result); - goto error_put; - } - result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_NEW, new_state); - if (result < 0) { - dev_err(dev, "RE_STCH: Error adding NEW attr: %d\n", result); - goto error_put; - } - result = nla_put_u32(report_skb, WIMAX_GNL_STCH_IFIDX, - wimax_dev->net_dev->ifindex); - if (result < 0) { - dev_err(dev, "RE_STCH: Error adding IFINDEX attribute\n"); - goto error_put; - } - d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %p\n", - wimax_dev, new_state, old_state, report_skb); - return report_skb; - -error_put: - nlmsg_free(report_skb); -error_new: - d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %d\n", - wimax_dev, new_state, old_state, result); - return ERR_PTR(result); -} - - -/* - * Send a Report State Change message (as created with 
_alloc). - * - * @report_skb: as returned by wimax_gnl_re_state_change_alloc() - * @header: as returned by wimax_gnl_re_state_change_alloc() - * - * Returns: 0 if ok, < 0 errno code on error. - * - * If the message is NULL, pretend it didn't happen. - */ -static -int wimax_gnl_re_state_change_send( - struct wimax_dev *wimax_dev, struct sk_buff *report_skb, - void *header) -{ - int result = 0; - struct device *dev = wimax_dev_to_dev(wimax_dev); - d_fnstart(3, dev, "(wimax_dev %p report_skb %p)\n", - wimax_dev, report_skb); - if (report_skb == NULL) { - result = -ENOMEM; - goto out; - } - genlmsg_end(report_skb, header); - genlmsg_multicast(&wimax_gnl_family, report_skb, 0, 0, GFP_KERNEL); -out: - d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n", - wimax_dev, report_skb, result); - return result; -} - - -static -void __check_new_state(enum wimax_st old_state, enum wimax_st new_state, - unsigned int allowed_states_bm) -{ - if (WARN_ON(((1 << new_state) & allowed_states_bm) == 0)) { - pr_err("SW BUG! Forbidden state change %u -> %u\n", - old_state, new_state); - } -} - - -/* - * Set the current state of a WiMAX device [unlocking version of - * wimax_state_change(). - */ -void __wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state) -{ - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st old_state = wimax_dev->state; - struct sk_buff *stch_skb; - void *header; - - d_fnstart(3, dev, "(wimax_dev %p new_state %u [old %u])\n", - wimax_dev, new_state, old_state); - - if (WARN_ON(new_state >= __WIMAX_ST_INVALID)) { - dev_err(dev, "SW BUG: requesting invalid state %u\n", - new_state); - goto out; - } - if (old_state == new_state) - goto out; - header = NULL; /* gcc complains? can't grok why */ - stch_skb = wimax_gnl_re_state_change_alloc( - wimax_dev, new_state, old_state, &header); - - /* Verify the state transition and do exit-from-state actions */ - switch (old_state) { - case __WIMAX_ST_NULL: - __check_new_state(old_state, new_state, - 1 << WIMAX_ST_DOWN); - break; - case WIMAX_ST_DOWN: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_UNINITIALIZED - | 1 << WIMAX_ST_RADIO_OFF); - break; - case __WIMAX_ST_QUIESCING: - __check_new_state(old_state, new_state, 1 << WIMAX_ST_DOWN); - break; - case WIMAX_ST_UNINITIALIZED: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF); - break; - case WIMAX_ST_RADIO_OFF: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_READY); - break; - case WIMAX_ST_READY: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_SCANNING - | 1 << WIMAX_ST_CONNECTING - | 1 << WIMAX_ST_CONNECTED); - break; - case WIMAX_ST_SCANNING: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_READY - | 1 << WIMAX_ST_CONNECTING - | 1 << WIMAX_ST_CONNECTED); - break; - case WIMAX_ST_CONNECTING: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_READY - | 1 << WIMAX_ST_SCANNING - | 1 << WIMAX_ST_CONNECTED); - break; - case WIMAX_ST_CONNECTED: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_READY); - netif_tx_disable(wimax_dev->net_dev); - netif_carrier_off(wimax_dev->net_dev); - break; - case __WIMAX_ST_INVALID: - default: - dev_err(dev, "SW BUG: wimax_dev %p is in unknown state 
%u\n", - wimax_dev, wimax_dev->state); - WARN_ON(1); - goto out; - } - - /* Execute the actions of entry to the new state */ - switch (new_state) { - case __WIMAX_ST_NULL: - dev_err(dev, "SW BUG: wimax_dev %p entering NULL state " - "from %u\n", wimax_dev, wimax_dev->state); - WARN_ON(1); /* Nobody can enter this state */ - break; - case WIMAX_ST_DOWN: - break; - case __WIMAX_ST_QUIESCING: - break; - case WIMAX_ST_UNINITIALIZED: - break; - case WIMAX_ST_RADIO_OFF: - break; - case WIMAX_ST_READY: - break; - case WIMAX_ST_SCANNING: - break; - case WIMAX_ST_CONNECTING: - break; - case WIMAX_ST_CONNECTED: - netif_carrier_on(wimax_dev->net_dev); - netif_wake_queue(wimax_dev->net_dev); - break; - case __WIMAX_ST_INVALID: - default: - BUG(); - } - __wimax_state_set(wimax_dev, new_state); - if (!IS_ERR(stch_skb)) - wimax_gnl_re_state_change_send(wimax_dev, stch_skb, header); -out: - d_fnend(3, dev, "(wimax_dev %p new_state %u [old %u]) = void\n", - wimax_dev, new_state, old_state); -} - - -/** - * wimax_state_change - Set the current state of a WiMAX device - * - * @wimax_dev: WiMAX device descriptor (properly referenced) - * @new_state: New state to switch to - * - * This implements the state changes for the wimax devices. It will - * - * - verify that the state transition is legal (for now it'll just - * print a warning if not) according to the table in - * linux/wimax.h's documentation for 'enum wimax_st'. - * - * - perform the actions needed for leaving the current state and - * whichever are needed for entering the new state. - * - * - issue a report to user space indicating the new state (and an - * optional payload with information about the new state). - * - * NOTE: @wimax_dev must be locked - */ -void wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state) -{ - /* - * A driver cannot take the wimax_dev out of the - * __WIMAX_ST_NULL state unless by calling wimax_dev_add(). If - * the wimax_dev's state is still NULL, we ignore any request - * to change its state because it means it hasn't been yet - * registered. - * - * There is no need to complain about it, as routines that - * call this might be shared from different code paths that - * are called before or after wimax_dev_add() has done its - * job. - */ - mutex_lock(&wimax_dev->mutex); - if (wimax_dev->state > __WIMAX_ST_NULL) - __wimax_state_change(wimax_dev, new_state); - mutex_unlock(&wimax_dev->mutex); -} -EXPORT_SYMBOL_GPL(wimax_state_change); - - -/** - * wimax_state_get() - Return the current state of a WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * Returns: Current state of the device according to its driver. - */ -enum wimax_st wimax_state_get(struct wimax_dev *wimax_dev) -{ - enum wimax_st state; - mutex_lock(&wimax_dev->mutex); - state = wimax_dev->state; - mutex_unlock(&wimax_dev->mutex); - return state; -} -EXPORT_SYMBOL_GPL(wimax_state_get); - - -/** - * wimax_dev_init - initialize a newly allocated instance - * - * @wimax_dev: WiMAX device descriptor to initialize. - * - * Initializes fields of a freshly allocated @wimax_dev instance. This - * function assumes that after allocation, the memory occupied by - * @wimax_dev was zeroed. 
- */ -void wimax_dev_init(struct wimax_dev *wimax_dev) -{ - INIT_LIST_HEAD(&wimax_dev->id_table_node); - __wimax_state_set(wimax_dev, __WIMAX_ST_NULL); - mutex_init(&wimax_dev->mutex); - mutex_init(&wimax_dev->mutex_reset); -} -EXPORT_SYMBOL_GPL(wimax_dev_init); - -/* - * There are multiple enums reusing the same values, adding - * others is only possible if they use a compatible policy. - */ -static const struct nla_policy wimax_gnl_policy[WIMAX_GNL_ATTR_MAX + 1] = { - /* - * WIMAX_GNL_RESET_IFIDX, WIMAX_GNL_RFKILL_IFIDX, - * WIMAX_GNL_STGET_IFIDX, WIMAX_GNL_MSG_IFIDX - */ - [1] = { .type = NLA_U32, }, - /* - * WIMAX_GNL_RFKILL_STATE, WIMAX_GNL_MSG_PIPE_NAME - */ - [2] = { .type = NLA_U32, }, /* enum wimax_rf_state */ - /* - * WIMAX_GNL_MSG_DATA - */ - [3] = { .type = NLA_UNSPEC, }, /* libnl doesn't grok BINARY yet */ -}; - -static const struct genl_small_ops wimax_gnl_ops[] = { - { - .cmd = WIMAX_GNL_OP_MSG_FROM_USER, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_msg_from_user, - }, - { - .cmd = WIMAX_GNL_OP_RESET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_reset, - }, - { - .cmd = WIMAX_GNL_OP_RFKILL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_rfkill, - }, - { - .cmd = WIMAX_GNL_OP_STATE_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_state_get, - }, -}; - - -static -size_t wimax_addr_scnprint(char *addr_str, size_t addr_str_size, - unsigned char *addr, size_t addr_len) -{ - unsigned int cnt, total; - - for (total = cnt = 0; cnt < addr_len; cnt++) - total += scnprintf(addr_str + total, addr_str_size - total, - "%02x%c", addr[cnt], - cnt == addr_len - 1 ? '\0' : ':'); - return total; -} - - -/** - * wimax_dev_add - Register a new WiMAX device - * - * @wimax_dev: WiMAX device descriptor (as embedded in your @net_dev's - * priv data). You must have called wimax_dev_init() on it before. - * - * @net_dev: net device the @wimax_dev is associated with. The - * function expects SET_NETDEV_DEV() and register_netdev() were - * already called on it. - * - * Registers the new WiMAX device, sets up the user-kernel control - * interface (generic netlink) and common WiMAX infrastructure. - * - * Note that the parts that will allow interaction with user space are - * setup at the very end, when the rest is in place, as once that - * happens, the driver might get user space control requests via - * netlink or from debugfs that might translate into calls into - * wimax_dev->op_*(). - */ -int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev) -{ - int result; - struct device *dev = net_dev->dev.parent; - char addr_str[32]; - - d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev); - - /* Do the RFKILL setup before locking, as RFKILL will call - * into our functions. 
- */ - wimax_dev->net_dev = net_dev; - result = wimax_rfkill_add(wimax_dev); - if (result < 0) - goto error_rfkill_add; - - /* Set up user-space interaction */ - mutex_lock(&wimax_dev->mutex); - wimax_id_table_add(wimax_dev); - wimax_debugfs_add(wimax_dev); - - __wimax_state_set(wimax_dev, WIMAX_ST_DOWN); - mutex_unlock(&wimax_dev->mutex); - - wimax_addr_scnprint(addr_str, sizeof(addr_str), - net_dev->dev_addr, net_dev->addr_len); - dev_err(dev, "WiMAX interface %s (%s) ready\n", - net_dev->name, addr_str); - d_fnend(3, dev, "(wimax_dev %p net_dev %p) = 0\n", wimax_dev, net_dev); - return 0; - -error_rfkill_add: - d_fnend(3, dev, "(wimax_dev %p net_dev %p) = %d\n", - wimax_dev, net_dev, result); - return result; -} -EXPORT_SYMBOL_GPL(wimax_dev_add); - - -/** - * wimax_dev_rm - Unregister an existing WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * Unregisters a WiMAX device previously registered for use with - * wimax_dev_add(). - * - * IMPORTANT! Must call before calling unregister_netdev(). - * - * After this function returns, you will not get any more user space - * control requests (via netlink or debugfs) and thus no more calls into - * wimax_dev->ops. - * - * Reentrancy control is ensured by setting the state to - * %__WIMAX_ST_QUIESCING. rfkill operations coming through - * wimax_*rfkill*() will be stopped by the quiescing state; ops coming - * from the rfkill subsystem will be stopped by the support being - * removed by wimax_rfkill_rm(). - */ -void wimax_dev_rm(struct wimax_dev *wimax_dev) -{ - d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev); - - mutex_lock(&wimax_dev->mutex); - __wimax_state_change(wimax_dev, __WIMAX_ST_QUIESCING); - wimax_debugfs_rm(wimax_dev); - wimax_id_table_rm(wimax_dev); - __wimax_state_change(wimax_dev, WIMAX_ST_DOWN); - mutex_unlock(&wimax_dev->mutex); - wimax_rfkill_rm(wimax_dev); - d_fnend(3, NULL, "(wimax_dev %p) = void\n", wimax_dev); -} -EXPORT_SYMBOL_GPL(wimax_dev_rm); - - -/* Debug framework control of debug levels */ -struct d_level D_LEVEL[] = { - D_SUBMODULE_DEFINE(debugfs), - D_SUBMODULE_DEFINE(id_table), - D_SUBMODULE_DEFINE(op_msg), - D_SUBMODULE_DEFINE(op_reset), - D_SUBMODULE_DEFINE(op_rfkill), - D_SUBMODULE_DEFINE(op_state_get), - D_SUBMODULE_DEFINE(stack), -}; -size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); - - -static const struct genl_multicast_group wimax_gnl_mcgrps[] = { - { .name = "msg", }, -}; - -struct genl_family wimax_gnl_family __ro_after_init = { - .name = "WiMAX", - .version = WIMAX_GNL_VERSION, - .hdrsize = 0, - .maxattr = WIMAX_GNL_ATTR_MAX, - .policy = wimax_gnl_policy, - .module = THIS_MODULE, - .small_ops = wimax_gnl_ops, - .n_small_ops = ARRAY_SIZE(wimax_gnl_ops), - .mcgrps = wimax_gnl_mcgrps, - .n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps), -}; - - - -/* Initialize the wimax stack */ -static -int __init wimax_subsys_init(void) -{ - int result; - - d_fnstart(4, NULL, "()\n"); - d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params, - "wimax.debug"); - - result = genl_register_family(&wimax_gnl_family); - if (unlikely(result < 0)) { - pr_err("cannot register generic netlink family: %d\n", result); - goto error_register_family; - } - - d_fnend(4, NULL, "() = 0\n"); - return 0; - -error_register_family: - d_fnend(4, NULL, "() = %d\n", result); - return result; - -} -module_init(wimax_subsys_init); - - -/* Shutdown the wimax stack */ -static -void __exit wimax_subsys_exit(void) -{ - wimax_id_table_release(); - genl_unregister_family(&wimax_gnl_family); -} -module_exit(wimax_subsys_exit); - -MODULE_AUTHOR("Intel Corporation "); 
-MODULE_DESCRIPTION("Linux WiMAX stack"); -MODULE_LICENSE("GPL"); diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h deleted file mode 100644 index 40751207296c..000000000000 --- a/net/wimax/wimax-internal.h +++ /dev/null @@ -1,85 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Linux WiMAX - * Internal API for kernel space WiMAX stack - * - * Copyright (C) 2007 Intel Corporation - * Inaky Perez-Gonzalez - * - * This header file is for declarations and definitions internal to - * the WiMAX stack. For public APIs and documentation, see - * include/net/wimax.h and include/linux/wimax.h. - */ - -#ifndef __WIMAX_INTERNAL_H__ -#define __WIMAX_INTERNAL_H__ -#ifdef __KERNEL__ - -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include - - -/* - * Decide if a (locked) device is ready for use - * - * Before using the device structure, it must be locked - * (wimax_dev->mutex). As well, most operations need to call this - * function to check if the state is the right one. - * - * An error value will be returned if the state is not the right - * one. In that case, the caller should not attempt to use the device - * and just unlock it. - */ -static inline __must_check -int wimax_dev_is_ready(struct wimax_dev *wimax_dev) -{ - if (wimax_dev->state == __WIMAX_ST_NULL) - return -EINVAL; /* Device is not even registered! */ - if (wimax_dev->state == WIMAX_ST_DOWN) - return -ENOMEDIUM; - if (wimax_dev->state == __WIMAX_ST_QUIESCING) - return -ESHUTDOWN; - return 0; -} - - -static inline -void __wimax_state_set(struct wimax_dev *wimax_dev, enum wimax_st state) -{ - wimax_dev->state = state; -} -void __wimax_state_change(struct wimax_dev *, enum wimax_st); - -#ifdef CONFIG_DEBUG_FS -void wimax_debugfs_add(struct wimax_dev *); -void wimax_debugfs_rm(struct wimax_dev *); -#else -static inline void wimax_debugfs_add(struct wimax_dev *wimax_dev) {} -static inline void wimax_debugfs_rm(struct wimax_dev *wimax_dev) {} -#endif - -void wimax_id_table_add(struct wimax_dev *); -struct wimax_dev *wimax_dev_get_by_genl_info(struct genl_info *, int); -void wimax_id_table_rm(struct wimax_dev *); -void wimax_id_table_release(void); - -int wimax_rfkill_add(struct wimax_dev *); -void wimax_rfkill_rm(struct wimax_dev *); - -/* generic netlink */ -extern struct genl_family wimax_gnl_family; - -/* ops */ -int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info); -int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info); -int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info); -int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info); - -#endif /* #ifdef __KERNEL__ */ -#endif /* #ifndef __WIMAX_INTERNAL_H__ */ -- cgit From 080b6f40763565f65ebb9540219c71ce885cf568 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 28 Oct 2020 18:15:05 +0100 Subject: bpf: Don't rely on GCC __attribute__((optimize)) to disable GCSE Commit 3193c0836 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") introduced a __no_fgcse macro that expands to a function scope __attribute__((optimize("-fno-gcse"))), to disable a GCC specific optimization that was causing trouble on x86 builds, and was not expected to have any positive effect in the first place. However, as the GCC manual documents, __attribute__((optimize)) is not for production use, and results in all other optimization options to be forgotten for the function in question. 
This can cause all kinds of trouble, but in one particular reported case, it causes -fno-asynchronous-unwind-tables to be disregarded, resulting in .eh_frame info to be emitted for the function. This reverts commit 3193c0836, and instead, it disables the -fgcse optimization for the entire source file, but only when building for X86 using GCC with CONFIG_BPF_JIT_ALWAYS_ON disabled. Note that the original commit states that CONFIG_RETPOLINE=n triggers the issue, whereas CONFIG_RETPOLINE=y performs better without the optimization, so it is kept disabled in both cases. Fixes: 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") Signed-off-by: Ard Biesheuvel Signed-off-by: Alexei Starovoitov Tested-by: Geert Uytterhoeven Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/lkml/CAMuHMdUg0WJHEcq6to0-eODpXPOywLot6UD2=GFHpzoj_hCoBQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20201028171506.15682-2-ardb@kernel.org --- include/linux/compiler-gcc.h | 2 -- include/linux/compiler_types.h | 4 ---- kernel/bpf/Makefile | 6 +++++- kernel/bpf/core.c | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d1e3c6896b71..5deb37024574 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -175,5 +175,3 @@ #else #define __diag_GCC_8(s) #endif - -#define __no_fgcse __attribute__((optimize("-fno-gcse"))) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6e390d58a9f8..ac3fa37a84f9 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -247,10 +247,6 @@ struct ftrace_likely_data { #define asm_inline asm #endif -#ifndef __no_fgcse -# define __no_fgcse -#endif - /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..c1b9f71ee6aa 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o -CFLAGS_core.o += $(call cc-disable-warning, override-init) +ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +endif +CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9268d77898b7..55454d2278b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1369,7 +1369,7 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * * Decode and execute eBPF instructions. */ -static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z -- cgit From 46d6c5ae953cc0be38efd0e469284df7c4328cf8 Mon Sep 17 00:00:00 2001 From: "Jason A. 
Donenfeld" Date: Thu, 29 Oct 2020 03:56:06 +0100 Subject: netfilter: use actual socket sk rather than skb sk when routing harder If netfilter changes the packet mark when mangling, the packet is rerouted using the route_me_harder set of functions. Prior to this commit, there's one big difference between route_me_harder and the ordinary initial routing functions, described in the comment above __ip_queue_xmit(): /* Note: skb->sk can be different from sk, in case of tunnels */ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, That function goes on to correctly make use of sk->sk_bound_dev_if, rather than skb->sk->sk_bound_dev_if. And indeed the comment is true: a tunnel will receive a packet in ndo_start_xmit with an initial skb->sk. It will make some transformations to that packet, and then it will send the encapsulated packet out of a *new* socket. That new socket will basically always have a different sk_bound_dev_if (otherwise there'd be a routing loop). So for the purposes of routing the encapsulated packet, the routing information as it pertains to the socket should come from that socket's sk, rather than the packet's original skb->sk. For that reason __ip_queue_xmit() and related functions all do the right thing. One might argue that all tunnels should just call skb_orphan(skb) before transmitting the encapsulated packet into the new socket. But tunnels do *not* do this -- and this is wisely avoided in skb_scrub_packet() too -- because features like TSQ rely on skb->destructor() being called when that buffer space is truely available again. Calling skb_orphan(skb) too early would result in buffers filling up unnecessarily and accounting info being all wrong. Instead, additional routing must take into account the new sk, just as __ip_queue_xmit() notes. So, this commit addresses the problem by fishing the correct sk out of state->sk -- it's already set properly in the call to nf_hook() in __ip_local_out(), which receives the sk as part of its normal functionality. So we make sure to plumb state->sk through the various route_me_harder functions, and then make correct use of it following the example of __ip_queue_xmit(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jason A. 
Donenfeld Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv4.h | 2 +- include/linux/netfilter_ipv6.h | 10 +++++----- net/ipv4/netfilter.c | 8 +++++--- net/ipv4/netfilter/iptable_mangle.c | 2 +- net/ipv4/netfilter/nf_reject_ipv4.c | 2 +- net/ipv6/netfilter.c | 6 +++--- net/ipv6/netfilter/ip6table_mangle.c | 2 +- net/netfilter/ipvs/ip_vs_core.c | 4 ++-- net/netfilter/nf_nat_proto.c | 4 ++-- net/netfilter/nf_synproxy_core.c | 2 +- net/netfilter/nft_chain_route.c | 4 ++-- net/netfilter/utils.c | 4 ++-- 12 files changed, 26 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index 082e2c41b7ff..5b70ca868bb1 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -16,7 +16,7 @@ struct ip_rt_info { u_int32_t mark; }; -int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned addr_type); +int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned addr_type); struct nf_queue_entry; diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 9b67394471e1..48314ade1506 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -42,7 +42,7 @@ struct nf_ipv6_ops { #if IS_MODULE(CONFIG_IPV6) int (*chk_addr)(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); - int (*route_me_harder)(struct net *net, struct sk_buff *skb); + int (*route_me_harder)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*dev_get_saddr)(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); @@ -143,9 +143,9 @@ static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk, #endif } -int ip6_route_me_harder(struct net *net, struct sk_buff *skb); +int ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb); -static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb) +static inline int nf_ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb) { #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); @@ -153,9 +153,9 @@ static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb) if (!v6_ops) return -EHOSTUNREACH; - return v6_ops->route_me_harder(net, skb); + return v6_ops->route_me_harder(net, sk, skb); #elif IS_BUILTIN(CONFIG_IPV6) - return ip6_route_me_harder(net, skb); + return ip6_route_me_harder(net, sk, skb); #else return -EHOSTUNREACH; #endif diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index a058213b77a7..7c841037c533 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -17,17 +17,19 @@ #include /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type) +int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; - const struct sock *sk = skb_to_full_sk(skb); - __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0; + __u8 flags; struct net_device *dev = skb_dst(skb)->dev; unsigned int hh_len; + sk = sk_to_full_sk(sk); + flags = sk ? 
inet_sk_flowi_flags(sk) : 0; + if (addr_type == RTN_UNSPEC) addr_type = inet_addr_type_dev_table(net, dev, saddr); if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index f703a717ab1d..833079589273 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -62,7 +62,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 9dcfa4e461b6..93b07739807b 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -145,7 +145,7 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) ip4_dst_hoplimit(skb_dst(nskb))); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC)) goto free_nskb; niph = ip_hdr(nskb); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 6d0e942d082d..ab9a279dd6d4 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -20,10 +20,10 @@ #include #include "../bridge/br_private.h" -int ip6_route_me_harder(struct net *net, struct sk_buff *skb) +int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); - struct sock *sk = sk_to_full_sk(skb->sk); + struct sock *sk = sk_to_full_sk(sk_partial); unsigned int hh_len; struct dst_entry *dst; int strict = (ipv6_addr_type(&iph->daddr) & @@ -84,7 +84,7 @@ static int nf_ip6_reroute(struct sk_buff *skb, if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || skb->mark != rt_info->mark) - return ip6_route_me_harder(entry->state.net, skb); + return ip6_route_me_harder(entry->state.net, entry->state.sk, skb); } return 0; } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 1a2748611e00..cee74803d7a1 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -57,7 +57,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { - err = ip6_route_me_harder(state->net, skb); + err = ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index cc3c275934f4..c0b8215ab3d4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -742,12 +742,12 @@ static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, struct dst_entry *dst = skb_dst(skb); if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && - ip6_route_me_harder(ipvs->net, skb) != 0) + ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0) return 1; } else #endif if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && - ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0) + ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0) return 1; return 0; diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 59151dc07fdc..e87b6bd6b3cd 100644 --- a/net/netfilter/nf_nat_proto.c +++ 
b/net/netfilter/nf_nat_proto.c @@ -715,7 +715,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -953,7 +953,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { - err = nf_ip6_route_me_harder(state->net, skb); + err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 9cca35d22927..d7d34a62d3bf 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -446,7 +446,7 @@ synproxy_send_tcp(struct net *net, skb_dst_set_noref(nskb, skb_dst(skb)); nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC)) goto free_nskb; if (nfct) { diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c index 8826bbe71136..edd02cda57fc 100644 --- a/net/netfilter/nft_chain_route.c +++ b/net/netfilter/nft_chain_route.c @@ -42,7 +42,7 @@ static unsigned int nf_route_table_hook4(void *priv, iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -92,7 +92,7 @@ static unsigned int nf_route_table_hook6(void *priv, skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u32 *)ipv6_hdr(skb)))) { - err = nf_ip6_route_me_harder(state->net, skb); + err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index cedf47ab3c6f..2182d361e273 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -191,8 +191,8 @@ static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(entry->state.net, skb, - RTN_UNSPEC); + return ip_route_me_harder(entry->state.net, entry->state.sk, + skb, RTN_UNSPEC); } #endif return 0; -- cgit From c0391b6ab810381df632677a1dcbbbbd63d05b6d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 29 Oct 2020 13:50:03 +0100 Subject: netfilter: nf_tables: missing validation from the abort path If userspace does not include the trailing end of the batch message, then nfnetlink aborts the transaction. This allows checking that ruleset updates trigger no errors. After this patch, invoking this command from the prerouting chain: # nft -c add rule x y fib saddr . oif type local fails since oif is not supported there. This patch fixes the lack of rule validation from the abort/check path to catch configuration errors such as the one above. 
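In outline (a simplified sketch of the change; the real function is in the diff below, and the helper name here is made up), the abort path now runs validation when the batch carried no explicit commit:

/* Sketch: a plain check (nft -c, i.e. a batch without the trailing
 * end-of-batch message) terminates in the abort path, so rule
 * validation has to run there as well, not only on commit.
 * -EAGAIN makes the caller replay the abort without validation.
 */
static int abort_path_sketch(struct net *net, enum nfnl_abort_action action)
{
	if (action == NFNL_ABORT_VALIDATE &&
	    nf_tables_validate(net) < 0)
		return -EAGAIN;
	/* ... roll back all staged transactions ... */
	return 0;
}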
Fixes: a654de8fdc18 ("netfilter: nf_tables: fix chain dependency validation") Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 9 ++++++++- net/netfilter/nf_tables_api.c | 15 ++++++++++----- net/netfilter/nfnetlink.c | 22 ++++++++++++++++++---- 3 files changed, 36 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 89016d08f6a2..f6267e2883f2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -24,6 +24,12 @@ struct nfnl_callback { const u_int16_t attr_count; /* number of nlattr's */ }; +enum nfnl_abort_action { + NFNL_ABORT_NONE = 0, + NFNL_ABORT_AUTOLOAD, + NFNL_ABORT_VALIDATE, +}; + struct nfnetlink_subsystem { const char *name; __u8 subsys_id; /* nfnetlink subsystem ID */ @@ -31,7 +37,8 @@ struct nfnetlink_subsystem { const struct nfnl_callback *cb; /* callback for individual types */ struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); - int (*abort)(struct net *net, struct sk_buff *skb, bool autoload); + int (*abort)(struct net *net, struct sk_buff *skb, + enum nfnl_abort_action action); void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9b70e136fb5d..0f58e98542be 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8053,12 +8053,16 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } -static int __nf_tables_abort(struct net *net, bool autoload) +static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_hook *hook; + if (action == NFNL_ABORT_VALIDATE && + nf_tables_validate(net) < 0) + return -EAGAIN; + list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, list) { switch (trans->msg_type) { @@ -8190,7 +8194,7 @@ static int __nf_tables_abort(struct net *net, bool autoload) nf_tables_abort_release(trans); } - if (autoload) + if (action == NFNL_ABORT_AUTOLOAD) nf_tables_module_autoload(net); else nf_tables_module_autoload_cleanup(net); @@ -8203,9 +8207,10 @@ static void nf_tables_cleanup(struct net *net) nft_validate_state_update(net, NFT_VALIDATE_SKIP); } -static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload) +static int nf_tables_abort(struct net *net, struct sk_buff *skb, + enum nfnl_abort_action action) { - int ret = __nf_tables_abort(net, autoload); + int ret = __nf_tables_abort(net, action); mutex_unlock(&net->nft.commit_mutex); @@ -8836,7 +8841,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) { mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net, false); + __nf_tables_abort(net, NFNL_ABORT_NONE); __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 2daa1f6ae344..d3df66a39b5e 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -333,7 +333,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, return netlink_ack(skb, nlh, -EINVAL, NULL); replay: status = 0; - +replay_abort: skb = netlink_skb_clone(oskb, GFP_KERNEL); if (!skb) return netlink_ack(oskb, nlh, -ENOMEM, NULL); @@ -499,7 +499,7 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(net, oskb, 
true); + ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD); nfnl_err_reset(&err_list); kfree_skb(skb); module_put(ss->owner); @@ -510,11 +510,25 @@ done: status |= NFNL_BATCH_REPLAY; goto done; } else if (err) { - ss->abort(net, oskb, false); + ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } } else { - ss->abort(net, oskb, false); + enum nfnl_abort_action abort_action; + + if (status & NFNL_BATCH_FAILURE) + abort_action = NFNL_ABORT_NONE; + else + abort_action = NFNL_ABORT_VALIDATE; + + err = ss->abort(net, oskb, abort_action); + if (err == -EAGAIN) { + nfnl_err_reset(&err_list); + kfree_skb(skb); + module_put(ss->owner); + status |= NFNL_BATCH_FAILURE; + goto replay_abort; + } } if (ss->cleanup) ss->cleanup(net); -- cgit From 77f6c0b87479c4578ac0798fc249637092ac45a3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Sep 2020 12:30:50 +0200 Subject: timekeeping: remove arch_gettimeoffset With Arm EBSA110 gone, nothing uses it any more, so the corresponding code and the Kconfig option can be removed. Acked-by: Thomas Gleixner Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann --- .../time/modern-timekeeping/arch-support.txt | 33 ---------------------- drivers/Makefile | 2 -- drivers/clocksource/Kconfig | 2 +- include/linux/time.h | 13 --------- kernel/time/Kconfig | 9 ------ kernel/time/clocksource.c | 8 ------ kernel/time/timekeeping.c | 25 +--------------- kernel/trace/Kconfig | 2 -- 8 files changed, 2 insertions(+), 92 deletions(-) delete mode 100644 Documentation/features/time/modern-timekeeping/arch-support.txt (limited to 'include/linux') diff --git a/Documentation/features/time/modern-timekeeping/arch-support.txt b/Documentation/features/time/modern-timekeeping/arch-support.txt deleted file mode 100644 index a84c3b9d9a94..000000000000 --- a/Documentation/features/time/modern-timekeeping/arch-support.txt +++ /dev/null @@ -1,33 +0,0 @@ -# -# Feature name: modern-timekeeping -# Kconfig: !ARCH_USES_GETTIMEOFFSET -# description: arch does not use arch_gettimeoffset() anymore -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | ok | - | arc: | ok | - | arm: | TODO | - | arm64: | ok | - | c6x: | ok | - | csky: | ok | - | h8300: | ok | - | hexagon: | ok | - | ia64: | ok | - | m68k: | ok | - | microblaze: | ok | - | mips: | ok | - | nds32: | ok | - | nios2: | ok | - | openrisc: | ok | - | parisc: | ok | - | powerpc: | ok | - | riscv: | ok | - | s390: | ok | - | sh: | ok | - | sparc: | ok | - | um: | ok | - | x86: | ok | - | xtensa: | ok | - ----------------------- diff --git a/drivers/Makefile b/drivers/Makefile index c0cd1b9075e3..4ff1e4459512 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -135,9 +135,7 @@ obj-$(CONFIG_INFINIBAND) += infiniband/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ -ifndef CONFIG_ARCH_USES_GETTIMEOFFSET obj-y += clocksource/ -endif obj-$(CONFIG_DCA) += dca/ obj-$(CONFIG_HID) += hid/ obj-$(CONFIG_PPC_PS3) += ps3/ diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 68b087bff59c..764936bfcb2c 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -563,7 +563,7 @@ config CLKSRC_QCOM config CLKSRC_VERSATILE bool "ARM Versatile (Express) reference platforms clock source" if COMPILE_TEST - depends on GENERIC_SCHED_CLOCK && !ARCH_USES_GETTIMEOFFSET + depends on GENERIC_SCHED_CLOCK select TIMER_OF default y if (ARCH_VEXPRESS || ARCH_VERSATILE) && ARM help diff --git a/include/linux/time.h 
b/include/linux/time.h index b142cb5f5a53..16cf4522d6f3 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -21,19 +21,6 @@ extern time64_t mktime64(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); -/* Some architectures do not supply their own clocksource. - * This is mainly the case in architectures that get their - * inter-tick times by reading the counter on their interval - * timer. Since these timers wrap every tick, they're not really - * useful as clocksources. Wrapping them to act like one is possible - * but not very efficient. So we provide a callout these arches - * can implement for use with the jiffies clocksource to provide - * finer then tick granular time. - */ -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -extern u32 (*arch_gettimeoffset)(void); -#endif - #ifdef CONFIG_POSIX_TIMERS extern void clear_itimer(void); #else diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a09b1d61df6a..51d298ccbe05 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -26,10 +26,6 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE config GENERIC_TIME_VSYSCALL bool -# Old style timekeeping -config ARCH_USES_GETTIMEOFFSET - bool - # The generic clock events infrastructure config GENERIC_CLOCKEVENTS bool @@ -72,7 +68,6 @@ config TICK_ONESHOT config NO_HZ_COMMON bool - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT choice @@ -87,7 +82,6 @@ config HZ_PERIODIC config NO_HZ_IDLE bool "Idle dynticks system (tickless idle)" - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select NO_HZ_COMMON help This option enables a tickless idle system: timer interrupts @@ -99,7 +93,6 @@ config NO_HZ_IDLE config NO_HZ_FULL bool "Full dynticks system (tickless)" # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # We need at least one periodic CPU for timekeeping depends on SMP depends on HAVE_CONTEXT_TRACKING @@ -158,7 +151,6 @@ config CONTEXT_TRACKING_FORCE config NO_HZ bool "Old Idle dynticks config" - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS help This is the old config entry that enables dynticks idle. We keep it around for a little while to enforce backward @@ -166,7 +158,6 @@ config NO_HZ config HIGH_RES_TIMERS bool "High Resolution Timer Support" - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables high resolution timer support. 
If your diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 02441ead3c3b..cce484a2cc7c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -705,8 +705,6 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs) &cs->max_cycles); } -#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET - static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) { struct clocksource *cs; @@ -798,12 +796,6 @@ static void clocksource_select_fallback(void) __clocksource_select(true); } -#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ -static inline void clocksource_select(void) { } -static inline void clocksource_select_fallback(void) { } - -#endif - /* * clocksource_done_booting - Called near the end of core bootup * diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6858a31364b6..52fff7e9edcd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -369,13 +369,6 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* Timekeeper helper functions. */ -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -static u32 default_arch_gettimeoffset(void) { return 0; } -u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; -#else -static inline u32 arch_gettimeoffset(void) { return 0; } -#endif - static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta) { u64 nsec; @@ -383,8 +376,7 @@ static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 de nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + return nsec; } static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) @@ -778,16 +770,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->tkr_raw.cycle_last = cycle_now; tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; - - /* If arch requires, add in get_arch_timeoffset() */ - tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; - - tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; - /* If arch requires, add in get_arch_timeoffset() */ - tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift; - tk_normalize_xtime(tk); } @@ -2133,19 +2117,12 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode) if (unlikely(timekeeping_suspended)) goto out; -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = real_tk->cycle_interval; - - if (mode != TK_ADV_TICK) - goto out; -#else offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) goto out; -#endif /* Do some additional sanity checking */ timekeeping_check_update(tk, offset); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508..b74099f990bf 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -253,7 +253,6 @@ config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n depends on TRACE_IRQFLAGS_SUPPORT - depends on !ARCH_USES_GETTIMEOFFSET select TRACE_IRQFLAGS select GENERIC_TRACER select TRACER_MAX_TRACE @@ -277,7 +276,6 @@ config IRQSOFF_TRACER config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n - depends on !ARCH_USES_GETTIMEOFFSET depends on PREEMPTION select GENERIC_TRACER select TRACER_MAX_TRACE -- cgit From b3550164a19d62e515af6cacb5a31f0b2b3f9501 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Sep 
2020 15:21:43 +0200 Subject: timekeeping: add CONFIG_LEGACY_TIMER_TICK All platforms that currently do not use generic clockevents roughly call the same set of functions in their timer interrupts: xtime_update(), update_process_times() and profile_tick(), sometimes in a different sequence. Add a helper function that performs all three of them, to make the callers more uniform and simplify the interface. Reviewed-by: Geert Uytterhoeven Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann --- include/linux/timekeeping.h | 1 + kernel/time/Kconfig | 7 +++++++ kernel/time/Makefile | 1 + kernel/time/tick-legacy.c | 19 +++++++++++++++++++ 4 files changed, 28 insertions(+) create mode 100644 kernel/time/tick-legacy.c (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7f7e4a3f4394..3670cb1670ff 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -12,6 +12,7 @@ extern int timekeeping_suspended; /* Architecture timer tick functions: */ extern void update_process_times(int user); extern void xtime_update(unsigned long ticks); +extern void legacy_timer_tick(unsigned long ticks); /* * Get and set timeofday diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 51d298ccbe05..c6867f29d279 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -57,6 +57,13 @@ config POSIX_CPU_TIMERS_TASK_WORK bool default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK +config LEGACY_TIMER_TICK + bool + help + The legacy timer tick helper is used by platforms that + lack support for the generic clockevent framework. + New platforms should use generic clockevents instead. + if GENERIC_CLOCKEVENTS menu "Timers subsystem" diff --git a/kernel/time/Makefile b/kernel/time/Makefile index c8f00168afe8..1fb1c1ef6a19 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -16,6 +16,7 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o +obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o diff --git a/kernel/time/tick-legacy.c b/kernel/time/tick-legacy.c new file mode 100644 index 000000000000..73c5a0af4743 --- /dev/null +++ b/kernel/time/tick-legacy.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Timer tick function for architectures that lack generic clockevents, + * consolidated here from m68k/ia64/parisc/arm. + */ + +#include +#include +#include + +#include "tick-internal.h" + +void legacy_timer_tick(unsigned long ticks) +{ + if (ticks) + xtime_update(ticks); + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} -- cgit From 56cc7b8acfb7c763f71c0492fa8da01dca7c1760 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Sep 2020 17:39:11 +0200 Subject: timekeeping: remove xtime_update There are no more users of xtime_update aside from legacy_timer_tick(), so fold it into that function and remove the declaration. update_process_times() is now only called inside of the kernel/time/ code, so the declaration can be moved there. 
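As an illustration, the timer interrupt of a platform converted to this helper reduces to a single call into the core (hypothetical handler, shown only for the calling convention; legacy_timer_tick() is the helper documented in the hunk below):

	static irqreturn_t example_timer_interrupt(int irq, void *dev_id)
	{
		/* acknowledge the tick in the timer hardware here */
		legacy_timer_tick(1);	/* one tick has elapsed since the last call */
		return IRQ_HANDLED;
	}

A CPU that does not own timekeeping would pass 0 ticks, in which case only process accounting and profiling are performed.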
Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann --- include/linux/timekeeping.h | 2 -- kernel/time/tick-legacy.c | 22 ++++++++++++++++++++-- kernel/time/timekeeping.c | 16 ---------------- kernel/time/timekeeping.h | 1 + 4 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 3670cb1670ff..d47009611109 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -10,8 +10,6 @@ void timekeeping_init(void); extern int timekeeping_suspended; /* Architecture timer tick functions: */ -extern void update_process_times(int user); -extern void xtime_update(unsigned long ticks); extern void legacy_timer_tick(unsigned long ticks); /* diff --git a/kernel/time/tick-legacy.c b/kernel/time/tick-legacy.c index 73c5a0af4743..af225b32f5b3 100644 --- a/kernel/time/tick-legacy.c +++ b/kernel/time/tick-legacy.c @@ -10,10 +10,28 @@ #include "tick-internal.h" +/** + * legacy_timer_tick() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * This is used by platforms that have not been converted to + * generic clockevents. + * + * If 'ticks' is zero, the CPU is not handling timekeeping, so + * only perform process accounting and profiling. + * + * Must be called with interrupts disabled. + */ void legacy_timer_tick(unsigned long ticks) { - if (ticks) - xtime_update(ticks); + if (ticks) { + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); + do_timer(ticks); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); + update_wall_time(); + } update_process_times(user_mode(get_irq_regs())); profile_tick(CPU_PROFILING); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 52fff7e9edcd..daa0ff017819 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2438,19 +2438,3 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ - -/** - * xtime_update() - advances the timekeeping infrastructure - * @ticks: number of ticks, that have elapsed since the last call. - * - * Must be called with interrupts disabled. - */ -void xtime_update(unsigned long ticks) -{ - raw_spin_lock(&jiffies_lock); - write_seqcount_begin(&jiffies_seq); - do_timer(ticks); - write_seqcount_end(&jiffies_seq); - raw_spin_unlock(&jiffies_lock); - update_wall_time(); -} diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 099737f6f10c..d94b69c5b869 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -22,6 +22,7 @@ static inline int sched_clock_suspend(void) { return 0; } static inline void sched_clock_resume(void) { } #endif +extern void update_process_times(int user); extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -- cgit From e38d86b354f96e3ed6f5e6c7dbb442d7107fc0bd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 29 Oct 2020 15:05:08 +0800 Subject: sctp: add the error cause for new encapsulation port restart This patch adds a function that builds an ABORT chunk carrying the error cause for a new encapsulation port restart, defined in Section 4.4 of draft-tuexen-tsvwg-sctp-udp-encaps-cons-03. v1->v2: - no change. v2->v3: - no need to call htons() when setting nep.cur_port/new_port.
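For example (with made-up port numbers), an association currently using UDP encapsulation port 9899 that is restarted from port 10000 would send an ABORT carrying the cause: Cause Code = 14, Cause Length = 8, Current Encapsulation Port = 9899, New Encapsulation Port = 10000. Note that no byte-order conversion is needed when filling struct sctp_new_encap_port_hdr below: both ports are copied from fields that are already stored in network byte order, which is what the v2->v3 remark above refers to.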
Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- include/linux/sctp.h | 20 ++++++++++++++++++++ include/net/sctp/sm.h | 3 +++ net/sctp/sm_make_chunk.c | 20 ++++++++++++++++++++ 3 files changed, 43 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 76731230bbc5..bb1926589693 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -482,11 +482,13 @@ enum sctp_error { * 11 Restart of an association with new addresses * 12 User Initiated Abort * 13 Protocol Violation + * 14 Restart of an Association with New Encapsulation Port */ SCTP_ERROR_RESTART = cpu_to_be16(0x0b), SCTP_ERROR_USER_ABORT = cpu_to_be16(0x0c), SCTP_ERROR_PROTO_VIOLATION = cpu_to_be16(0x0d), + SCTP_ERROR_NEW_ENCAP_PORT = cpu_to_be16(0x0e), /* ADDIP Section 3.3 New Error Causes * @@ -793,4 +795,22 @@ enum { SCTP_FLOWLABEL_VAL_MASK = 0xfffff }; +/* UDP Encapsulation + * draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.html#section-4-4 + * + * The error cause indicating an "Restart of an Association with + * New Encapsulation Port" + * + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Cause Code = 14 | Cause Length = 8 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Current Encapsulation Port | New Encapsulation Port | + * +-------------------------------+-------------------------------+ + */ +struct sctp_new_encap_port_hdr { + __be16 cur_port; + __be16 new_port; +}; + #endif /* __LINUX_SCTP_H__ */ diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index a499341472ad..fd223c94589a 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -221,6 +221,9 @@ struct sctp_chunk *sctp_make_violation_paramlen( struct sctp_chunk *sctp_make_violation_max_retrans( const struct sctp_association *asoc, const struct sctp_chunk *chunk); +struct sctp_chunk *sctp_make_new_encap_port( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport); struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 21d0ff1c6ab9..f77484df097b 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1142,6 +1142,26 @@ nodata: return retval; } +struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_new_encap_port_hdr nep; + struct sctp_chunk *retval; + + retval = sctp_make_abort(asoc, chunk, + sizeof(struct sctp_errhdr) + sizeof(nep)); + if (!retval) + goto nodata; + + sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep)); + nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port; + nep.new_port = chunk->transport->encap_port; + sctp_addto_chunk(retval, sizeof(nep), &nep); + +nodata: + return retval; +} + /* Make a HEARTBEAT chunk. */ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport) -- cgit From 1887023a5e96f98249574e0785b7aa2e5742ca68 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Wed, 28 Oct 2020 11:15:40 -0600 Subject: net: phy: marvell: add special handling of Finisar modules with 88E1111 The Finisar FCLF8520P2BTL 1000BaseT SFP module uses a Marvel 88E1111 PHY with a modified PHY ID. Add support for this ID using the 88E1111 methods. 
By default these modules do not have 1000BaseX auto-negotiation enabled, which is not generally desirable with Linux networking drivers. Add handling to enable 1000BaseX auto-negotiation when these modules are used in 1000BaseX mode. Also, some special handling is required to ensure that 1000BaseT auto-negotiation is enabled properly when desired. Based on existing handling in the AMD xgbe driver and the information in the Finisar FAQ: https://www.finisar.com/sites/default/files/resources/an-2036_1000base-t_sfp_faqreve1.pdf Signed-off-by: Robert Hancock Reviewed-by: Russell King Link: https://lore.kernel.org/r/20201028171540.1700032-1-robert.hancock@calian.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/marvell.c | 100 +++++++++++++++++++++++++++++++++++++++++++- include/linux/marvell_phy.h | 3 ++ 2 files changed, 102 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 77540f5d2622..2563526bf4a6 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -80,8 +80,11 @@ #define MII_M1111_HWCFG_MODE_FIBER_RGMII 0x3 #define MII_M1111_HWCFG_MODE_SGMII_NO_CLK 0x4 #define MII_M1111_HWCFG_MODE_RTBI 0x7 +#define MII_M1111_HWCFG_MODE_COPPER_1000X_AN 0x8 #define MII_M1111_HWCFG_MODE_COPPER_RTBI 0x9 #define MII_M1111_HWCFG_MODE_COPPER_RGMII 0xb +#define MII_M1111_HWCFG_MODE_COPPER_1000X_NOAN 0xc +#define MII_M1111_HWCFG_SERIAL_AN_BYPASS BIT(12) #define MII_M1111_HWCFG_FIBER_COPPER_RES BIT(13) #define MII_M1111_HWCFG_FIBER_COPPER_AUTO BIT(15) @@ -629,6 +632,51 @@ static int marvell_config_aneg_fiber(struct phy_device *phydev) return genphy_check_and_restart_aneg(phydev, changed); } +static int m88e1111_config_aneg(struct phy_device *phydev) +{ + int extsr = phy_read(phydev, MII_M1111_PHY_EXT_SR); + int err; + + if (extsr < 0) + return extsr; + + /* If not using SGMII or copper 1000BaseX modes, use normal process. + * Steps below are only required for these modes. 
+ */ + if (phydev->interface != PHY_INTERFACE_MODE_SGMII && + (extsr & MII_M1111_HWCFG_MODE_MASK) != + MII_M1111_HWCFG_MODE_COPPER_1000X_AN) + return marvell_config_aneg(phydev); + + err = marvell_set_page(phydev, MII_MARVELL_COPPER_PAGE); + if (err < 0) + goto error; + + /* Configure the copper link first */ + err = marvell_config_aneg(phydev); + if (err < 0) + goto error; + + /* Do not touch the fiber page if we're in copper->sgmii mode */ + if (phydev->interface == PHY_INTERFACE_MODE_SGMII) + return 0; + + /* Then the fiber link */ + err = marvell_set_page(phydev, MII_MARVELL_FIBER_PAGE); + if (err < 0) + goto error; + + err = marvell_config_aneg_fiber(phydev); + if (err < 0) + goto error; + + return marvell_set_page(phydev, MII_MARVELL_COPPER_PAGE); + +error: + marvell_set_page(phydev, MII_MARVELL_COPPER_PAGE); + return err; +} + static int m88e1510_config_aneg(struct phy_device *phydev) { int err; @@ -814,6 +862,28 @@ static int m88e1111_config_init_rtbi(struct phy_device *phydev) MII_M1111_HWCFG_FIBER_COPPER_AUTO); } +static int m88e1111_config_init_1000basex(struct phy_device *phydev) +{ + int extsr = phy_read(phydev, MII_M1111_PHY_EXT_SR); + int err, mode; + + if (extsr < 0) + return extsr; + + /* If using copper mode, ensure 1000BaseX auto-negotiation is enabled */ + mode = extsr & MII_M1111_HWCFG_MODE_MASK; + if (mode == MII_M1111_HWCFG_MODE_COPPER_1000X_NOAN) { + err = phy_modify(phydev, MII_M1111_PHY_EXT_SR, + MII_M1111_HWCFG_MODE_MASK | + MII_M1111_HWCFG_SERIAL_AN_BYPASS, + MII_M1111_HWCFG_MODE_COPPER_1000X_AN | + MII_M1111_HWCFG_SERIAL_AN_BYPASS); + if (err < 0) + return err; + } + return 0; +} + static int m88e1111_config_init(struct phy_device *phydev) { int err; @@ -836,6 +906,12 @@ static int m88e1111_config_init(struct phy_device *phydev) return err; } + if (phydev->interface == PHY_INTERFACE_MODE_1000BASEX) { + err = m88e1111_config_init_1000basex(phydev); + if (err < 0) + return err; + } + err = marvell_of_reg_init(phydev); if (err < 0) return err; @@ -2658,7 +2734,28 @@ static struct phy_driver marvell_drivers[] = { /* PHY_GBIT_FEATURES */ .probe = marvell_probe, .config_init = m88e1111_config_init, - .config_aneg = marvell_config_aneg, + .config_aneg = m88e1111_config_aneg, + .read_status = marvell_read_status, + .ack_interrupt = marvell_ack_interrupt, + .config_intr = marvell_config_intr, + .resume = genphy_resume, + .suspend = genphy_suspend, + .read_page = marvell_read_page, + .write_page = marvell_write_page, + .get_sset_count = marvell_get_sset_count, + .get_strings = marvell_get_strings, + .get_stats = marvell_get_stats, + .get_tunable = m88e1111_get_tunable, + .set_tunable = m88e1111_set_tunable, + }, + { + .phy_id = MARVELL_PHY_ID_88E1111_FINISAR, + .phy_id_mask = MARVELL_PHY_ID_MASK, + .name = "Marvell 88E1111 (Finisar)", + /* PHY_GBIT_FEATURES */ + .probe = marvell_probe, + .config_init = m88e1111_config_init, + .config_aneg = m88e1111_config_aneg, .read_status = marvell_read_status, .ack_interrupt = marvell_ack_interrupt, .config_intr = marvell_config_intr, @@ -2989,6 +3086,7 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = { { MARVELL_PHY_ID_88E1101, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1112, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1111, MARVELL_PHY_ID_MASK }, + { MARVELL_PHY_ID_88E1111_FINISAR, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1118, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1121R, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1145, MARVELL_PHY_ID_MASK }, diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h 
index ff7b7607c8cf..52b1610eae68 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -25,6 +25,9 @@ #define MARVELL_PHY_ID_88X3310 0x002b09a0 #define MARVELL_PHY_ID_88E2110 0x002b09b0 +/* Marvel 88E1111 in Finisar SFP module with modified PHY ID */ +#define MARVELL_PHY_ID_88E1111_FINISAR 0x01ff0cc0 + /* The MV88e6390 Ethernet switch contains embedded PHYs. These PHYs do * not have a model ID. So the switch driver traps reads to the ID2 * register and returns the switch family ID -- cgit From 955062b03fa62b802a1ee34fbb04e39f7a70ae73 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 29 Oct 2020 01:38:31 +0200 Subject: net: bridge: mcast: add support for raw L2 multicast groups Extend the bridge multicast control and data path to configure routes for L2 (non-IP) multicast groups. The uapi struct br_mdb_entry union u is extended with another variant, mac_addr, which does not change the structure size, and which is valid when the proto field is zero. To be compatible with the forwarding code that is already in place, which acts as an IGMP/MLD snooping bridge with querier capabilities, we need to declare that for L2 MDB entries (for which there exists no such thing as IGMP/MLD snooping/querying), that there is always a querier. Otherwise, these entries would be flooded to all bridge ports and not just to those that are members of the L2 multicast group. Needless to say, only permanent L2 multicast groups can be installed on a bridge port. Signed-off-by: Nikolay Aleksandrov Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20201028233831.610076-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_bridge.h | 1 + net/bridge/br_device.c | 2 +- net/bridge/br_input.c | 2 +- net/bridge/br_mdb.c | 24 ++++++++++++++++++++++-- net/bridge/br_multicast.c | 13 +++++++++---- net/bridge/br_private.h | 10 ++++++++-- 7 files changed, 43 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 556caed00258..b979005ea39c 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -25,6 +25,7 @@ struct br_ip { #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ip6; #endif + unsigned char mac_addr[ETH_ALEN]; } dst; __be16 proto; __u16 vid; diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index d975e1223884..13d59c51ef5b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -651,6 +651,7 @@ struct br_mdb_entry { union { __be32 ip4; struct in6_addr ip6; + unsigned char mac_addr[ETH_ALEN]; } u; __be16 proto; } addr; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 9b5d62744acc..2400a66fe76e 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -93,7 +93,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb))) + br_multicast_querier_exists(br, eth_hdr(skb), mdst)) br_multicast_flood(mdst, skb, false, true); else br_flood(br, skb, BR_PKT_MULTICAST, false, true); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index bece03bf83c4..21808985f268 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -134,7 +134,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb case BR_PKT_MULTICAST: mdst = br_mdb_get(br, skb, vid); if ((mdst || 
BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb))) { + br_multicast_querier_exists(br, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || br_multicast_is_router(br)) { local_rcv = true; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index e15bab19a012..3c8863418d0b 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -87,6 +87,8 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #endif + default: + ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr); } } @@ -174,9 +176,11 @@ static int __mdb_fill_info(struct sk_buff *skb, if (mp->addr.proto == htons(ETH_P_IP)) e.addr.u.ip4 = mp->addr.dst.ip4; #if IS_ENABLED(CONFIG_IPV6) - if (mp->addr.proto == htons(ETH_P_IPV6)) + else if (mp->addr.proto == htons(ETH_P_IPV6)) e.addr.u.ip6 = mp->addr.dst.ip6; #endif + else + ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO); @@ -210,6 +214,8 @@ static int __mdb_fill_info(struct sk_buff *skb, } break; #endif + default: + ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); } if (p) { if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) @@ -562,9 +568,12 @@ void br_mdb_notify(struct net_device *dev, if (mp->addr.proto == htons(ETH_P_IP)) ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr); #if IS_ENABLED(CONFIG_IPV6) - else + else if (mp->addr.proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr); #endif + else + ether_addr_copy(mdb.addr, mp->addr.dst.mac_addr); + mdb.obj.orig_dev = pg->key.port->dev; switch (type) { case RTM_NEWMDB: @@ -693,6 +702,12 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry, return false; } #endif + } else if (entry->addr.proto == 0) { + /* L2 mdb */ + if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { + NL_SET_ERR_MSG_MOD(extack, "L2 entry group is not multicast"); + return false; + } } else { NL_SET_ERR_MSG_MOD(extack, "Unknown entry protocol"); return false; @@ -849,6 +864,11 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, } } + if (br_group_is_l2(&group) && entry->state != MDB_PERMANENT) { + NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); + return -EINVAL; + } + mp = br_mdb_ip_get(br, &group); if (!mp) { mp = br_multicast_new_group(br, &group); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index eae898c3cff7..484820c223a3 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -179,7 +179,8 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, break; #endif default: - return NULL; + ip.proto = 0; + ether_addr_copy(ip.dst.mac_addr, eth_hdr(skb)->h_dest); } return br_mdb_ip_get_rcu(br, &ip); @@ -1203,6 +1204,10 @@ void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB); } + + if (br_group_is_l2(&mp->addr)) + return; + mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval); } @@ -1254,8 +1259,8 @@ __br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode, - RTPROT_KERNEL); + p = br_multicast_new_port_group(port, group, *pp, 0, src, + filter_mode, RTPROT_KERNEL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); goto out; @@ -3690,7 +3695,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) memset(ð, 0, 
sizeof(eth)); eth.h_proto = htons(proto); - ret = br_multicast_querier_exists(br, ð); + ret = br_multicast_querier_exists(br, ð, NULL); unlock: rcu_read_unlock(); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 905d406a2fc7..4c691c371884 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -854,6 +854,11 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg); +static inline bool br_group_is_l2(const struct br_ip *group) +{ + return group->proto == 0; +} + #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -885,7 +890,8 @@ __br_multicast_querier_exists(struct net_bridge *br, } static inline bool br_multicast_querier_exists(struct net_bridge *br, - struct ethhdr *eth) + struct ethhdr *eth, + const struct net_bridge_mdb_entry *mdb) { switch (eth->h_proto) { case (htons(ETH_P_IP)): @@ -897,7 +903,7 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br, &br->ip6_other_query, true); #endif default: - return false; + return !!mdb && br_group_is_l2(&mdb->addr); } } -- cgit From ccf0a4b7fc688561428290265e4effde41446668 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Thu, 29 Oct 2020 16:39:48 +0100 Subject: netfilter: ipset: Add bucketsize parameter to all hash types The parameter defines the upper limit in any hash bucket at adding new entries from userspace - if the limit would be exceeded, ipset doubles the hash size and rehashes. It means the set may consume more memory but gives faster evaluation at matching in the set. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 5 ++++ include/uapi/linux/netfilter/ipset/ip_set.h | 4 ++- net/netfilter/ipset/ip_set_core.c | 2 ++ net/netfilter/ipset/ip_set_hash_gen.h | 38 +++++++++++++++++----------- net/netfilter/ipset/ip_set_hash_ip.c | 6 +++-- net/netfilter/ipset/ip_set_hash_ipmac.c | 5 ++-- net/netfilter/ipset/ip_set_hash_ipmark.c | 6 +++-- net/netfilter/ipset/ip_set_hash_ipport.c | 6 +++-- net/netfilter/ipset/ip_set_hash_ipportip.c | 6 +++-- net/netfilter/ipset/ip_set_hash_ipportnet.c | 6 +++-- net/netfilter/ipset/ip_set_hash_mac.c | 5 ++-- net/netfilter/ipset/ip_set_hash_net.c | 6 +++-- net/netfilter/ipset/ip_set_hash_netiface.c | 6 +++-- net/netfilter/ipset/ip_set_hash_netnet.c | 6 +++-- net/netfilter/ipset/ip_set_hash_netport.c | 6 +++-- net/netfilter/ipset/ip_set_hash_netportnet.c | 6 +++-- 16 files changed, 79 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ab192720e2d6..46d9a0c26c67 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -198,6 +198,9 @@ struct ip_set_region { u32 elements; /* Number of elements vs timeout */ }; +/* The max revision number supported by any set type + 1 */ +#define IPSET_REVISION_MAX 9 + /* The core set type structure */ struct ip_set_type { struct list_head list; @@ -215,6 +218,8 @@ struct ip_set_type { u8 family; /* Type revisions */ u8 revision_min, revision_max; + /* Revision-specific supported (create) flags */ + u8 create_flags[IPSET_REVISION_MAX+1]; /* Set features to control swapping */ u16 features; diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 11a72a938eb1..398f7b909b7d 100644 --- 
a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -96,7 +96,7 @@ enum { IPSET_ATTR_HASHSIZE, IPSET_ATTR_MAXELEM, IPSET_ATTR_NETMASK, - IPSET_ATTR_PROBES, + IPSET_ATTR_BUCKETSIZE, /* was unused IPSET_ATTR_PROBES */ IPSET_ATTR_RESIZE, IPSET_ATTR_SIZE, /* Kernel-only */ @@ -214,6 +214,8 @@ enum ipset_cadt_flags { enum ipset_create_flags { IPSET_CREATE_FLAG_BIT_FORCEADD = 0, IPSET_CREATE_FLAG_FORCEADD = (1 << IPSET_CREATE_FLAG_BIT_FORCEADD), + IPSET_CREATE_FLAG_BIT_BUCKETSIZE = 1, + IPSET_CREATE_FLAG_BUCKETSIZE = (1 << IPSET_CREATE_FLAG_BIT_BUCKETSIZE), IPSET_CREATE_FLAG_BIT_MAX = 7, }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e3c00dacec5c..e76bfca2d3ef 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1109,6 +1109,8 @@ static int ip_set_create(struct net *net, struct sock *ctnl, ret = -IPSET_ERR_PROTOCOL; goto put_out; } + /* Set create flags depending on the type revision */ + set->flags |= set->type->create_flags[revision]; ret = set->type->create(net, set, tb, flags); if (ret != 0) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 521e970be402..4e3544442b26 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -37,18 +37,18 @@ */ /* Number of elements to store in an initial array block */ -#define AHASH_INIT_SIZE 4 +#define AHASH_INIT_SIZE 2 /* Max number of elements to store in an array block */ -#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE) +#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE) /* Max muber of elements in the array block when tuned */ #define AHASH_MAX_TUNED 64 +#define AHASH_MAX(h) ((h)->bucketsize) + /* Max number of elements can be tuned */ #ifdef IP_SET_HASH_WITH_MULTI -#define AHASH_MAX(h) ((h)->ahash_max) - static u8 -tune_ahash_max(u8 curr, u32 multi) +tune_bucketsize(u8 curr, u32 multi) { u32 n; @@ -61,12 +61,10 @@ tune_ahash_max(u8 curr, u32 multi) */ return n > curr && n <= AHASH_MAX_TUNED ? 
n : curr; } - -#define TUNE_AHASH_MAX(h, multi) \ - ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) +#define TUNE_BUCKETSIZE(h, multi) \ + ((h)->bucketsize = tune_bucketsize((h)->bucketsize, multi)) #else -#define AHASH_MAX(h) AHASH_MAX_SIZE -#define TUNE_AHASH_MAX(h, multi) +#define TUNE_BUCKETSIZE(h, multi) #endif /* A hash bucket */ @@ -321,9 +319,7 @@ struct htype { #ifdef IP_SET_HASH_WITH_MARKMASK u32 markmask; /* markmask value for mark mask to store */ #endif -#ifdef IP_SET_HASH_WITH_MULTI - u8 ahash_max; /* max elements in an array block */ -#endif + u8 bucketsize; /* max elements in an array block */ #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif @@ -950,7 +946,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, goto set_full; /* Create a new slot */ if (n->pos >= n->size) { - TUNE_AHASH_MAX(h, multi); + TUNE_BUCKETSIZE(h, multi); if (n->size >= AHASH_MAX(h)) { /* Trigger rehashing */ mtype_data_next(&h->next, d); @@ -1305,6 +1301,9 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) goto nla_put_failure; #endif + if (set->flags & IPSET_CREATE_FLAG_BUCKETSIZE && + nla_put_u8(skb, IPSET_ATTR_BUCKETSIZE, h->bucketsize)) + goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements))) @@ -1548,7 +1547,16 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, h->markmask = markmask; #endif get_random_bytes(&h->initval, sizeof(h->initval)); - + h->bucketsize = AHASH_MAX_SIZE; + if (tb[IPSET_ATTR_BUCKETSIZE]) { + h->bucketsize = nla_get_u8(tb[IPSET_ATTR_BUCKETSIZE]); + if (h->bucketsize < AHASH_INIT_SIZE) + h->bucketsize = AHASH_INIT_SIZE; + else if (h->bucketsize > AHASH_MAX_SIZE) + h->bucketsize = AHASH_MAX_SIZE; + else if (h->bucketsize % 2) + h->bucketsize += 1; + } t->htable_bits = hbits; t->maxelem = h->maxelem / ahash_numof_locks(hbits); RCU_INIT_POINTER(h->table, t); diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 5d6d68eaf6a9..0495d515c498 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -23,7 +23,8 @@ /* 1 Counters support */ /* 2 Comments support */ /* 3 Forceadd support */ -#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ +/* 4 skbinfo support */ +#define IPSET_TYPE_REV_MAX 5 /* bucketsize support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -277,11 +278,12 @@ static struct ip_set_type hash_ip_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index eceb7bc4a93a..2655501f9fe3 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -23,7 +23,7 @@ #include #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 +#define IPSET_TYPE_REV_MAX 1 /* 
bucketsize support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Chilinski "); @@ -268,11 +268,12 @@ static struct ip_set_type hash_ipmac_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index aba1df617d6e..5bbed85d0e47 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -21,7 +21,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */ +/* 2 skbinfo support */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vytas Dauksa "); @@ -274,12 +275,13 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmark_create, .create_policy = { [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 1ff228717e29..c1ac2e89e2d3 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -25,7 +25,8 @@ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ +/* 5 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 6 /* bucketsize support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -341,11 +342,12 @@ static struct ip_set_type hash_ipport_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index fa88afd812fa..d3f4a672986e 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -25,7 +25,8 @@ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ +/* 5 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 6 /* bucketsize support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); 
@@ -356,11 +357,12 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index eef6ecfcb409..8f7fe360736a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -27,7 +27,8 @@ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ +/* 7 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -513,11 +514,12 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 0b61593165ef..00dd7e20df3c 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -16,7 +16,7 @@ #include #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 +#define IPSET_TYPE_REV_MAX 1 /* bucketsize support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -125,11 +125,12 @@ static struct ip_set_type hash_mac_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_mac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 136cf0781d3a..d366e816b6ed 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -24,7 +24,8 @@ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ +/* 6 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 7 /* bucketsize support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -354,11 +355,12 @@ static struct ip_set_type hash_net_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + 
.create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_net_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index be5e95a0d876..38b1d77584d4 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -26,7 +26,8 @@ /* 4 Comments support added */ /* 5 Forceadd support added */ /* 6 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */ +/* 7 interface wildcard support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -470,11 +471,12 @@ static struct ip_set_type hash_netiface_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netiface_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index da4ef910b12d..0cc7970f36e9 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -22,7 +22,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ +/* 2 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith "); @@ -459,11 +460,12 @@ static struct ip_set_type hash_netnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 34448df80fb9..b356d7d85e34 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -26,7 +26,8 @@ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ +/* 7 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -460,11 +461,12 @@ static struct ip_set_type hash_netport_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create 
= hash_netport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 934c1712cba8..eeb39688f26f 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -23,7 +23,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 0 Comments support added */ /* 1 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ +/* 2 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith "); @@ -558,11 +559,12 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, -- cgit From 3ebc0ef06e4a78522e9d1488dcf61b7d8fcfb792 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 11 Sep 2020 16:33:42 +0200 Subject: serial: s3c: Update path of Samsung S3C machine file Correct the path to the Samsung S3C24xx machine directory that is mentioned in the documentation comment. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200911143343.498-2-krzk@kernel.org --- include/linux/serial_s3c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h index 463ed28d2b27..ca2c5393dc6b 100644 --- a/include/linux/serial_s3c.h +++ b/include/linux/serial_s3c.h @@ -254,7 +254,7 @@ * serial port * * the pointer is setup by the machine specific initialisation from the - * arch/arm/mach-s3c2410/ directory. + * arch/arm/mach-s3c/ directory. */ struct s3c2410_uartcfg { -- cgit From 3a096c2bda7d2f0c0866d466447c41bc4b8ecdec Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:33:32 +0200 Subject: iio: fix a kernel-doc markup The name of a function differs between its prototype and its kernel-doc markup.
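kernel-doc attributes a comment block to the function named on its first line, so that name must match the prototype it precedes. A minimal example of the expected form (hypothetical function, for illustration only):

	/**
	 * foo_set_mode() - Set the device operating mode
	 * @dev: device to configure
	 * @mode: mode to select
	 */
	int foo_set_mode(struct foo_device *dev, int mode);

The hunk below corrects the stale iio_device_set_drvdata() name to iio_trigger_set_drvdata(), the function the comment actually documents.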
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/46622c3bdcffb76e79719f0fe5011c2952960b32.1603469755.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Cameron --- include/linux/iio/trigger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h index cad8325903f9..f930c4ccf378 100644 --- a/include/linux/iio/trigger.h +++ b/include/linux/iio/trigger.h @@ -97,7 +97,7 @@ static inline struct iio_trigger *iio_trigger_get(struct iio_trigger *trig) } /** - * iio_device_set_drvdata() - Set trigger driver data + * iio_trigger_set_drvdata() - Set trigger driver data * @trig: IIO trigger structure * @data: Driver specific data * -- cgit From d3fd65484c78102bac92f176c53537a7691a38b2 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 29 Oct 2020 18:29:59 +0100 Subject: net: core: add dev_sw_netstats_tx_add Add dev_sw_netstats_tx_add(), complementing the already existing dev_sw_netstats_rx_add(). Unlike dev_sw_netstats_rx_add(), it allows passing the number of packets as a function argument. Signed-off-by: Heiner Kallweit Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 964b494b0e8d..568fab708ac6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2557,6 +2557,18 @@ static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int l u64_stats_update_end(&tstats->syncp); } +static inline void dev_sw_netstats_tx_add(struct net_device *dev, + unsigned int packets, + unsigned int len) +{ + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += len; + tstats->tx_packets += packets; + u64_stats_update_end(&tstats->syncp); +} + static inline void dev_lstats_add(struct net_device *dev, unsigned int len) { struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats); -- cgit From 81b01894d792db8d4746c9c1b39c6e2a94fa0a04 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 29 Oct 2020 18:31:21 +0100 Subject: net: core: add devm_netdev_alloc_pcpu_stats We have netdev_alloc_pcpu_stats(), and we have devm_alloc_percpu(). Add a managed version of netdev_alloc_pcpu_stats, e.g. for allocating the per-cpu stats in the probe() callback of a driver. It needs to be a macro for dealing properly with the type argument.
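As a usage sketch (the foo_* names are hypothetical; only devm_netdev_alloc_pcpu_stats() and struct pcpu_sw_netstats come from the patch below), a probe() callback could allocate its per-cpu stats like this:

static int foo_probe(struct platform_device *pdev)
{
	struct net_device *ndev;

	ndev = foo_alloc_netdev(pdev);	/* hypothetical helper */
	if (!ndev)
		return -ENOMEM;

	/* Managed allocation: freed automatically when the driver detaches */
	ndev->tstats = devm_netdev_alloc_pcpu_stats(&pdev->dev,
						    struct pcpu_sw_netstats);
	if (!ndev->tstats)
		return -ENOMEM;

	return register_netdev(ndev);
}

The point of the devm variant is that no free_percpu() call is needed in the remove path.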
Signed-off-by: Heiner Kallweit Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 568fab708ac6..a53ed2d1ed1d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2596,6 +2596,20 @@ static inline void dev_lstats_add(struct net_device *dev, unsigned int len) #define netdev_alloc_pcpu_stats(type) \ __netdev_alloc_pcpu_stats(type, GFP_KERNEL) +#define devm_netdev_alloc_pcpu_stats(dev, type) \ +({ \ + typeof(type) __percpu *pcpu_stats = devm_alloc_percpu(dev, type);\ + if (pcpu_stats) { \ + int __cpu; \ + for_each_possible_cpu(__cpu) { \ + typeof(type) *stat; \ + stat = per_cpu_ptr(pcpu_stats, __cpu); \ + u64_stats_init(&stat->syncp); \ + } \ + } \ + pcpu_stats; \ +}) + enum netdev_lag_tx_type { NETDEV_LAG_TX_TYPE_UNKNOWN, NETDEV_LAG_TX_TYPE_RANDOM, -- cgit From 1c552e08b29895d31bbf82bdb549811cfde31db4 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:02 -0700 Subject: firmware: ti_sci: rm: Add support for tx_tdtype parameter for tx channel The system controller's resource manager has support for configuring the TDTYPE field of the TCHAN_CFG register on j721e. With this parameter the teardown completion can be controlled: TDTYPE == 0: Return without waiting for peer to complete the teardown TDTYPE == 1: Wait for peer to complete the teardown Signed-off-by: Peter Ujfalusi Reviewed-by: Tero Kristo Tested-by: Keerthy Reviewed-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 1 + drivers/firmware/ti_sci.h | 7 +++++++ include/linux/soc/ti/ti_sci_protocol.h | 2 ++ 3 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 896f53ec7857..65a8c2e82093 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2362,6 +2362,7 @@ static int ti_sci_cmd_rm_udmap_tx_ch_cfg(const struct ti_sci_handle *handle, req->fdepth = params->fdepth; req->tx_sched_priority = params->tx_sched_priority; req->tx_burst_size = params->tx_burst_size; + req->tx_tdtype = params->tx_tdtype; ret = ti_sci_do_xfer(info, xfer); if (ret) { diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 57cd04062994..dca19ca5fc49 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -910,6 +910,7 @@ struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg { * 12 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_credit_count * 13 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::fdepth * 14 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_burst_size + * 15 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_tdtype * * @nav_id: SoC device ID of Navigator Subsystem where tx channel is located * * @tx_burst_size: UDMAP transmit channel burst size configuration to be * programmed into the tx_burst_size field of the TCHAN_TCFG register.
+ * + * @tx_tdtype: UDMAP transmit channel teardown type configuration to be + * programmed into the tdtype field of the TCHAN_TCFG register: + * 0 - Return immediately + * 1 - Wait for completion message from remote peer */ struct ti_sci_msg_rm_udmap_tx_ch_cfg_req { struct ti_sci_msg_hdr hdr; @@ -994,6 +1000,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg_req { u16 fdepth; u8 tx_sched_priority; u8 tx_burst_size; + u8 tx_tdtype; } __packed; /** diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index cf27b080e148..d254d99fd45b 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -345,6 +345,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg { #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_SUPR_TDPKT_VALID BIT(11) #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_CREDIT_COUNT_VALID BIT(12) #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FDEPTH_VALID BIT(13) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_TDTYPE_VALID BIT(15) u16 nav_id; u16 index; u8 tx_pause_on_err; @@ -362,6 +363,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg { u16 fdepth; u8 tx_sched_priority; u8 tx_burst_size; + u8 tx_tdtype; }; /** -- cgit From 967a020bd3deddf9a0af73aeb4d4b46d90030937 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:03 -0700 Subject: firmware: ti_sci: Use struct ti_sci_resource_desc in get_range ops Use the ti_sci_resource_desc directly and update its start and num members instead of requiring individual parameters for them. This will allow easy extension of the RM parameters without changing the API. Signed-off-by: Peter Ujfalusi Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 32 +++++++++++++++----------------- include/linux/soc/ti/ti_sci_protocol.h | 32 ++++++++++++++++---------------- 2 files changed, 31 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 65a8c2e82093..7a777e91ce3e 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -1703,14 +1703,14 @@ fail: * @subtype: Resource assignment subtype that is being requested * from the given device. * @s_host: Host processor ID to which the resources are allocated - * @range_start: Start index of the resource range - * @range_num: Number of resources in the range + * @desc: Pointer to ti_sci_resource_desc to be updated with the + * resource range start index and number of resources * * Return: 0 if all went fine, else return appropriate error. */ static int ti_sci_get_resource_range(const struct ti_sci_handle *handle, u32 dev_id, u8 subtype, u8 s_host, - u16 *range_start, u16 *range_num) + struct ti_sci_resource_desc *desc) { struct ti_sci_msg_resp_get_resource_range *resp; struct ti_sci_msg_req_get_resource_range *req; @@ -1721,7 +1721,7 @@ static int ti_sci_get_resource_range(const struct ti_sci_handle *handle, if (IS_ERR(handle)) return PTR_ERR(handle); - if (!handle) + if (!handle || !desc) return -EINVAL; info = handle_to_ti_sci_info(handle); @@ -1754,8 +1754,8 @@ static int ti_sci_get_resource_range(const struct ti_sci_handle *handle, } else if (!resp->range_start && !resp->range_num) { ret = -ENODEV; } else { - *range_start = resp->range_start; - *range_num = resp->range_num; + desc->start = resp->range_start; + desc->num = resp->range_num; }; fail: @@ -1771,18 +1771,18 @@ fail: * @dev_id: TISCI device ID. * @subtype: Resource assignment subtype that is being requested * from the given device.
- * @range_start: Start index of the resource range - * @range_num: Number of resources in the range + * @desc: Pointer to ti_sci_resource_desc to be updated with the + * resource range start index and number of resources * * Return: 0 if all went fine, else return appropriate error. */ static int ti_sci_cmd_get_resource_range(const struct ti_sci_handle *handle, u32 dev_id, u8 subtype, - u16 *range_start, u16 *range_num) + struct ti_sci_resource_desc *desc) { return ti_sci_get_resource_range(handle, dev_id, subtype, TI_SCI_IRQ_SECONDARY_HOST_INVALID, - range_start, range_num); + desc); } /** @@ -1793,18 +1793,17 @@ static int ti_sci_cmd_get_resource_range(const struct ti_sci_handle *handle, * @subtype: Resource assignment subtype that is being requested * from the given device. * @s_host: Host processor ID to which the resources are allocated - * @range_start: Start index of the resource range - * @range_num: Number of resources in the range + * @desc: Pointer to ti_sci_resource_desc to be updated with the + * resource range start index and number of resources * * Return: 0 if all went fine, else return appropriate error. */ static int ti_sci_cmd_get_resource_range_from_shost(const struct ti_sci_handle *handle, u32 dev_id, u8 subtype, u8 s_host, - u16 *range_start, u16 *range_num) + struct ti_sci_resource_desc *desc) { - return ti_sci_get_resource_range(handle, dev_id, subtype, s_host, - range_start, range_num); + return ti_sci_get_resource_range(handle, dev_id, subtype, s_host, desc); } /** @@ -3243,8 +3242,7 @@ devm_ti_sci_get_resource_sets(const struct ti_sci_handle *handle, for (i = 0; i < res->sets; i++) { ret = handle->ops.rm_core_ops.get_range(handle, dev_id, sub_types[i], - &res->desc[i].start, - &res->desc[i].num); + &res->desc[i]); if (ret) { dev_dbg(dev, "dev = %d subtype %d not allocated for this host\n", dev_id, sub_types[i]); diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index d254d99fd45b..6cd537db4d33 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -195,6 +195,18 @@ struct ti_sci_clk_ops { u64 *current_freq); }; +/** + * struct ti_sci_resource_desc - Description of TI SCI resource instance range. + * @start: Start index of the resource. + * @num: Number of resources. + * @res_map: Bitmap to manage the allocation of these resources. + */ +struct ti_sci_resource_desc { + u16 start; + u16 num; + unsigned long *res_map; +}; + /** * struct ti_sci_rm_core_ops - Resource management core operations * @get_range: Get a range of resources belonging to ti sci host. @@ -209,15 +221,15 @@ struct ti_sci_clk_ops { * - dev_id: TISCI device ID. * - subtype: Resource assignment subtype that is being requested * from the given device. 
- * - range_start: Start index of the resource range - * - range_end: Number of resources in the range + * - desc: Pointer to ti_sci_resource_desc to be updated with the resource + * range start index and number of resources */ struct ti_sci_rm_core_ops { int (*get_range)(const struct ti_sci_handle *handle, u32 dev_id, - u8 subtype, u16 *range_start, u16 *range_num); + u8 subtype, struct ti_sci_resource_desc *desc); int (*get_range_from_shost)(const struct ti_sci_handle *handle, u32 dev_id, u8 subtype, u8 s_host, - u16 *range_start, u16 *range_num); + struct ti_sci_resource_desc *desc); }; #define TI_SCI_RESASG_SUBTYPE_IR_OUTPUT 0 @@ -522,18 +534,6 @@ struct ti_sci_handle { #define TI_SCI_RESOURCE_NULL 0xffff -/** - * struct ti_sci_resource_desc - Description of TI SCI resource instance range. - * @start: Start index of the resource. - * @num: Number of resources. - * @res_map: Bitmap to manage the allocation of these resources. - */ -struct ti_sci_resource_desc { - u16 start; - u16 num; - unsigned long *res_map; -}; - /** * struct ti_sci_resource - Structure representing a resource assigned * to a device. -- cgit From 519c5c0c558b529f835c9bb30f9a1eb2034d585c Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:03 -0700 Subject: firmware: ti_sci: rm: Add support for second resource range Sysfw added support for a second range in the resource range API to be able to describe complex allocations mainly for DMA channels. Update the ti_sci part to consider the second range as well. Signed-off-by: Peter Ujfalusi Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 48 ++++++++++++++++++++++------------ drivers/firmware/ti_sci.h | 8 ++++-- include/linux/soc/ti/ti_sci_protocol.h | 8 ++++-- 3 files changed, 43 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 7a777e91ce3e..2793bb923881 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -1751,11 +1751,14 @@ static int ti_sci_get_resource_range(const struct ti_sci_handle *handle, if (!ti_sci_is_response_ack(resp)) { ret = -ENODEV; - } else if (!resp->range_start && !resp->range_num) { + } else if (!resp->range_num && !resp->range_num_sec) { + /* Neither of the two resource range is valid */ ret = -ENODEV; } else { desc->start = resp->range_start; desc->num = resp->range_num; + desc->start_sec = resp->range_start_sec; + desc->num_sec = resp->range_num_sec; }; fail: @@ -3157,12 +3160,18 @@ u16 ti_sci_get_free_resource(struct ti_sci_resource *res) raw_spin_lock_irqsave(&res->lock, flags); for (set = 0; set < res->sets; set++) { - free_bit = find_first_zero_bit(res->desc[set].res_map, - res->desc[set].num); - if (free_bit != res->desc[set].num) { - set_bit(free_bit, res->desc[set].res_map); + struct ti_sci_resource_desc *desc = &res->desc[set]; + int res_count = desc->num + desc->num_sec; + + free_bit = find_first_zero_bit(desc->res_map, res_count); + if (free_bit != res_count) { + set_bit(free_bit, desc->res_map); raw_spin_unlock_irqrestore(&res->lock, flags); - return res->desc[set].start + free_bit; + + if (desc->num && free_bit < desc->num) + return desc->start + free_bit; + else + return desc->start_sec + free_bit; } } raw_spin_unlock_irqrestore(&res->lock, flags); @@ -3183,10 +3192,14 @@ void ti_sci_release_resource(struct ti_sci_resource *res, u16 id) raw_spin_lock_irqsave(&res->lock, flags); for (set = 0; set < res->sets; set++) { - if (res->desc[set].start <= id && - (res->desc[set].num + 
res->desc[set].start) > id) - clear_bit(id - res->desc[set].start, - res->desc[set].res_map); + struct ti_sci_resource_desc *desc = &res->desc[set]; + + if (desc->num && desc->start <= id && + (desc->start + desc->num) > id) + clear_bit(id - desc->start, desc->res_map); + else if (desc->num_sec && desc->start_sec <= id && + (desc->start_sec + desc->num_sec) > id) + clear_bit(id - desc->start_sec, desc->res_map); } raw_spin_unlock_irqrestore(&res->lock, flags); } @@ -3203,7 +3216,7 @@ u32 ti_sci_get_num_resources(struct ti_sci_resource *res) u32 set, count = 0; for (set = 0; set < res->sets; set++) - count += res->desc[set].num; + count += res->desc[set].num + res->desc[set].num_sec; return count; } @@ -3227,7 +3240,7 @@ devm_ti_sci_get_resource_sets(const struct ti_sci_handle *handle, { struct ti_sci_resource *res; bool valid_set = false; - int i, ret; + int i, ret, res_count; res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL); if (!res) @@ -3246,18 +3259,19 @@ devm_ti_sci_get_resource_sets(const struct ti_sci_handle *handle, if (ret) { dev_dbg(dev, "dev = %d subtype %d not allocated for this host\n", dev_id, sub_types[i]); - res->desc[i].start = 0; - res->desc[i].num = 0; + memset(&res->desc[i], 0, sizeof(res->desc[i])); continue; } - dev_dbg(dev, "dev = %d, subtype = %d, start = %d, num = %d\n", + dev_dbg(dev, "dev/sub_type: %d/%d, start/num: %d/%d | %d/%d\n", dev_id, sub_types[i], res->desc[i].start, - res->desc[i].num); + res->desc[i].num, res->desc[i].start_sec, + res->desc[i].num_sec); valid_set = true; + res_count = res->desc[i].num + res->desc[i].num_sec; res->desc[i].res_map = - devm_kzalloc(dev, BITS_TO_LONGS(res->desc[i].num) * + devm_kzalloc(dev, BITS_TO_LONGS(res_count) * sizeof(*res->desc[i].res_map), GFP_KERNEL); if (!res->desc[i].res_map) return ERR_PTR(-ENOMEM); diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index dca19ca5fc49..4d980eb592c4 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -574,8 +574,10 @@ struct ti_sci_msg_req_get_resource_range { /** * struct ti_sci_msg_resp_get_resource_range - Response to resource get range. * @hdr: Generic Header - * @range_start: Start index of the resource range. - * @range_num: Number of resources in the range. + * @range_start: Start index of the first resource range. + * @range_num: Number of resources in the first range. + * @range_start_sec: Start index of the second resource range. + * @range_num_sec: Number of resources in the second range. * * Response to request TI_SCI_MSG_GET_RESOURCE_RANGE. */ @@ -583,6 +585,8 @@ struct ti_sci_msg_resp_get_resource_range { struct ti_sci_msg_hdr hdr; u16 range_start; u16 range_num; + u16 range_start_sec; + u16 range_num_sec; } __packed; /** diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 6cd537db4d33..9699b260de59 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -197,13 +197,17 @@ struct ti_sci_clk_ops { /** * struct ti_sci_resource_desc - Description of TI SCI resource instance range. - * @start: Start index of the resource. - * @num: Number of resources. + * @start: Start index of the first resource range. + * @num: Number of resources in the first range. + * @start_sec: Start index of the second resource range. + * @num_sec: Number of resources in the second range. * @res_map: Bitmap to manage the allocation of these resources. 
*/ struct ti_sci_resource_desc { u16 start; u16 num; + u16 start_sec; + u16 num_sec; unsigned long *res_map; }; -- cgit From ce1feed58534d8489afb4900bee75dff15d950e0 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:05 -0700 Subject: firmware: ti_sci: rm: Add support for extended_ch_type for tx channel Sysfw added 'extended_ch_type' to the tx_ch_cfg_req message which should be used when BCDMA block copy channels are configured: extended_ch_type = 0 : the channel is split tx channel (tchan) extended_ch_type = 1 : the channel is block copy channel (bchan) Signed-off-by: Peter Ujfalusi Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 1 + drivers/firmware/ti_sci.h | 6 ++++++ include/linux/soc/ti/ti_sci_protocol.h | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 2793bb923881..0dd3fbb4f964 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2365,6 +2365,7 @@ static int ti_sci_cmd_rm_udmap_tx_ch_cfg(const struct ti_sci_handle *handle, req->tx_sched_priority = params->tx_sched_priority; req->tx_burst_size = params->tx_burst_size; req->tx_tdtype = params->tx_tdtype; + req->extended_ch_type = params->extended_ch_type; ret = ti_sci_do_xfer(info, xfer); if (ret) { diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 4d980eb592c4..ca15d8f1f8de 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -915,6 +915,7 @@ struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg { * 13 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::fdepth * 14 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_burst_size * 15 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_tdtype + * 16 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::extended_ch_type * * @nav_id: SoC device ID of Navigator Subsystem where tx channel is located * @@ -983,6 +984,10 @@ struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg { * programmed into the tdtype field of the TCHAN_TCFG register: * 0 - Return immediately * 1 - Wait for completion message from remote peer + * + * @extended_ch_type: Valid for BCDMA. 
+ * 0 - the channel is split tx channel (tchan) + * 1 - the channel is block copy channel (bchan) */ struct ti_sci_msg_rm_udmap_tx_ch_cfg_req { struct ti_sci_msg_hdr hdr; @@ -1005,6 +1010,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg_req { u16 fdepth; u8 tx_sched_priority; u8 tx_burst_size; u8 tx_tdtype; + u8 extended_ch_type; } __packed; /** diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 9699b260de59..6978afc00823 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -336,6 +336,9 @@ struct ti_sci_rm_psil_ops { #define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES 2 #define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES 3 +#define TI_SCI_RM_BCDMA_EXTENDED_CH_TYPE_TCHAN 0 +#define TI_SCI_RM_BCDMA_EXTENDED_CH_TYPE_BCHAN 1 + /* UDMAP TX/RX channel valid_params common declarations */ #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID BIT(0) #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID BIT(1) @@ -362,6 +365,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg { #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_CREDIT_COUNT_VALID BIT(12) #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FDEPTH_VALID BIT(13) #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_TDTYPE_VALID BIT(15) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_EXTENDED_CH_TYPE_VALID BIT(16) u16 nav_id; u16 index; u8 tx_pause_on_err; @@ -380,6 +384,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg { u8 tx_sched_priority; u8 tx_burst_size; u8 tx_tdtype; + u8 extended_ch_type; }; /** -- cgit From 4d8ddf673a420aa25668eceeb4fbf33e2521fdf2 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:05 -0700 Subject: firmware: ti_sci: rm: Remove ring_get_config support The ring_get_cfg (0x1111 message) is not used and has not been supported by sysfw for a long time. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 80 ---------------------------------- drivers/firmware/ti_sci.h | 44 ------------------- include/linux/soc/ti/ti_sci_protocol.h | 6 --- 3 files changed, 130 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 0dd3fbb4f964..0b801e67e672 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2119,85 +2119,6 @@ fail: return ret; } -/** - * ti_sci_cmd_ring_get_config() - get RA ring configuration - * @handle: Pointer to TI SCI handle. - * @nav_id: Device ID of Navigator Subsystem from which the ring is - * allocated - * @index: Ring index - * @addr_lo: Returns ring's base address lo 32 bits - * @addr_hi: Returns ring's base address hi 32 bits - * @count: Returns number of ring elements - * @mode: Returns mode of the ring - * @size: Returns ring element size - * @order_id: Returns ring's bus order ID - * - * Return: 0 if all went well, else returns appropriate error value. - * - * See @ti_sci_msg_rm_ring_get_cfg_req for more info.
- */ -static int ti_sci_cmd_ring_get_config(const struct ti_sci_handle *handle, - u32 nav_id, u32 index, u8 *mode, - u32 *addr_lo, u32 *addr_hi, - u32 *count, u8 *size, u8 *order_id) -{ - struct ti_sci_msg_rm_ring_get_cfg_resp *resp; - struct ti_sci_msg_rm_ring_get_cfg_req *req; - struct ti_sci_xfer *xfer; - struct ti_sci_info *info; - struct device *dev; - int ret = 0; - - if (IS_ERR_OR_NULL(handle)) - return -EINVAL; - - info = handle_to_ti_sci_info(handle); - dev = info->dev; - - xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_GET_CFG, - TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, - sizeof(*req), sizeof(*resp)); - if (IS_ERR(xfer)) { - ret = PTR_ERR(xfer); - dev_err(dev, - "RM_RA:Message get config failed(%d)\n", ret); - return ret; - } - req = (struct ti_sci_msg_rm_ring_get_cfg_req *)xfer->xfer_buf; - req->nav_id = nav_id; - req->index = index; - - ret = ti_sci_do_xfer(info, xfer); - if (ret) { - dev_err(dev, "RM_RA:Mbox get config send fail %d\n", ret); - goto fail; - } - - resp = (struct ti_sci_msg_rm_ring_get_cfg_resp *)xfer->xfer_buf; - - if (!ti_sci_is_response_ack(resp)) { - ret = -ENODEV; - } else { - if (mode) - *mode = resp->mode; - if (addr_lo) - *addr_lo = resp->addr_lo; - if (addr_hi) - *addr_hi = resp->addr_hi; - if (count) - *count = resp->count; - if (size) - *size = resp->size; - if (order_id) - *order_id = resp->order_id; - }; - -fail: - ti_sci_put_one_xfer(&info->minfo, xfer); - dev_dbg(dev, "RM_RA:get config ring %u ret:%d\n", index, ret); - return ret; -} - /** * ti_sci_cmd_rm_psil_pair() - Pair PSI-L source to destination thread * @handle: Pointer to TI SCI handle. @@ -2926,7 +2847,6 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) iops->free_event_map = ti_sci_cmd_free_event_map; rops->config = ti_sci_cmd_ring_config; - rops->get_config = ti_sci_cmd_ring_get_config; psilops->pair = ti_sci_cmd_rm_psil_pair; psilops->unpair = ti_sci_cmd_rm_psil_unpair; diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index ca15d8f1f8de..1cdf918be861 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -49,7 +49,6 @@ #define TI_SCI_MSG_RM_RING_RECONFIG 0x1102 #define TI_SCI_MSG_RM_RING_RESET 0x1103 #define TI_SCI_MSG_RM_RING_CFG 0x1110 -#define TI_SCI_MSG_RM_RING_GET_CFG 0x1111 /* PSI-L requests */ #define TI_SCI_MSG_RM_PSIL_PAIR 0x1280 @@ -687,49 +686,6 @@ struct ti_sci_msg_rm_ring_cfg_req { u8 order_id; } __packed; -/** - * struct ti_sci_msg_rm_ring_get_cfg_req - Get RA ring's configuration - * - * Gets the configuration of the non-real-time register fields of a ring. The - * host, or a supervisor of the host, who owns the ring must be the requesting - * host. The values of the non-real-time registers are returned in - * @ti_sci_msg_rm_ring_get_cfg_resp. - * - * @hdr: Generic Header - * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated - * @index: ring index. - */ -struct ti_sci_msg_rm_ring_get_cfg_req { - struct ti_sci_msg_hdr hdr; - u16 nav_id; - u16 index; -} __packed; - -/** - * struct ti_sci_msg_rm_ring_get_cfg_resp - Ring get configuration response - * - * Response received by host processor after RM has handled - * @ti_sci_msg_rm_ring_get_cfg_req. The response contains the ring's - * non-real-time register values. - * - * @hdr: Generic Header - * @addr_lo: Ring 32 LSBs of base address - * @addr_hi: Ring 16 MSBs of base address. - * @count: Ring number of elements. - * @mode: Ring mode. - * @size: encoded Ring element size - * @order_id: ing order ID. 
- */ -struct ti_sci_msg_rm_ring_get_cfg_resp { - struct ti_sci_msg_hdr hdr; - u32 addr_lo; - u32 addr_hi; - u32 count; - u8 mode; - u8 size; - u8 order_id; -} __packed; - /** * struct ti_sci_msg_psil_pair - Pairs a PSI-L source thread to a destination * thread diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 6978afc00823..6710d7ac7a72 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -286,8 +286,6 @@ struct ti_sci_rm_irq_ops { /** * struct ti_sci_rm_ringacc_ops - Ring Accelerator Management operations * @config: configure the SoC Navigator Subsystem Ring Accelerator ring - * @get_config: get the SoC Navigator Subsystem Ring Accelerator ring - * configuration */ struct ti_sci_rm_ringacc_ops { int (*config)(const struct ti_sci_handle *handle, @@ -295,10 +293,6 @@ struct ti_sci_rm_ringacc_ops { u32 addr_lo, u32 addr_hi, u32 count, u8 mode, u8 size, u8 order_id ); - int (*get_config)(const struct ti_sci_handle *handle, - u32 nav_id, u32 index, u8 *mode, - u32 *addr_lo, u32 *addr_hi, u32 *count, - u8 *size, u8 *order_id); }; /** -- cgit From 3c2017536f3a122bf246cc87f9327e9ec138db92 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:06 -0700 Subject: firmware: ti_sci: rm: Add new ops for ring configuration The sysfw ring configuration message has been extended to include virtid and asel value for the ring. Add the ASEL_VALID to TI_SCI_MSG_VALUE_RM_ALL_NO_ORDER as it is required for DMA rings. Instead of extending the current .config() ops - which would need the same patch change in the ringacc driver - add a ti_sci_msg_rm_ring_cfg struct and a new op using it to configure the ring. This will allow an easy update path in case new members are added for the ring configuration. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 63 ++++++++++++++++++++++++++++++++++ drivers/firmware/ti_sci.h | 7 ++++ include/linux/soc/ti/ti_sci_protocol.h | 31 ++++++++++++++++- 3 files changed, 100 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 0b801e67e672..a4d2b318795c 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2119,6 +2119,68 @@ fail: return ret; } +/** + * ti_sci_cmd_rm_ring_cfg() - Configure a NAVSS ring + * @handle: Pointer to TI SCI handle. + * @params: Pointer to ti_sci_msg_rm_ring_cfg ring config structure + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_ring_cfg and @ti_sci_msg_rm_ring_cfg_req for + * more info.
+ */ +static int ti_sci_cmd_rm_ring_cfg(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_ring_cfg *params) +{ + struct ti_sci_msg_rm_ring_cfg_req *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "RM_RA:Message config failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_ring_cfg_req *)xfer->xfer_buf; + req->valid_params = params->valid_params; + req->nav_id = params->nav_id; + req->index = params->index; + req->addr_lo = params->addr_lo; + req->addr_hi = params->addr_hi; + req->count = params->count; + req->mode = params->mode; + req->size = params->size; + req->order_id = params->order_id; + req->virtid = params->virtid; + req->asel = params->asel; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "RM_RA:Mbox config send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(dev, "RM_RA:config ring %u ret:%d\n", params->index, ret); + return ret; +} + /** * ti_sci_cmd_rm_psil_pair() - Pair PSI-L source to destination thread * @handle: Pointer to TI SCI handle. @@ -2847,6 +2909,7 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) iops->free_event_map = ti_sci_cmd_free_event_map; rops->config = ti_sci_cmd_ring_config; + rops->set_cfg = ti_sci_cmd_rm_ring_cfg; psilops->pair = ti_sci_cmd_rm_psil_pair; psilops->unpair = ti_sci_cmd_rm_psil_unpair; diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 1cdf918be861..ef3a8214d002 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -659,6 +659,8 @@ struct ti_sci_msg_req_manage_irq { * 3 - Valid bit for @tisci_msg_rm_ring_cfg_req mode * 4 - Valid bit for @tisci_msg_rm_ring_cfg_req size * 5 - Valid bit for @tisci_msg_rm_ring_cfg_req order_id + * 6 - Valid bit for @tisci_msg_rm_ring_cfg_req virtid + * 7 - Valid bit for @tisci_msg_rm_ring_cfg_req ASEL * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated * @index: ring index to be configured. * @addr_lo: 32 LSBs of ring base address to be programmed into the ring's @@ -672,6 +674,9 @@ struct ti_sci_msg_req_manage_irq { * the formula (log2(size_bytes) - 2), where size_bytes cannot be * greater than 256. * @order_id: Specifies the ring's bus order ID. + * @virtid: Ring virt ID value + * @asel: Ring ASEL (address select) value to be set into the ASEL field of the + * ring's RING_BA_HI register. 
 */ struct ti_sci_msg_rm_ring_cfg_req { struct ti_sci_msg_hdr hdr; @@ -684,6 +689,8 @@ struct ti_sci_msg_rm_ring_cfg_req { u8 mode; u8 size; u8 order_id; + u16 virtid; + u8 asel; } __packed; /** diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 6710d7ac7a72..d1711050cd9d 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -275,17 +275,44 @@ struct ti_sci_rm_irq_ops { #define TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID BIT(4) /* RA config.order_id parameter is valid for RM ring configure TISCI message */ #define TI_SCI_MSG_VALUE_RM_RING_ORDER_ID_VALID BIT(5) +/* RA config.virtid parameter is valid for RM ring configure TISCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_VIRTID_VALID BIT(6) +/* RA config.asel parameter is valid for RM ring configure TISCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_ASEL_VALID BIT(7) #define TI_SCI_MSG_VALUE_RM_ALL_NO_ORDER \ (TI_SCI_MSG_VALUE_RM_RING_ADDR_LO_VALID | \ TI_SCI_MSG_VALUE_RM_RING_ADDR_HI_VALID | \ TI_SCI_MSG_VALUE_RM_RING_COUNT_VALID | \ TI_SCI_MSG_VALUE_RM_RING_MODE_VALID | \ - TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID) + TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID | \ + TI_SCI_MSG_VALUE_RM_RING_ASEL_VALID) + +/** + * struct ti_sci_msg_rm_ring_cfg - Ring configuration + * + * Parameters for Navigator Subsystem ring configuration + * See @ti_sci_msg_rm_ring_cfg_req + */ +struct ti_sci_msg_rm_ring_cfg { + u32 valid_params; + u16 nav_id; + u16 index; + u32 addr_lo; + u32 addr_hi; + u32 count; + u8 mode; + u8 size; + u8 order_id; + u16 virtid; + u8 asel; +}; /** * struct ti_sci_rm_ringacc_ops - Ring Accelerator Management operations * @config: configure the SoC Navigator Subsystem Ring Accelerator ring + * Deprecated + * @set_cfg: configure the SoC Navigator Subsystem Ring Accelerator ring */ struct ti_sci_rm_ringacc_ops { int (*config)(const struct ti_sci_handle *handle, @@ -293,6 +320,8 @@ struct ti_sci_rm_ringacc_ops { u32 addr_lo, u32 addr_hi, u32 count, u8 mode, u8 size, u8 order_id ); + int (*set_cfg)(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_ring_cfg *params); }; /** -- cgit From fed7552f1e69296461fca62ebaa0bb5a06fec0df Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:07 -0700 Subject: firmware: ti_sci: rm: Remove unused config() from ti_sci_rm_ringacc_ops The ringacc driver has been converted to use the new set_cfg function to configure the ring, so the old config ops can be removed. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 72 ---------------------------------- include/linux/soc/ti/ti_sci_protocol.h | 7 ---- 2 files changed, 79 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index a4d2b318795c..235c7e7869aa 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2048,77 +2048,6 @@ static int ti_sci_cmd_free_event_map(const struct ti_sci_handle *handle, ia_id, vint, global_event, vint_status_bit, 0); } -/** - * ti_sci_cmd_ring_config() - configure RA ring - * @handle: Pointer to TI SCI handle.
- * @valid_params: Bitfield defining validity of ring configuration - * parameters - * @nav_id: Device ID of Navigator Subsystem from which the ring is - * allocated - * @index: Ring index - * @addr_lo: The ring base address lo 32 bits - * @addr_hi: The ring base address hi 32 bits - * @count: Number of ring elements - * @mode: The mode of the ring - * @size: The ring element size. - * @order_id: Specifies the ring's bus order ID - * - * Return: 0 if all went well, else returns appropriate error value. - * - * See @ti_sci_msg_rm_ring_cfg_req for more info. - */ -static int ti_sci_cmd_ring_config(const struct ti_sci_handle *handle, - u32 valid_params, u16 nav_id, u16 index, - u32 addr_lo, u32 addr_hi, u32 count, - u8 mode, u8 size, u8 order_id) -{ - struct ti_sci_msg_rm_ring_cfg_req *req; - struct ti_sci_msg_hdr *resp; - struct ti_sci_xfer *xfer; - struct ti_sci_info *info; - struct device *dev; - int ret = 0; - - if (IS_ERR_OR_NULL(handle)) - return -EINVAL; - - info = handle_to_ti_sci_info(handle); - dev = info->dev; - - xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_CFG, - TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, - sizeof(*req), sizeof(*resp)); - if (IS_ERR(xfer)) { - ret = PTR_ERR(xfer); - dev_err(dev, "RM_RA:Message config failed(%d)\n", ret); - return ret; - } - req = (struct ti_sci_msg_rm_ring_cfg_req *)xfer->xfer_buf; - req->valid_params = valid_params; - req->nav_id = nav_id; - req->index = index; - req->addr_lo = addr_lo; - req->addr_hi = addr_hi; - req->count = count; - req->mode = mode; - req->size = size; - req->order_id = order_id; - - ret = ti_sci_do_xfer(info, xfer); - if (ret) { - dev_err(dev, "RM_RA:Mbox config send fail %d\n", ret); - goto fail; - } - - resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; - ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV; - -fail: - ti_sci_put_one_xfer(&info->minfo, xfer); - dev_dbg(dev, "RM_RA:config ring %u ret:%d\n", index, ret); - return ret; -} - /** * ti_sci_cmd_rm_ring_cfg() - Configure a NAVSS ring * @handle: Pointer to TI SCI handle. @@ -2908,7 +2837,6 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) iops->free_irq = ti_sci_cmd_free_irq; iops->free_event_map = ti_sci_cmd_free_event_map; - rops->config = ti_sci_cmd_ring_config; rops->set_cfg = ti_sci_cmd_rm_ring_cfg; psilops->pair = ti_sci_cmd_rm_psil_pair; diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index d1711050cd9d..0aad7009b50e 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -310,16 +310,9 @@ struct ti_sci_msg_rm_ring_cfg { /** * struct ti_sci_rm_ringacc_ops - Ring Accelerator Management operations - * @config: configure the SoC Navigator Subsystem Ring Accelerator ring - * Deprecated * @set_cfg: configure the SoC Navigator Subsystem Ring Accelerator ring */ struct ti_sci_rm_ringacc_ops { - int (*config)(const struct ti_sci_handle *handle, - u32 valid_params, u16 nav_id, u16 index, - u32 addr_lo, u32 addr_hi, u32 count, u8 mode, - u8 size, u8 order_id - ); int (*set_cfg)(const struct ti_sci_handle *handle, const struct ti_sci_msg_rm_ring_cfg *params); }; -- cgit From 8c42379e40e2db4199ceeb6a6ef9fff73ff132cf Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:22 -0700 Subject: soc: ti: k3-ringacc: Use correct device for allocation in RING mode In RING mode the ringacc does not access the ring memory. In this access mode the ringacc coherency does not have meaning. 
If the ring is configured in RING mode, then the ringacc itself will not access the ring memory. Only the requester (user) of the ring is going to read/write to the memory. Extend the ring configuration parameters with a device pointer to be used for the DMA API when the ring is configured in RING mode. Extending the ring configuration struct will allow per-ring selection of the device to be used for allocation, thus allowing per-ring coherency. To avoid regression, fall back to using the ringacc dev in case the alloc_dev is not provided. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar --- drivers/soc/ti/k3-ringacc.c | 18 +++++++++++++----- include/linux/soc/ti/k3-ringacc.h | 5 +++++ 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/ti/k3-ringacc.c b/drivers/soc/ti/k3-ringacc.c index 9ddd77113c5a..7fdb688452f7 100644 --- a/drivers/soc/ti/k3-ringacc.c +++ b/drivers/soc/ti/k3-ringacc.c @@ -141,6 +141,7 @@ struct k3_ring_state { * @parent: Pointer on struct @k3_ringacc * @use_count: Use count for shared rings * @proxy_id: RA Ring Proxy Id (only if @K3_RINGACC_RING_USE_PROXY) + * @dma_dev: device to be used for DMA API (allocation, mapping) */ struct k3_ring { struct k3_ring_rt_regs __iomem *rt; @@ -160,6 +161,7 @@ struct k3_ring { struct k3_ringacc *parent; u32 use_count; int proxy_id; + struct device *dma_dev; }; struct k3_ringacc_ops { @@ -508,11 +510,12 @@ int k3_ringacc_ring_free(struct k3_ring *ring) k3_ringacc_ring_free_sci(ring); - dma_free_coherent(ringacc->dev, + dma_free_coherent(ring->dma_dev, ring->size * (4 << ring->elm_size), ring->ring_mem_virt, ring->ring_mem_dma); ring->flags = 0; ring->ops = NULL; + ring->dma_dev = NULL; if (ring->proxy_id != K3_RINGACC_PROXY_NOT_USED) { clear_bit(ring->proxy_id, ringacc->proxy_inuse); ring->proxy = NULL; @@ -633,8 +636,12 @@ int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) switch (ring->mode) { case K3_RINGACC_RING_MODE_RING: ring->ops = &k3_ring_mode_ring_ops; + ring->dma_dev = cfg->dma_dev; + if (!ring->dma_dev) + ring->dma_dev = ringacc->dev; break; case K3_RINGACC_RING_MODE_MESSAGE: + ring->dma_dev = ringacc->dev; if (ring->proxy) ring->ops = &k3_ring_mode_proxy_ops; else @@ -646,9 +653,9 @@ int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) goto err_free_proxy; } - ring->ring_mem_virt = dma_alloc_coherent(ringacc->dev, - ring->size * (4 << ring->elm_size), - &ring->ring_mem_dma, GFP_KERNEL); + ring->ring_mem_virt = dma_alloc_coherent(ring->dma_dev, + ring->size * (4 << ring->elm_size), + &ring->ring_mem_dma, GFP_KERNEL); if (!ring->ring_mem_virt) { dev_err(ringacc->dev, "Failed to alloc ring mem\n"); ret = -ENOMEM; @@ -669,12 +676,13 @@ int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) return 0; err_free_mem: - dma_free_coherent(ringacc->dev, + dma_free_coherent(ring->dma_dev, ring->size * (4 << ring->elm_size), ring->ring_mem_virt, ring->ring_mem_dma); err_free_ops: ring->ops = NULL; + ring->dma_dev = NULL; err_free_proxy: ring->proxy = NULL; return ret; diff --git a/include/linux/soc/ti/k3-ringacc.h b/include/linux/soc/ti/k3-ringacc.h index 5a472eca5ee4..658dc71d2901 100644 --- a/include/linux/soc/ti/k3-ringacc.h +++ b/include/linux/soc/ti/k3-ringacc.h @@ -67,6 +67,9 @@ struct k3_ring; * few times. It's usable when the same ring is used as Free Host PD ring * for different flows, for example.
* Note: Locking should be done by consumer if required + * @dma_dev: Master device which is using and accessing to the ring + * memory when the mode is K3_RINGACC_RING_MODE_RING. Memory allocations + * should be done using this device. */ struct k3_ring_cfg { u32 size; @@ -74,6 +77,8 @@ struct k3_ring_cfg { enum k3_ring_mode mode; #define K3_RINGACC_RING_SHARED BIT(1) u32 flags; + + struct device *dma_dev; }; #define K3_RINGACC_RING_ID_ANY (-1) -- cgit From f51778db088b2407ec177f2f4da0f6290602aa3f Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 2 Nov 2020 12:43:27 +1100 Subject: swiotlb: using SIZE_MAX needs limits.h included After merging the drm-misc tree, linux-next build (arm multi_v7_defconfig) failed like this: In file included from drivers/gpu/drm/nouveau/nouveau_ttm.c:26: include/linux/swiotlb.h: In function 'swiotlb_max_mapping_size': include/linux/swiotlb.h:99:9: error: 'SIZE_MAX' undeclared (first use in this function) 99 | return SIZE_MAX; | ^~~~~~~~ include/linux/swiotlb.h:7:1: note: 'SIZE_MAX' is defined in header ''; did you forget to '#include '? 6 | #include +++ |+#include 7 | #include include/linux/swiotlb.h:99:9: note: each undeclared identifier is reported only once for each function it appears in 99 | return SIZE_MAX; | ^~~~~~~~ Caused by commit abe420bfae52 ("swiotlb: Introduce swiotlb_max_mapping_size()") but only exposed by commit "drm/nouveu: fix swiotlb include" Fix it by including linux/limits.h as appropriate. Fixes: abe420bfae52 ("swiotlb: Introduce swiotlb_max_mapping_size()") Signed-off-by: Stephen Rothwell Link: https://lore.kernel.org/r/20201102124327.2f82b2a7@canb.auug.org.au Signed-off-by: Michael S. Tsirkin --- include/linux/swiotlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 046bb94bd4d6..fa5122c6711e 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -5,6 +5,7 @@ #include #include #include +#include struct device; struct page; -- cgit From 7a60c2dd0f575ab14a457e99582af0ca1e072a74 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 28 Oct 2020 16:15:26 -0300 Subject: drm: Remove SCATTERLIST_MAX_SEGMENT Since commit 9a40401cfa13 ("lib/scatterlist: Do not limit max_segment to PAGE_ALIGNED values") the max_segment input to sg_alloc_table_from_pages() does not have to be any special value. The new algorithm will always create something less than what the user provides. Thus eliminate this confusing constant. - vmwgfx should use the HW capability, not mix in the OS page size for calling dma_set_max_seg_size() - i915 uses i915_sg_segment_size() both for sg_alloc_table_from_pages and for some open coded sgl construction. This doesn't change the value since rounddown(size, UINT_MAX) == SCATTERLIST_MAX_SEGMENT - drm_prime_pages_to_sg uses it as a default if max_segment is zero, UINT_MAX is fine to use directly. 
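As a sketch of the resulting pattern in drm_prime_pages_to_sg() (paraphrased from the diff below, not a verbatim quote):

	size_t max_segment = 0;

	if (dev)
		max_segment = dma_max_mapping_size(dev->dev);
	if (max_segment == 0)
		max_segment = UINT_MAX;	/* no special upper bound needed */

__sg_alloc_table_from_pages() then caps each segment at max_segment on its own, so the old PAGE_ALIGNED clamp is unnecessary.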
Cc: Gerd Hoffmann Cc: Daniel Vetter Cc: Thomas Hellstrom Cc: Qian Cai Cc: "Ursulin, Tvrtko" Suggested-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/0-v1-44733fccd781+13d-rm_scatterlist_max_jgg@nvidia.com --- drivers/gpu/drm/drm_prime.c | 4 ++-- drivers/gpu/drm/i915/i915_scatterlist.h | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 3 +-- include/linux/scatterlist.h | 6 ------ tools/testing/scatterlist/main.c | 2 +- 5 files changed, 5 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index 187b55ede62e..a7b61c2d9190 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -820,8 +820,8 @@ struct sg_table *drm_prime_pages_to_sg(struct drm_device *dev, if (dev) max_segment = dma_max_mapping_size(dev->dev); - if (max_segment == 0 || max_segment > SCATTERLIST_MAX_SEGMENT) - max_segment = SCATTERLIST_MAX_SEGMENT; + if (max_segment == 0) + max_segment = UINT_MAX; sge = __sg_alloc_table_from_pages(sg, pages, nr_pages, 0, nr_pages << PAGE_SHIFT, max_segment, diff --git a/drivers/gpu/drm/i915/i915_scatterlist.h b/drivers/gpu/drm/i915/i915_scatterlist.h index b7b59328cb76..883dd8d09d6b 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.h +++ b/drivers/gpu/drm/i915/i915_scatterlist.h @@ -112,7 +112,7 @@ static inline unsigned int i915_sg_segment_size(void) unsigned int size = swiotlb_max_segment(); if (size == 0) - return SCATTERLIST_MAX_SEGMENT; + size = UINT_MAX; size = rounddown(size, PAGE_SIZE); /* swiotlb_max_segment_size can return 1 byte when it means one page. */ diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index b3a60959b5d5..0c42d2c05f43 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -794,8 +794,7 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset) if (unlikely(ret != 0)) goto out_err0; - dma_set_max_seg_size(dev->dev, min_t(unsigned int, U32_MAX & PAGE_MASK, - SCATTERLIST_MAX_SEGMENT)); + dma_set_max_seg_size(dev->dev, U32_MAX); if (dev_priv->capabilities & SVGA_CAP_GMR2) { DRM_INFO("Max GMR ids is %u\n", diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 36c47e7e66a2..6f70572b2938 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -18,12 +18,6 @@ struct scatterlist { #endif }; -/* - * Since the above length field is an unsigned int, below we define the maximum - * length in bytes that can be stored in one scatterlist entry. - */ -#define SCATTERLIST_MAX_SEGMENT (UINT_MAX & PAGE_MASK) - /* * These macros should be used after a dma_map_sg call has been done * to get bus addresses of each of the SG entries and their lengths. 
diff --git a/tools/testing/scatterlist/main.c b/tools/testing/scatterlist/main.c index b2c7e9f7b8d3..d264bf853034 100644 --- a/tools/testing/scatterlist/main.c +++ b/tools/testing/scatterlist/main.c @@ -50,7 +50,7 @@ static void fail(struct test *test, struct sg_table *st, const char *cond) int main(void) { - const unsigned int sgmax = SCATTERLIST_MAX_SEGMENT; + const unsigned int sgmax = UINT_MAX; struct test *test, tests[] = { { -EINVAL, 1, pfn(0), PAGE_SIZE, PAGE_SIZE + 1, 1 }, { -EINVAL, 1, pfn(0), PAGE_SIZE, 0, 1 }, -- cgit From fc0021aa340af65a0a37d77be39e22aa886a6132 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Oct 2020 08:33:09 +0200 Subject: swiotlb: remove the tbl_dma_addr argument to swiotlb_tbl_map_single The tbl_dma_addr argument is used to check the DMA boundary for the allocations, and thus needs to be a dma_addr_t. swiotlb-xen instead passed a physical address, which could lead to incorrect results for strange offsets. Fix this by removing the parameter entirely and hard-coding the DMA address for io_tlb_start instead. Fixes: 91ffe4ad534a ("swiotlb-xen: introduce phys_to_dma/dma_to_phys translations") Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/iommu/intel/iommu.c | 5 ++--- drivers/xen/swiotlb-xen.c | 3 +-- include/linux/swiotlb.h | 10 +++------- kernel/dma/swiotlb.c | 16 ++++++---------- 4 files changed, 12 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8651f6d4dfa0..6b560e6f1930 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3815,9 +3815,8 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, * page aligned, we don't need to use a bounce page.
*/ if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { - tlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, aligned_size, dir, attrs); + tlb_addr = swiotlb_tbl_map_single(dev, paddr, size, + aligned_size, dir, attrs); if (tlb_addr == DMA_MAPPING_ERROR) { goto swiotlb_error; } else { diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 71ce1b7a23d1..2b385c1b4a99 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -395,8 +395,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - map = swiotlb_tbl_map_single(dev, virt_to_phys(xen_io_tlb_start), - phys, size, size, dir, attrs); + map = swiotlb_tbl_map_single(dev, phys, size, size, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 513913ff7486..3bb72266a75a 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -45,13 +45,9 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs); +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 54078f0d4c87..781b9dca197c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -445,14 +445,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { + dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); unsigned long flags; phys_addr_t tlb_addr; unsigned int nslots, stride, index, wrap; @@ -671,9 +668,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, swiotlb_force); - swiotlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, size, dir, attrs); + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, + attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; -- cgit From e0e398e204634db8fb71bd89cf2f6e3e5bd09b51 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 21 Oct 2020 21:12:15 +0200 Subject: PM: runtime: Drop runtime PM references to supplier on link removal While removing a device link, drop the supplier device's runtime PM usage counter as many times as needed to drop all of the runtime PM references to it from the consumer in addition to dropping the consumer's link count. Fixes: baa8809f6097 ("PM / runtime: Optimize the use of device links") Signed-off-by: Rafael J. 
Wysocki Cc: 5.1+ # 5.1+ Tested-by: Xiang Chen Reviewed-by: Greg Kroah-Hartman --- drivers/base/core.c | 6 ++---- drivers/base/power/runtime.c | 21 ++++++++++++++++++++- include/linux/pm_runtime.h | 4 ++-- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 78114ddac755..d661ada1518f 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -773,8 +773,7 @@ static void __device_link_del(struct kref *kref) dev_dbg(link->consumer, "Dropping the link to %s\n", dev_name(link->supplier)); - if (link->flags & DL_FLAG_PM_RUNTIME) - pm_runtime_drop_link(link->consumer); + pm_runtime_drop_link(link); list_del_rcu(&link->s_node); list_del_rcu(&link->c_node); @@ -788,8 +787,7 @@ static void __device_link_del(struct kref *kref) dev_info(link->consumer, "Dropping the link to %s\n", dev_name(link->supplier)); - if (link->flags & DL_FLAG_PM_RUNTIME) - pm_runtime_drop_link(link->consumer); + pm_runtime_drop_link(link); list_del(&link->s_node); list_del(&link->c_node); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 6f605f7820bb..6919f7fc226b 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1729,7 +1729,7 @@ void pm_runtime_new_link(struct device *dev) spin_unlock_irq(&dev->power.lock); } -void pm_runtime_drop_link(struct device *dev) +static void pm_runtime_drop_link_count(struct device *dev) { spin_lock_irq(&dev->power.lock); WARN_ON(dev->power.links_count == 0); @@ -1737,6 +1737,25 @@ void pm_runtime_drop_link(struct device *dev) spin_unlock_irq(&dev->power.lock); } +/** + * pm_runtime_drop_link - Prepare for device link removal. + * @link: Device link going away. + * + * Drop the link count of the consumer end of @link and decrement the supplier + * device's runtime PM usage counter as many times as needed to drop all of the + * PM runtime reference to it from the consumer. + */ +void pm_runtime_drop_link(struct device_link *link) +{ + if (!(link->flags & DL_FLAG_PM_RUNTIME)) + return; + + pm_runtime_drop_link_count(link->consumer); + + while (refcount_dec_not_one(&link->rpm_active)) + pm_runtime_put(link->supplier); +} + static bool pm_runtime_need_not_resume(struct device *dev) { return atomic_read(&dev->power.usage_count) <= 1 && diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 18b02dcc168e..eadc1fdebce6 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -58,7 +58,7 @@ extern void pm_runtime_clean_up_links(struct device *dev); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); -extern void pm_runtime_drop_link(struct device *dev); +extern void pm_runtime_drop_link(struct device_link *link); /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. @@ -280,7 +280,7 @@ static inline void pm_runtime_clean_up_links(struct device *dev) {} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} -static inline void pm_runtime_drop_link(struct device *dev) {} +static inline void pm_runtime_drop_link(struct device_link *link) {} #endif /* !CONFIG_PM */ -- cgit From d6e36668598154820177bfd78c1621d8e6c580a2 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 21 Oct 2020 21:13:10 +0200 Subject: PM: runtime: Drop pm_runtime_clean_up_links() After commit d12544fb2aa9 ("PM: runtime: Remove link state checks in rpm_get/put_supplier()") nothing prevents the consumer device's runtime PM from acquiring additional references to the supplier device after pm_runtime_clean_up_links() has run (or even while it is running), so calling this function from __device_release_driver() may be pointless (or even harmful). Moreover, it ignores stateless device links, so the runtime PM handling of managed and stateless device links is inconsistent because of it, so better get rid of it entirely. Fixes: d12544fb2aa9 ("PM: runtime: Remove link state checks in rpm_get/put_supplier()") Signed-off-by: Rafael J. Wysocki Cc: 5.1+ # 5.1+ Tested-by: Xiang Chen Reviewed-by: Greg Kroah-Hartman --- drivers/base/dd.c | 1 - drivers/base/power/runtime.c | 36 ------------------------------------ include/linux/pm_runtime.h | 2 -- 3 files changed, 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index b42229b74fd6..122b0372fdc9 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -1133,7 +1133,6 @@ static void __device_release_driver(struct device *dev, struct device *parent) } pm_runtime_get_sync(dev); - pm_runtime_clean_up_links(dev); driver_sysfs_remove(dev); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 6919f7fc226b..bfda153b1a41 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1642,42 +1642,6 @@ void pm_runtime_remove(struct device *dev) pm_runtime_reinit(dev); } -/** - * pm_runtime_clean_up_links - Prepare links to consumers for driver removal. - * @dev: Device whose driver is going to be removed. - * - * Check links from this device to any consumers and if any of them have active - * runtime PM references to the device, drop the usage counter of the device - * (as many times as needed). - * - * Links with the DL_FLAG_MANAGED flag unset are ignored. - * - * Since the device is guaranteed to be runtime-active at the point this is - * called, nothing else needs to be done here. - * - * Moreover, this is called after device_links_busy() has returned 'false', so - * the status of each link is guaranteed to be DL_STATE_SUPPLIER_UNBIND and - * therefore rpm_active can't be manipulated concurrently. - */ -void pm_runtime_clean_up_links(struct device *dev) -{ - struct device_link *link; - int idx; - - idx = device_links_read_lock(); - - list_for_each_entry_rcu(link, &dev->links.consumers, s_node, - device_links_read_lock_held()) { - if (!(link->flags & DL_FLAG_MANAGED)) - continue; - - while (refcount_dec_not_one(&link->rpm_active)) - pm_runtime_put_noidle(dev); - } - - device_links_read_unlock(idx); -} - /** * pm_runtime_get_suppliers - Resume and reference-count supplier devices. * @dev: Consumer device. 
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index eadc1fdebce6..4b708f4e8eed 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -54,7 +54,6 @@ extern u64 pm_runtime_autosuspend_expiration(struct device *dev); extern void pm_runtime_update_max_time_suspended(struct device *dev, s64 delta_ns); extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); -extern void pm_runtime_clean_up_links(struct device *dev); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); @@ -276,7 +275,6 @@ static inline u64 pm_runtime_autosuspend_expiration( struct device *dev) { return 0; } static inline void pm_runtime_set_memalloc_noio(struct device *dev, bool enable){} -static inline void pm_runtime_clean_up_links(struct device *dev) {} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} -- cgit From 56a7ff75cd08987812209971e319f78156ea2bb1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 22 Oct 2020 13:57:45 +0200 Subject: cpufreq: Drop restore_freq from struct cpufreq_policy The restore_freq field in struct cpufreq_policy is only used by __target_index() in one place and a local variable in that function may as well be used instead of it, so drop it and modify __target_index() accordingly. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 10 +++++----- include/linux/cpufreq.h | 5 ----- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 336b5e94cbc8..f2d96175f62d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2123,7 +2123,7 @@ static int __target_intermediate(struct cpufreq_policy *policy, static int __target_index(struct cpufreq_policy *policy, int index) { struct cpufreq_freqs freqs = {.old = policy->cur, .flags = 0}; - unsigned int intermediate_freq = 0; + unsigned int restore_freq, intermediate_freq = 0; unsigned int newfreq = policy->freq_table[index].frequency; int retval = -EINVAL; bool notify; @@ -2131,6 +2131,9 @@ static int __target_index(struct cpufreq_policy *policy, int index) if (newfreq == policy->cur) return 0; + /* Save last value to restore later on errors */ + restore_freq = policy->cur; + notify = !(cpufreq_driver->flags & CPUFREQ_ASYNC_NOTIFICATION); if (notify) { /* Handle switching to intermediate frequency */ @@ -2168,7 +2171,7 @@ static int __target_index(struct cpufreq_policy *policy, int index) */ if (unlikely(retval && intermediate_freq)) { freqs.old = intermediate_freq; - freqs.new = policy->restore_freq; + freqs.new = restore_freq; cpufreq_freq_transition_begin(policy, &freqs); cpufreq_freq_transition_end(policy, &freqs, 0); } @@ -2203,9 +2206,6 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, !(cpufreq_driver->flags & CPUFREQ_NEED_UPDATE_LIMITS)) return 0; - /* Save last value to restore later on errors */ - policy->restore_freq = policy->cur; - if (cpufreq_driver->target) return cpufreq_driver->target(policy, target_freq, relation); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1eaa04f1bae6..9779a6cd8baa 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -65,7 +65,6 @@ struct cpufreq_policy { unsigned int max; /* in kHz */ 
unsigned int cur; /* in kHz, only needed if cpufreq * governors are used */ - unsigned int restore_freq; /* = policy->cur before transition */ unsigned int suspend_freq; /* freq to set during suspend */ unsigned int policy; /* see above */ @@ -308,10 +307,6 @@ struct cpufreq_driver { /* define one out of two */ int (*setpolicy)(struct cpufreq_policy *policy); - /* - * On failure, should always restore frequency to policy->restore_freq - * (i.e. old freq). - */ int (*target)(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation); /* Deprecated */ -- cgit From b000d5cb954fe25ac1ea929ae6da321033ace927 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 13 Oct 2020 10:18:04 +0200 Subject: ima: defer arch_ima_get_secureboot() call to IMA init time Chester reports that it is necessary to introduce a new way to pass the EFI secure boot status between the EFI stub and the core kernel on ARM systems. The usual way of obtaining this information is by checking the SecureBoot and SetupMode EFI variables, but this can only be done after the EFI variable workqueue is created, which occurs in a subsys_initcall(), whereas arch_ima_get_secureboot() is called much earlier by the IMA framework. However, the IMA framework itself is started as a late_initcall, and the only reason the call to arch_ima_get_secureboot() occurs so early is because it happens in the context of a __setup() callback that parses the ima_appraise= command line parameter. So let's refactor this code a little bit, by using a core_param() callback to capture the command line argument, and deferring any reasoning based on its contents to the IMA init routine. Cc: Chester Lin Cc: Dmitry Kasatkin Cc: James Morris Cc: "Serge E. Hallyn" Link: https://lore.kernel.org/linux-arm-kernel/20200904072905.25332-2-clin@suse.com/ Signed-off-by: Ard Biesheuvel Reported-by: kernel test robot [missing core_param()] [zohar@linux.ibm.com: included linux/module.h] Tested-by: Chester Lin Signed-off-by: Mimi Zohar --- include/linux/ima.h | 6 ++++++ security/integrity/ima/ima_appraise.c | 17 +++++++++++------ security/integrity/ima/ima_main.c | 1 + 3 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 8fa7bcfb2da2..ac3d82f962f2 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -31,6 +31,12 @@ extern void ima_post_path_mknod(struct dentry *dentry); extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); +#ifdef CONFIG_IMA_APPRAISE_BOOTPARAM +extern void ima_appraise_parse_cmdline(void); +#else +static inline void ima_appraise_parse_cmdline(void) {} +#endif + #ifdef CONFIG_IMA_KEXEC extern void ima_add_kexec_buffer(struct kimage *image); #endif diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 3dd8c2e4314e..8361941ee0a1 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -5,6 +5,7 @@ * Author: * Mimi Zohar */ +#include #include #include #include @@ -16,12 +17,19 @@ #include "ima.h" -static int __init default_appraise_setup(char *str) -{ #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM +static char *ima_appraise_cmdline_default __initdata; +core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); + +void __init ima_appraise_parse_cmdline(void) +{ + const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = 
ima_appraise; + if (!str) + return; + if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) @@ -42,11 +50,8 @@ static int __init default_appraise_setup(char *str) } else { ima_appraise = appraisal_state; } -#endif - return 1; } - -__setup("ima_appraise=", default_appraise_setup); +#endif /* * is_ima_appraise_enabled - return appraise status diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 2d1af8899cab..a962b23e0429 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -904,6 +904,7 @@ static int __init init_ima(void) { int error; + ima_appraise_parse_cmdline(); ima_init_template_list(); hash_setup(CONFIG_IMA_DEFAULT_HASH); error = ima_init(); -- cgit From 24269999027e6b161c0078ad9c1557f9a1575128 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:32:57 +0200 Subject: EDAC: Fix some kernel-doc markups Kernel-doc markup should use this format: identifier - description Correct that and also fix some enums' names in the kernel-doc markup. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/1d291393ba58c7b80908a3fedf02d2f53921ffe9.1603469755.git.mchehab+huawei@kernel.org --- drivers/edac/edac_device.h | 11 +++++------ include/linux/edac.h | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h index c4c0e0bdce14..fc2d2c218064 100644 --- a/drivers/edac/edac_device.h +++ b/drivers/edac/edac_device.h @@ -258,7 +258,7 @@ extern struct edac_device_ctl_info *edac_device_alloc_ctl_info( extern void edac_device_free_ctl_info(struct edac_device_ctl_info *ctl_info); /** - * edac_device_add_device: Insert the 'edac_dev' structure into the + * edac_device_add_device - Insert the 'edac_dev' structure into the * edac_device global list and create sysfs entries associated with * edac_device structure. * @@ -271,9 +271,8 @@ extern void edac_device_free_ctl_info(struct edac_device_ctl_info *ctl_info); extern int edac_device_add_device(struct edac_device_ctl_info *edac_dev); /** - * edac_device_del_device: - * Remove sysfs entries for specified edac_device structure and - * then remove edac_device structure from global list + * edac_device_del_device - Remove sysfs entries for specified edac_device + * structure and then remove edac_device structure from global list * * @dev: * Pointer to struct &device representing the edac device @@ -286,7 +285,7 @@ extern int edac_device_add_device(struct edac_device_ctl_info *edac_dev); extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev); /** - * Log correctable errors. + * edac_device_handle_ce_count - Log correctable errors. * * @edac_dev: pointer to struct &edac_device_ctl_info * @inst_nr: number of the instance where the CE error happened @@ -299,7 +298,7 @@ void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev, const char *msg); /** - * Log uncorrectable errors. + * edac_device_handle_ue_count - Log uncorrectable errors. 
* * @edac_dev: pointer to struct &edac_device_ctl_info * @inst_nr: number of the instance where the CE error happened diff --git a/include/linux/edac.h b/include/linux/edac.h index 15e8f3d8a895..52d7487f6bd4 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -229,7 +229,7 @@ enum mem_type { #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) /** - * enum edac-type - Error Detection and Correction capabilities and mode + * enum edac_type - Error Detection and Correction capabilities and mode * @EDAC_UNKNOWN: Unknown if ECC is available * @EDAC_NONE: Doesn't support ECC * @EDAC_RESERVED: Reserved ECC type @@ -309,7 +309,7 @@ enum scrub_type { #define OP_OFFLINE 0x300 /** - * enum edac_mc_layer - memory controller hierarchy layer + * enum edac_mc_layer_type - memory controller hierarchy layer * * @EDAC_MC_LAYER_BRANCH: memory layer is named "branch" * @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel" -- cgit From f8f6ae5d077a9bdaf5cbf2ac960a5d1a04b47482 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 1 Nov 2020 17:08:00 -0800 Subject: mm: always have io_remap_pfn_range() set pgprot_decrypted() The purpose of io_remap_pfn_range() is to map IO memory, such as a memory mapped IO exposed through a PCI BAR. IO devices do not understand encryption, so this memory must always be decrypted. Automatically call pgprot_decrypted() as part of the generic implementation. This fixes a bug where enabling AMD SME causes subsystems, such as RDMA, using io_remap_pfn_range() to expose BAR pages to user space to fail. The CPU will encrypt access to those BAR pages instead of passing unencrypted IO directly to the device. Places not mapping IO should use remap_pfn_range(). Fixes: aca20d546214 ("x86/mm: Add support to make use of Secure Memory Encryption") Signed-off-by: Jason Gunthorpe Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Tom Lendacky Cc: Thomas Gleixner Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Brijesh Singh Cc: Jonathan Corbet Cc: Dmitry Vyukov Cc: "Dave Young" Cc: Alexander Potapenko Cc: Konrad Rzeszutek Wilk Cc: Andy Lutomirski Cc: Larry Woodman Cc: Matt Fleming Cc: Ingo Molnar Cc: "Michael S. 
Tsirkin" Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Toshimitsu Kani Cc: Link: https://lkml.kernel.org/r/0-v1-025d64bdf6c4+e-amd_sme_fix_jgg@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++++ include/linux/pgtable.h | 4 ---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ef360fe70aaf..db6ae4d3fb4e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2759,6 +2759,15 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +#ifndef io_remap_pfn_range +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); +} +#endif + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 38c33eabea89..71125a4676c4 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1427,10 +1427,6 @@ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ -#ifndef io_remap_pfn_range -#define io_remap_pfn_range remap_pfn_range -#endif - #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 -- cgit From 9f14cb030d987ae5e201e88cd345c6d772bcce51 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 16 Sep 2020 11:45:22 -0700 Subject: sched: Un-hide lockdep_tasklist_lock_is_held() for !LOCKDEP Currently, variables used only within lockdep expressions are flagged as unused, requiring that these variables' declarations be decorated with either #ifdef or __maybe_unused. This results in ugly code. This commit therefore causes the lockdep_tasklist_lock_is_held() function to be visible even when lockdep is not enabled, thus removing the need for these decorations. This approach further relies on dead-code elimination to remove any references to functions or variables that are not available in non-lockdep kernels. Signed-off-by: Jakub Kicinski Signed-off-by: Paul E. McKenney --- include/linux/sched/task.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 85fb2f34c59b..c0f71f2e7160 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -47,9 +47,7 @@ extern spinlock_t mmlist_lock; extern union thread_union init_thread_union; extern struct task_struct init_task; -#ifdef CONFIG_PROVE_RCU extern int lockdep_tasklist_lock_is_held(void); -#endif /* #ifdef CONFIG_PROVE_RCU */ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); -- cgit From 891cd1f99dd94746f0caf5eea0121079178ee9bf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 16 Sep 2020 11:45:23 -0700 Subject: rcu: Un-hide lockdep maps for !LOCKDEP Currently, variables used only within lockdep expressions are flagged as unused, requiring that these variables' declarations be decorated with either #ifdef or __maybe_unused. This results in ugly code. This commit therefore causes the RCU lock maps to be visible even when lockdep is not enabled, thus removing the need for these decorations. This approach further relies on dead-code elimination to remove any references to functions or variables that are not available in non-lockdep kernels. Signed-off-by: Jakub Kicinski Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 9 +++++---- include/linux/rcupdate_trace.h | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6cdd0152c253..f9533bbcbb36 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -241,6 +241,11 @@ bool rcu_lockdep_current_cpu_online(void); static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ +extern struct lockdep_map rcu_lock_map; +extern struct lockdep_map rcu_bh_lock_map; +extern struct lockdep_map rcu_sched_lock_map; +extern struct lockdep_map rcu_callback_map; + #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void rcu_lock_acquire(struct lockdep_map *map) @@ -253,10 +258,6 @@ static inline void rcu_lock_release(struct lockdep_map *map) lock_release(map, _THIS_IP_); } -extern struct lockdep_map rcu_lock_map; -extern struct lockdep_map rcu_bh_lock_map; -extern struct lockdep_map rcu_sched_lock_map; -extern struct lockdep_map rcu_callback_map; int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index 3e7919fc5f34..86c8f6c98412 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -11,10 +11,10 @@ #include #include -#ifdef CONFIG_DEBUG_LOCK_ALLOC - extern struct lockdep_map rcu_trace_lock_map; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static inline int rcu_read_lock_trace_held(void) { return lock_is_held(&rcu_trace_lock_map); -- cgit From cd539cff9470fe1dacf0bf5ab3f54f37b854d6fc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 16 Sep 2020 11:45:27 -0700 Subject: lockdep: Provide dummy forward declaration of *_is_held() helpers When CONFIG_LOCKDEP is not set, lock_is_held() and lockdep_is_held() are not declared or defined. This forces all callers to use #ifdefs around these checks. Recent RCU changes added a lot of lockdep_is_held() calls inside rcu_dereference_protected(). This macro hides its argument on !LOCKDEP builds, which can lead to false-positive unused-variable warnings. This commit therefore provides forward declarations of lock_is_held() and lockdep_is_held() but without defining them. This way callers (including those internal to RCU) can keep them visible to the compiler on !LOCKDEP builds and instead depend on dead code elimination to remove the references, which in turn prevents the linker from complaining about the lack of the corresponding function definitions. [ paulmck: Apply Peter Zijlstra feedback on "extern". ] Signed-off-by: Jakub Kicinski -- CC: peterz@infradead.org CC: mingo@redhat.com CC: will@kernel.org Signed-off-by: Paul E. McKenney --- include/linux/lockdep.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f5594879175a..ccc3ce66c7e0 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -375,6 +375,12 @@ static inline void lockdep_unregister_key(struct lock_class_key *key) #define lockdep_depth(tsk) (0) +/* + * Dummy forward declarations, allow users to write less ifdef-y code + * and depend on dead code elimination. 
+ */ +extern int lock_is_held(const void *); +extern int lockdep_is_held(const void *); #define lockdep_is_held_type(l, r) (1) #define lockdep_assert_held(l) do { (void)(l); } while (0) -- cgit From 65e9eb1ccfe56b41a0d8bfec651ea014968413cb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 16 Sep 2020 11:45:28 -0700 Subject: rcu: Prevent RCU_LOCKDEP_WARN() from swallowing the condition We run into an unused variable warning in the bridge code when a variable is only used inside the condition of rcu_dereference_protected(). #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) Since on builds with CONFIG_PROVE_RCU=n rcu_dereference_protected() compiles to nothing, the compiler doesn't see the variable use. This commit therefore prevents this warning by adding the condition as dead code. Signed-off-by: Jakub Kicinski -- CC: paulmck@kernel.org CC: josh@joshtriplett.org CC: rostedt@goodmis.org CC: mathieu.desnoyers@efficios.com CC: joel@joelfernandes.org CC: jiangshanlai@gmail.com Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f9533bbcbb36..de0826411311 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -328,7 +328,7 @@ static inline void rcu_preempt_sleep_check(void) { } #else /* #ifdef CONFIG_PROVE_RCU */ -#define RCU_LOCKDEP_WARN(c, s) do { } while (0) +#define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c)) #define rcu_sleep_check() do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ -- cgit From 6370cc3bbd8a0f9bf975b013781243ab147876c6 Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Thu, 29 Oct 2020 17:36:19 +0000 Subject: net: add kcov handle to skb extensions Remote KCOV coverage collection enables coverage-guided fuzzing of the code that is not reachable during normal system call execution. It is especially helpful for fuzzing networking subsystems, where it is common to perform packet handling in separate work queues even for the packets that originated directly from the user space. Enable coverage-guided frame injection by adding a kcov remote handle to skb extensions. Default initialization in __alloc_skb and __build_skb_around ensures that no socket buffer that was generated during a system call will be missed. Code that is of interest and that performs packet processing should be annotated with kcov_remote_start()/kcov_remote_stop(). An alternative approach is to determine kcov_handle solely on the basis of the device/interface that received the specific socket buffer. However, in this case it would be impossible to distinguish between packets that originated from normal background network processes and packets that were intentionally injected from the user space.
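A rough sketch of the annotation this enables (illustrative only, not part of this patch): process_skb() is a hypothetical deferred packet handler, and kcov_remote_start_common() is assumed to be available as the usual entry point for the common-subsystem remote handles stored by kcov_common_handle():

	#include <linux/kcov.h>
	#include <linux/skbuff.h>

	static void process_skb(struct sk_buff *skb)
	{
		/* Attribute coverage collected here to the task that
		 * created the skb, via the handle saved at allocation time.
		 */
		kcov_remote_start_common(skb_get_kcov_handle(skb));
		/* ... packet handling outside the originating syscall ... */
		kcov_remote_stop();
	}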
Signed-off-by: Aleksandr Nogikh Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 33 +++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 1 + net/core/skbuff.c | 11 +++++++++++ 3 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a828cf99c521..2d01b2bbb746 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4150,6 +4150,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MPTCP) SKB_EXT_MPTCP, +#endif +#if IS_ENABLED(CONFIG_KCOV) + SKB_EXT_KCOV_HANDLE, #endif SKB_EXT_NUM, /* must be last */ }; @@ -4605,5 +4608,35 @@ static inline void skb_reset_redirect(struct sk_buff *skb) #endif } +#ifdef CONFIG_KCOV +static inline void skb_set_kcov_handle(struct sk_buff *skb, + const u64 kcov_handle) +{ + /* Do not allocate skb extensions only to set kcov_handle to zero + * (as it is zero by default). However, if the extensions are + * already allocated, update kcov_handle anyway since + * skb_set_kcov_handle can be called to zero a previously set + * value. + */ + if (skb_has_extensions(skb) || kcov_handle) { + u64 *kcov_handle_ptr = skb_ext_add(skb, SKB_EXT_KCOV_HANDLE); + + if (kcov_handle_ptr) + *kcov_handle_ptr = kcov_handle; + } +} + +static inline u64 skb_get_kcov_handle(struct sk_buff *skb) +{ + u64 *kcov_handle = skb_ext_find(skb, SKB_EXT_KCOV_HANDLE); + + return kcov_handle ? *kcov_handle : 0; +} +#else +static inline void skb_set_kcov_handle(struct sk_buff *skb, + const u64 kcov_handle) { } +static inline u64 skb_get_kcov_handle(struct sk_buff *skb) { return 0; } +#endif /* CONFIG_KCOV */ + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d7a7bc3b6098..d171a032db78 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1870,6 +1870,7 @@ config KCOV depends on CC_HAS_SANCOV_TRACE_PC || GCC_PLUGINS select DEBUG_FS select GCC_PLUGIN_SANCOV if !CC_HAS_SANCOV_TRACE_PC + select SKB_EXTENSIONS help KCOV exposes kernel code coverage information in a form suitable for coverage-guided fuzzing (randomized testing). 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1ba8f0163744..c5e6c0b83a92 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -249,6 +249,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, fclones->skb2.fclone = SKB_FCLONE_CLONE; } + + skb_set_kcov_handle(skb, kcov_common_handle()); + out: return skb; nodata: @@ -282,6 +285,8 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb, memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); + skb_set_kcov_handle(skb, kcov_common_handle()); + return skb; } @@ -4203,6 +4208,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MPTCP) [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), #endif +#if IS_ENABLED(CONFIG_KCOV) + [SKB_EXT_KCOV_HANDLE] = SKB_EXT_CHUNKSIZEOF(u64), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -4219,6 +4227,9 @@ static __always_inline unsigned int skb_ext_total_length(void) #endif #if IS_ENABLED(CONFIG_MPTCP) skb_ext_type_len[SKB_EXT_MPTCP] + +#endif +#if IS_ENABLED(CONFIG_KCOV) + skb_ext_type_len[SKB_EXT_KCOV_HANDLE] + #endif 0; } -- cgit From 31909e3330c8d65e9a928d40833ebd5feb4f64e6 Mon Sep 17 00:00:00 2001 From: Michael Weiß Date: Tue, 27 Oct 2020 21:42:56 +0100 Subject: timens: additional helper functions for boottime offset handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide functions for time_namespace to subtract the boottime offset from a timespec64 as well as to apply the boottime offset to u64 types in nanoseconds. Signed-off-by: Michael Weiß Reviewed-by: Andrei Vagin Acked-by: Thomas Gleixner Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20201027204258.7869-2-michael.weiss@aisec.fraunhofer.de --- include/linux/time_namespace.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 5b6031385db0..68770ac9ba89 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -77,6 +77,20 @@ static inline void timens_add_boottime(struct timespec64 *ts) *ts = timespec64_add(*ts, ns_offsets->boottime); } +static inline u64 timens_add_boottime_ns(u64 nsec) +{ + struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; + + return nsec + timespec64_to_ns(&ns_offsets->boottime); +} + +static inline void timens_sub_boottime(struct timespec64 *ts) +{ + struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; + + *ts = timespec64_sub(*ts, ns_offsets->boottime); +} + ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *offsets); @@ -130,6 +144,14 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, static inline void timens_add_monotonic(struct timespec64 *ts) { } static inline void timens_add_boottime(struct timespec64 *ts) { } + +static inline u64 timens_add_boottime_ns(u64 nsec) +{ + return nsec; +} + +static inline void timens_sub_boottime(struct timespec64 *ts) { } + static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) { return tim; -- cgit From 5190db9fdd20fa5ba6084c98a3bc71c2fdf6a871 Mon Sep 17 00:00:00 2001 From: Roman Anufriev Date: Sun, 18 Oct 2020 05:56:54 +0300 Subject: fs/quota: update quota state flags scheme with project quota flags Current quota state flags scheme doesn't include project quota and thus shows all flags after DQUOT_USAGE_ENABLED wrong. 
Fix this and also add DQUOT_NOLIST_DIRTY to the scheme. Link: https://lore.kernel.org/r/1602989814-28922-1-git-send-email-dotdot@yandex-team.ru Signed-off-by: Roman Anufriev Signed-off-by: Jan Kara --- include/linux/quota.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/quota.h b/include/linux/quota.h index 27aab84fcbaa..18ebd39c9487 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -448,17 +448,18 @@ struct quota_format_type { }; /** - * Quota state flags - they actually come in two flavors - for users and groups. + * Quota state flags - they come in three flavors - for users, groups and projects. * * Actual typed flags layout: - * USRQUOTA GRPQUOTA - * DQUOT_USAGE_ENABLED 0x0001 0x0002 - * DQUOT_LIMITS_ENABLED 0x0004 0x0008 - * DQUOT_SUSPENDED 0x0010 0x0020 + * USRQUOTA GRPQUOTA PRJQUOTA + * DQUOT_USAGE_ENABLED 0x0001 0x0002 0x0004 + * DQUOT_LIMITS_ENABLED 0x0008 0x0010 0x0020 + * DQUOT_SUSPENDED 0x0040 0x0080 0x0100 * * Following bits are used for non-typed flags: - * DQUOT_QUOTA_SYS_FILE 0x0040 - * DQUOT_NEGATIVE_USAGE 0x0080 + * DQUOT_QUOTA_SYS_FILE 0x0200 + * DQUOT_NEGATIVE_USAGE 0x0400 + * DQUOT_NOLIST_DIRTY 0x0800 */ enum { _DQUOT_USAGE_ENABLED = 0, /* Track disk usage for users */ -- cgit From 286228d382ba6320f04fa2e7c6fc8d4d92e428f4 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 18 Dec 2019 09:39:02 +0100 Subject: can: can_create_echo_skb(): fix echo skb generation: always use skb_clone() All user space generated SKBs are owned by a socket (unless injected into the kernel via AF_PACKET). If a socket is closed, all associated skbs will be cleaned up. This leads to a problem when a CAN driver calls can_put_echo_skb() on an unshared SKB. If the socket is closed prior to the TX complete handler, can_get_echo_skb() and the subsequent delivery of the echo SKB to all registered callbacks, an SKB with a refcount of 0 is delivered. To avoid the problem, in can_create_echo_skb() the original SKB is now always cloned, regardless of whether it is shared or not. If the process exits, it can now safely discard its SKBs, without disturbing the delivery of the echo SKB. The problem shows up in the j1939 stack, when it clones the incoming skb, which detects the already 0 refcount. We can easily reproduce this with the following example: testj1939 -B -r can0: & cansend can0 1823ff40#0123 WARNING: CPU: 0 PID: 293 at lib/refcount.c:25 refcount_warn_saturate+0x108/0x174 refcount_t: addition on 0; use-after-free.
Modules linked in: coda_vpu imx_vdoa videobuf2_vmalloc dw_hdmi_ahb_audio vcan CPU: 0 PID: 293 Comm: cansend Not tainted 5.5.0-rc6-00376-g9e20dcb7040d #1 Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) Backtrace: [] (dump_backtrace) from [] (show_stack+0x20/0x24) [] (show_stack) from [] (dump_stack+0x8c/0xa0) [] (dump_stack) from [] (__warn+0xe0/0x108) [] (__warn) from [] (warn_slowpath_fmt+0xa8/0xcc) [] (warn_slowpath_fmt) from [] (refcount_warn_saturate+0x108/0x174) [] (refcount_warn_saturate) from [] (j1939_can_recv+0x20c/0x210) [] (j1939_can_recv) from [] (can_rcv_filter+0xb4/0x268) [] (can_rcv_filter) from [] (can_receive+0xb0/0xe4) [] (can_receive) from [] (can_rcv+0x48/0x98) [] (can_rcv) from [] (__netif_receive_skb_one_core+0x64/0x88) [] (__netif_receive_skb_one_core) from [] (__netif_receive_skb+0x38/0x94) [] (__netif_receive_skb) from [] (netif_receive_skb_internal+0x64/0xf8) [] (netif_receive_skb_internal) from [] (netif_receive_skb+0x34/0x19c) [] (netif_receive_skb) from [] (can_rx_offload_napi_poll+0x58/0xb4) Fixes: 0ae89beb283a ("can: add destructor for self generated skbs") Signed-off-by: Oleksij Rempel Link: http://lore.kernel.org/r/20200124132656.22156-1-o.rempel@pengutronix.de Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 900b9f4e0605..fc61cf4eff1c 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -61,21 +61,17 @@ static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk) */ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) { - if (skb_shared(skb)) { - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + struct sk_buff *nskb; - if (likely(nskb)) { - can_skb_set_owner(nskb, skb->sk); - consume_skb(skb); - return nskb; - } else { - kfree_skb(skb); - return NULL; - } + nskb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!nskb)) { + kfree_skb(skb); + return NULL; } - /* we can assume to have an unshared skb with proper owner */ - return skb; + can_skb_set_owner(nskb, skb->sk); + consume_skb(skb); + return nskb; } #endif /* !_CAN_SKB_H */ -- cgit From 2e4ef10f58502323ea470bc30ba84d5ddd4e77f0 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 1 Nov 2020 13:17:07 +0000 Subject: net: add GSO UDP L4 and GSO fraglists to the list of software-backed types Commit e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.") and commit 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") made UDP L4 and fraglisted GRO/GSO fully supported by the software fallback mode. We can safely add them to NETIF_F_GSO_SOFTWARE to allow logical/virtual netdevs to forward these types of skbs up to the real drivers. Signed-off-by: Alexander Lobakin Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/linux/netdev_features.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 0b17c4322b09..934de56644e7 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -207,8 +207,8 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) NETIF_F_FSO) /* List of features with software fallbacks. 
*/ -#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | \ - NETIF_F_GSO_SCTP) +#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_GSO_SCTP | \ + NETIF_F_GSO_UDP_L4 | NETIF_F_GSO_FRAGLIST) /* * If one device supports one of these features, then enable them -- cgit From 179dfb954790410f65605f1c479c029c2cd73c55 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 26 Oct 2020 11:44:21 +0100 Subject: USB: serial: remove write wait queue The digi_acceleport driver is the only driver still using the port write wake queue so move it to that driver's port data. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold --- drivers/usb/serial/digi_acceleport.c | 12 ++++++------ include/linux/usb/serial.h | 2 -- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c index 968e08cdcb09..0ecd5316d85f 100644 --- a/drivers/usb/serial/digi_acceleport.c +++ b/drivers/usb/serial/digi_acceleport.c @@ -197,6 +197,7 @@ struct digi_port { int dp_throttle_restart; wait_queue_head_t dp_flush_wait; wait_queue_head_t dp_close_wait; /* wait queue for close */ + wait_queue_head_t write_wait; struct usb_serial_port *dp_port; }; @@ -381,7 +382,7 @@ static int digi_write_oob_command(struct usb_serial_port *port, while (count > 0) { while (oob_priv->dp_write_urb_in_use) { cond_wait_interruptible_timeout_irqrestore( - &oob_port->write_wait, DIGI_RETRY_TIMEOUT, + &oob_priv->write_wait, DIGI_RETRY_TIMEOUT, &oob_priv->dp_port_lock, flags); if (interruptible && signal_pending(current)) return -EINTR; @@ -444,7 +445,7 @@ static int digi_write_inb_command(struct usb_serial_port *port, while (priv->dp_write_urb_in_use && time_before(jiffies, timeout)) { cond_wait_interruptible_timeout_irqrestore( - &port->write_wait, DIGI_RETRY_TIMEOUT, + &priv->write_wait, DIGI_RETRY_TIMEOUT, &priv->dp_port_lock, flags); if (signal_pending(current)) return -EINTR; @@ -523,7 +524,7 @@ static int digi_set_modem_signals(struct usb_serial_port *port, while (oob_priv->dp_write_urb_in_use) { spin_unlock(&port_priv->dp_port_lock); cond_wait_interruptible_timeout_irqrestore( - &oob_port->write_wait, DIGI_RETRY_TIMEOUT, + &oob_priv->write_wait, DIGI_RETRY_TIMEOUT, &oob_priv->dp_port_lock, flags); if (interruptible && signal_pending(current)) return -EINTR; @@ -983,7 +984,7 @@ static void digi_write_bulk_callback(struct urb *urb) dev_dbg(&port->dev, "digi_write_bulk_callback: oob callback\n"); spin_lock_irqsave(&priv->dp_port_lock, flags); priv->dp_write_urb_in_use = 0; - wake_up_interruptible(&port->write_wait); + wake_up_interruptible(&priv->write_wait); spin_unlock_irqrestore(&priv->dp_port_lock, flags); return; } @@ -1216,10 +1217,9 @@ static int digi_port_init(struct usb_serial_port *port, unsigned port_num) init_waitqueue_head(&priv->dp_transmit_idle_wait); init_waitqueue_head(&priv->dp_flush_wait); init_waitqueue_head(&priv->dp_close_wait); + init_waitqueue_head(&priv->write_wait); priv->dp_port = port; - init_waitqueue_head(&port->write_wait); - usb_set_serial_port_data(port, priv); return 0; diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 8e67eff9039f..1c09b922f8b0 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -62,7 +62,6 @@ * @bulk_out_endpointAddress: endpoint address for the bulk out pipe for this * port. * @flags: usb serial port flags - * @write_wait: a wait_queue_head_t used by the port. * @work: work queue entry for the line discipline waking up. 
* * @dev: pointer to the serial device * @@ -108,7 +107,6 @@ struct usb_serial_port { int tx_bytes; unsigned long flags; - wait_queue_head_t write_wait; struct work_struct work; unsigned long sysrq; /* sysrq timeout */ struct device dev; -- cgit From 2374a045263b47f763571ec87ad7c65ea505188a Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 29 Oct 2020 12:32:18 +0100 Subject: vt: keyboard, remove unneeded func_* declarations Now that we removed the ugly handling of func_buf, remove unneeded declarations of func_* variables. The definitions are in the generated defkeymap.c_shipped, so we cannot really remove them (but we would love to). Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20201029113222.32640-13-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/kbd_kern.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h index bb2246c8ec13..82f29aa35062 100644 --- a/include/linux/kbd_kern.h +++ b/include/linux/kbd_kern.h @@ -9,9 +9,6 @@ extern struct tasklet_struct keyboard_tasklet; extern char *func_table[MAX_NR_FUNC]; -extern char func_buf[]; -extern char *funcbufptr; -extern int funcbufsize, funcbufleft; /* * kbd->xxx contains the VC-local things (flag settings etc..) -- cgit From 763e4cdc0f6d5cea45c896fef67f7be4bdefcca7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 29 Oct 2020 14:30:48 -0700 Subject: iomap: support partial page discard on writeback block mapping failure iomap writeback mapping failure only calls into ->discard_page() if the current page has not been added to the ioend. Accordingly, the XFS callback assumes a full page discard and invalidation. This is problematic for sub-page block size filesystems where some portion of a page might have been mapped successfully before a failure to map a delalloc block occurs. ->discard_page() is not called in that error scenario and the bio is explicitly failed by iomap via the error return from ->prepare_ioend(). As a result, the filesystem leaks delalloc blocks and corrupts the filesystem block counters. Since XFS is the only user of ->discard_page(), tweak the semantics to invoke the callback unconditionally on mapping errors and provide the file offset that failed to map. Update xfs_discard_page() to discard the corresponding portion of the file and pass the range along to iomap_invalidatepage(). The latter already properly handles both full and sub-page scenarios by not changing any iomap or page state on sub-page invalidations. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 15 ++++++++------- fs/xfs/xfs_aops.c | 14 ++++++++------ include/linux/iomap.h | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8180061b9e16..e4ea1f9f94d0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1382,14 +1382,15 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * appropriately. */ if (unlikely(error)) { + /* + * Let the filesystem know what portion of the current page + * failed to map. If the page hasn't been added to the ioend, it + * won't be affected by I/O completion and we must unlock it + * now.
+ */ + if (wpc->ops->discard_page) + wpc->ops->discard_page(page, file_offset); if (!count) { - /* - * If the current page hasn't been added to ioend, it - * won't be affected by I/O completions and we must - * discard and unlock it right here. - */ - if (wpc->ops->discard_page) - wpc->ops->discard_page(page); ClearPageUptodate(page); unlock_page(page); goto done; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 55d126d4e096..5bf37afae5e9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -527,13 +527,15 @@ xfs_prepare_ioend( */ static void xfs_discard_page( - struct page *page) + struct page *page, + loff_t fileoff) { struct inode *inode = page->mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - loff_t offset = page_offset(page); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); + unsigned int pageoff = offset_in_page(fileoff); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff); + xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff); int error; if (XFS_FORCED_SHUTDOWN(mp)) @@ -541,14 +543,14 @@ xfs_discard_page( xfs_alert_ratelimited(mp, "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", - page, ip->i_ino, offset); + page, ip->i_ino, fileoff); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_page(inode, page)); + i_blocks_per_page(inode, page) - pageoff_fsb); if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: - iomap_invalidatepage(page, 0, PAGE_SIZE); + iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff); } static const struct iomap_writeback_ops xfs_writeback_ops = { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 172b3397a1a3..5bd3cac4df9c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -221,7 +221,7 @@ struct iomap_writeback_ops { * Optional, allows the file system to discard state on a page where * we failed to submit any I/O. */ - void (*discard_page)(struct page *page); + void (*discard_page)(struct page *page, loff_t fileoff); }; struct iomap_writepage_ctx { -- cgit From fdaf083cdfb556a45c422c8998268baf1ab26829 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Oct 2020 09:37:30 -0600 Subject: io_uring: properly handle SQPOLL request cancelations Track if a given task io_uring context contains SQPOLL instances, so we can iterate those for cancelation (and request counts). This ensures that we properly wait on SQPOLL contexts, and find everything that needs canceling. 
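For context (not part of the patch), the rings in question are the ones created with IORING_SETUP_SQPOLL. A minimal userspace sketch, assuming liburing is available; make_sqpoll_ring() is a hypothetical helper:

	#include <liburing.h>

	/* Submissions are driven by a kernel SQ poll thread, so cancelation
	 * on task exit must also find requests owned by that thread.
	 */
	static int make_sqpoll_ring(struct io_uring *ring)
	{
		struct io_uring_params p = {
			.flags = IORING_SETUP_SQPOLL,
			.sq_thread_idle = 2000,	/* ms before the SQ thread idles */
		};

		return io_uring_queue_init_params(8, ring, &p);
	}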
Signed-off-by: Jens Axboe --- fs/io_uring.c | 77 ++++++++++++++++++++++++++++++++++++++++-------- include/linux/io_uring.h | 3 +- 2 files changed, 67 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index a7429c977eb3..b398394a919e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1668,7 +1668,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { + } else if (ctx->cq_overflow_flushed || + atomic_read(&req->task->io_uring->in_idle)) { /* * If we're in ring overflow flush mode, or in task cancel mode, * then we cannot store the request for later flushing, we need @@ -1838,7 +1839,7 @@ static void __io_free_req(struct io_kiocb *req) io_dismantle_req(req); percpu_counter_dec(&tctx->inflight); - if (tctx->in_idle) + if (atomic_read(&tctx->in_idle)) wake_up(&tctx->wait); put_task_struct(req->task); @@ -7695,7 +7696,8 @@ static int io_uring_alloc_task_context(struct task_struct *task) xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); tctx->last = NULL; - tctx->in_idle = 0; + atomic_set(&tctx->in_idle, 0); + tctx->sqpoll = false; io_init_identity(&tctx->__identity); tctx->identity = &tctx->__identity; task->io_uring = tctx; @@ -8598,8 +8600,11 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, { struct task_struct *task = current; - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { task = ctx->sq_data->thread; + atomic_inc(&task->io_uring->in_idle); + io_sq_thread_park(ctx->sq_data); + } io_cqring_overflow_flush(ctx, true, task, files); @@ -8607,12 +8612,23 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_run_task_work(); cond_resched(); } + + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { + atomic_dec(&task->io_uring->in_idle); + /* + * If the files that are going away are the ones in the thread + * identity, clear them out. + */ + if (task->io_uring->identity->files == files) + task->io_uring->identity->files = NULL; + io_sq_thread_unpark(ctx->sq_data); + } } /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static int io_uring_add_task_file(struct file *file) +static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) { struct io_uring_task *tctx = current->io_uring; @@ -8634,6 +8650,14 @@ static int io_uring_add_task_file(struct file *file) tctx->last = file; } + /* + * This is race safe in that the task itself is doing this, hence it + * cannot be going through the exit/cancel paths at the same time. + * This cannot be modified while exit/cancel is running. 
+ */ + if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL)) + tctx->sqpoll = true; + return 0; } @@ -8675,7 +8699,7 @@ void __io_uring_files_cancel(struct files_struct *files) unsigned long index; /* make sure overflow events are dropped */ - tctx->in_idle = true; + atomic_inc(&tctx->in_idle); xa_for_each(&tctx->xa, index, file) { struct io_ring_ctx *ctx = file->private_data; @@ -8684,6 +8708,35 @@ void __io_uring_files_cancel(struct files_struct *files) if (files) io_uring_del_task_file(file); } + + atomic_dec(&tctx->in_idle); +} + +static s64 tctx_inflight(struct io_uring_task *tctx) +{ + unsigned long index; + struct file *file; + s64 inflight; + + inflight = percpu_counter_sum(&tctx->inflight); + if (!tctx->sqpoll) + return inflight; + + /* + * If we have SQPOLL rings, then we need to iterate and find them, and + * add the pending count for those. + */ + xa_for_each(&tctx->xa, index, file) { + struct io_ring_ctx *ctx = file->private_data; + + if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_uring_task *__tctx = ctx->sqo_task->io_uring; + + inflight += percpu_counter_sum(&__tctx->inflight); + } + } + + return inflight; } /* @@ -8697,11 +8750,11 @@ void __io_uring_task_cancel(void) s64 inflight; /* make sure overflow events are dropped */ - tctx->in_idle = true; + atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = percpu_counter_sum(&tctx->inflight); + inflight = tctx_inflight(tctx); if (!inflight) break; __io_uring_files_cancel(NULL); @@ -8712,13 +8765,13 @@ void __io_uring_task_cancel(void) * If we've seen completions, retry. This avoids a race where * a completion comes in before we did prepare_to_wait(). */ - if (inflight != percpu_counter_sum(&tctx->inflight)) + if (inflight != tctx_inflight(tctx)) continue; schedule(); } while (1); finish_wait(&tctx->wait, &wait); - tctx->in_idle = false; + atomic_dec(&tctx->in_idle); } static int io_uring_flush(struct file *file, void *data) @@ -8863,7 +8916,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_sqpoll_wait_sq(ctx); submitted = to_submit; } else if (to_submit) { - ret = io_uring_add_task_file(f.file); + ret = io_uring_add_task_file(ctx, f.file); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -9092,7 +9145,7 @@ err_fd: #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; #endif - if (unlikely(io_uring_add_task_file(file))) { + if (unlikely(io_uring_add_task_file(ctx, file))) { file = ERR_PTR(-ENOMEM); goto err_fd; } diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 868364cea3b7..35b2d845704d 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -30,7 +30,8 @@ struct io_uring_task { struct percpu_counter inflight; struct io_identity __identity; struct io_identity *identity; - bool in_idle; + atomic_t in_idle; + bool sqpoll; }; #if defined(CONFIG_IO_URING) -- cgit From b6be002bcd1dd1dedb926abf3c90c794eacb77dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Nov 2020 12:53:16 -0800 Subject: x86/entry: Move nmi entry/exit into common code Lockdep state handling on NMI enter and exit is nothing specific to X86. It's not any different on other architectures. Also the extra state type is not necessary, irqentry_state_t can carry the necessary information as well. Move it to common code and extend irqentry_state_t to carry lockdep state. 
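The resulting call pattern, condensed from the converted x86 handlers in the diff below (my_nmi_handler() is a hypothetical example, not code from this patch), brackets the NMI-context work with the new generic helpers:

	#include <linux/entry-common.h>

	static void my_nmi_handler(struct pt_regs *regs)
	{
		irqentry_state_t irq_state = irqentry_nmi_enter(regs);

		instrumentation_begin();
		/* ... handle the NMI-type exception ... */
		instrumentation_end();

		irqentry_nmi_exit(regs, irq_state);
	}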
[ Ira: Make exit_rcu and lockdep a union as they are mutually exclusive between the IRQ and NMI exceptions, and add kernel documentation for struct irqentry_state_t ] Signed-off-by: Thomas Gleixner Signed-off-by: Ira Weiny Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201102205320.1458656-7-ira.weiny@intel.com --- arch/x86/entry/common.c | 34 ---------------------------------- arch/x86/include/asm/idtentry.h | 3 --- arch/x86/kernel/cpu/mce/core.c | 6 +++--- arch/x86/kernel/nmi.c | 6 +++--- arch/x86/kernel/traps.c | 13 +++++++------ include/linux/entry-common.h | 39 ++++++++++++++++++++++++++++++++++++++- kernel/entry/common.c | 36 ++++++++++++++++++++++++++++++++++++ 7 files changed, 87 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 870efeec8bda..18d8f17f755c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -209,40 +209,6 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -noinstr bool idtentry_enter_nmi(struct pt_regs *regs) -{ - bool irq_state = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - rcu_nmi_enter(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); - instrumentation_end(); - - return irq_state; -} - -noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore) -{ - instrumentation_begin(); - ftrace_nmi_exit(); - if (restore) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - } - instrumentation_end(); - - rcu_nmi_exit(); - lockdep_hardirq_exit(); - if (restore) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); -} - #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b2442eb0ac2f..247a60a47331 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -11,9 +11,6 @@ #include -bool idtentry_enter_nmi(struct pt_regs *regs); -void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state); - /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4102b866e7c0..f5c860b1a50b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1983,7 +1983,7 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { - bool irq_state; + irqentry_state_t irq_state; WARN_ON_ONCE(user_mode(regs)); @@ -1995,7 +1995,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) mce_check_crashing_cpu()) return; - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); /* * The call targets are marked noinstr, but objtool can't figure * that out because it's an indirect call. Annotate it. 
@@ -2006,7 +2006,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4bc77aaf1303..bf250a339655 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { - bool irq_state; + irqentry_state_t irq_state; /* * Re-enable NMIs right here when running as an SEV-ES guest. This might @@ -502,14 +502,14 @@ nmi_restart: this_cpu_write(nmi_dr7, local_db_save()); - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(this_cpu_read(nmi_dr7)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e19df6cde35d..e1b78829d909 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -405,7 +405,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) } #endif - idtentry_enter_nmi(regs); + irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -651,12 +651,13 @@ DEFINE_IDTENTRY_RAW(exc_int3) instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } } @@ -851,7 +852,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * includes the entry stack is excluded for everything. */ unsigned long dr7 = local_db_save(); - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* @@ -908,7 +909,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } @@ -926,7 +927,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, /* * NB: We can't easily clear DR7 here because - * idtentry_exit_to_usermode() can invoke ptrace, schedule, access + * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b9711e813ec2..1a128baf3628 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -346,8 +346,26 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs); void irqentry_exit_to_user_mode(struct pt_regs *regs); #ifndef irqentry_state +/** + * struct irqentry_state - Opaque object for exception state storage + * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the + * exit path has to invoke rcu_irq_exit(). + * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that + * lockdep state is restored correctly on exit from nmi. 
+ * + * This opaque object is filled in by the irqentry_*_enter() functions and + * must be passed back into the corresponding irqentry_*_exit() functions + * when the exception is complete. + * + * Callers of irqentry_*_[enter|exit]() must consider this structure opaque + * and all members private. Descriptions of the members are provided to aid in + * the maintenance of the irqentry_*() functions. + */ typedef struct irqentry_state { - bool exit_rcu; + union { + bool exit_rcu; + bool lockdep; + }; } irqentry_state_t; #endif @@ -407,4 +425,23 @@ void irqentry_exit_cond_resched(void); */ void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state); +/** + * irqentry_nmi_enter - Handle NMI entry + * @regs: Pointer to current's pt_regs + * + * Similar to irqentry_enter() but taking care of the NMI constraints. + */ +irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs); + +/** + * irqentry_nmi_exit - Handle return from NMI handling + * @regs: Pointer to pt_regs (NMI entry regs) + * @irq_state: Return value from matching call to irqentry_nmi_enter() + * + * Last action before returning to the low level assembly code. + * + * Counterpart to irqentry_nmi_enter(). + */ +void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state); + #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 3a1dfecc533e..bc75c114c1b3 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -405,3 +405,39 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) rcu_irq_exit(); } } + +irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) +{ + irqentry_state_t irq_state; + + irq_state.lockdep = lockdep_hardirqs_enabled(); + + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + rcu_nmi_enter(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); + instrumentation_end(); + + return irq_state; +} + +void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) +{ + instrumentation_begin(); + ftrace_nmi_exit(); + if (irq_state.lockdep) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + } + instrumentation_end(); + + rcu_nmi_exit(); + lockdep_hardirq_exit(); + if (irq_state.lockdep) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); +} -- cgit From e1ac4b2406d94eddce8ac2c5ab4235f6075a9602 Mon Sep 17 00:00:00 2001 From: Chester Lin Date: Fri, 30 Oct 2020 14:08:38 +0800 Subject: efi: generalize efi_get_secureboot Generalize the efi_get_secureboot() function so not only efistub but also other subsystems can use it. Note that the MokSbState handling is not factored out: the variable is boot time only, and so it cannot be parameterized as easily. Also, the IMA code will switch to this version in a future patch, and it does not incorporate the MokSbState exception in the first place. Note that the new efi_get_secureboot_mode() helper treats any failures to read SetupMode as setup mode being disabled.
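For illustration, a minimal sketch of how a consumer outside the EFI stub might call the generalized helper once runtime services are available. This is not part of the patch: the function name is hypothetical and the runtime-services guard is an assumption about the calling context, but efi.get_variable is a natural efi_get_variable_t-compatible getter at runtime:

	static enum efi_secureboot_mode example_get_secureboot(void)
	{
		/* Without runtime services there is no GetVariable to call */
		if (!efi_enabled(EFI_RUNTIME_SERVICES))
			return efi_secureboot_mode_unknown;

		/* Any getter matching the efi_get_variable_t signature works */
		return efi_get_secureboot_mode(efi.get_variable);
	}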
Co-developed-by: Chester Lin Signed-off-by: Chester Lin Acked-by: Mimi Zohar Signed-off-by: Ard Biesheuvel --- arch/x86/boot/compressed/Makefile | 2 +- drivers/firmware/efi/libstub/efistub.h | 2 ++ drivers/firmware/efi/libstub/secureboot.c | 41 +++++++++++-------------------- include/linux/efi.h | 23 ++++++++++++++++- 4 files changed, 40 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index ee249088cbfe..8d358a6fe6ec 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -35,7 +35,7 @@ cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse -KBUILD_CFLAGS += -ffreestanding +KBUILD_CFLAGS += -ffreestanding -fshort-wchar KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 2d7abcd99de9..b8ec29d6a74a 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -848,4 +848,6 @@ asmlinkage void __noreturn efi_enter_kernel(unsigned long entrypoint, void efi_handle_post_ebs_state(void); +enum efi_secureboot_mode efi_get_secureboot(void); + #endif diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c index 5efc524b14be..af18d86c1604 100644 --- a/drivers/firmware/efi/libstub/secureboot.c +++ b/drivers/firmware/efi/libstub/secureboot.c @@ -12,15 +12,16 @@ #include "efistub.h" -/* BIOS variables */ -static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; -static const efi_char16_t efi_SecureBoot_name[] = L"SecureBoot"; -static const efi_char16_t efi_SetupMode_name[] = L"SetupMode"; - /* SHIM variables */ static const efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID; static const efi_char16_t shim_MokSBState_name[] = L"MokSBState"; +static efi_status_t get_var(efi_char16_t *name, efi_guid_t *vendor, u32 *attr, + unsigned long *data_size, void *data) +{ + return get_efi_var(name, vendor, attr, data_size, data); +} + /* * Determine whether we're in secure boot mode. * @@ -30,26 +31,18 @@ static const efi_char16_t shim_MokSBState_name[] = L"MokSBState"; enum efi_secureboot_mode efi_get_secureboot(void) { u32 attr; - u8 secboot, setupmode, moksbstate; unsigned long size; + enum efi_secureboot_mode mode; efi_status_t status; + u8 moksbstate; - size = sizeof(secboot); - status = get_efi_var(efi_SecureBoot_name, &efi_variable_guid, - NULL, &size, &secboot); - if (status == EFI_NOT_FOUND) - return efi_secureboot_mode_disabled; - if (status != EFI_SUCCESS) - goto out_efi_err; - - size = sizeof(setupmode); - status = get_efi_var(efi_SetupMode_name, &efi_variable_guid, - NULL, &size, &setupmode); - if (status != EFI_SUCCESS) - goto out_efi_err; - - if (secboot == 0 || setupmode == 1) - return efi_secureboot_mode_disabled; + mode = efi_get_secureboot_mode(get_var); + if (mode == efi_secureboot_mode_unknown) { + efi_err("Could not determine UEFI Secure Boot status.\n"); + return efi_secureboot_mode_unknown; + } + if (mode != efi_secureboot_mode_enabled) + return mode; /* * See if a user has put the shim into insecure mode. 
If so, and if the @@ -69,8 +62,4 @@ enum efi_secureboot_mode efi_get_secureboot(void) secure_boot_enabled: efi_info("UEFI Secure Boot is enabled.\n"); return efi_secureboot_mode_enabled; - -out_efi_err: - efi_err("Could not determine UEFI Secure Boot status.\n"); - return efi_secureboot_mode_unknown; } diff --git a/include/linux/efi.h b/include/linux/efi.h index d7c0e73af2b9..1cd5d91d8ca1 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1089,7 +1089,28 @@ enum efi_secureboot_mode { efi_secureboot_mode_disabled, efi_secureboot_mode_enabled, }; -enum efi_secureboot_mode efi_get_secureboot(void); + +static inline +enum efi_secureboot_mode efi_get_secureboot_mode(efi_get_variable_t *get_var) +{ + u8 secboot, setupmode = 0; + efi_status_t status; + unsigned long size; + + size = sizeof(secboot); + status = get_var(L"SecureBoot", &EFI_GLOBAL_VARIABLE_GUID, NULL, &size, + &secboot); + if (status == EFI_NOT_FOUND) + return efi_secureboot_mode_disabled; + if (status != EFI_SUCCESS) + return efi_secureboot_mode_unknown; + + size = sizeof(setupmode); + get_var(L"SetupMode", &EFI_GLOBAL_VARIABLE_GUID, NULL, &size, &setupmode); + if (secboot == 0 || setupmode == 1) + return efi_secureboot_mode_disabled; + return efi_secureboot_mode_enabled; +} #ifdef CONFIG_RESET_ATTACK_MITIGATION void efi_enable_reset_attack_mitigation(void); -- cgit From 9d1c94a69d70f1b02bdf06b231cd16ad47ef06cd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:33:25 +0200 Subject: clk: fix a kernel-doc markup clk_get_duty_cycle -> clk_get_scaled_duty_cycle Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/b2336f3f3cdfe6e1a2d3a7a056ab7ccc7a81b945.1603469755.git.mchehab+huawei@kernel.org Signed-off-by: Stephen Boyd --- include/linux/clk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index 7fd6a1febcf4..5f8d5f4931c0 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -150,7 +150,7 @@ int clk_get_phase(struct clk *clk); int clk_set_duty_cycle(struct clk *clk, unsigned int num, unsigned int den); /** - * clk_get_duty_cycle - return the duty cycle ratio of a clock signal + * clk_get_scaled_duty_cycle - return the duty cycle ratio of a clock signal * @clk: clock signal source * @scale: scaling factor to be applied to represent the ratio as an integer * -- cgit From 2b5b95b1ff3d70a95013a45e3b5b90f1daf42348 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 14 Sep 2020 15:09:33 +0200 Subject: mm: introduce vma_set_file function v4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the new vma_set_file() function to allow changing vma->vm_file with the necessary refcount dance. v2: add more users of this. v3: add missing EXPORT_SYMBOL, rebase on mmap cleanup, add comments why we drop the reference on two occasions. v4: make it clear that changing an anonymous vma is illegal. 
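To make the refcount dance concrete, here is a minimal sketch of a driver mmap handler using the new helper; example_mmap() and backing_file are illustrative stand-ins, the real users are the driver hunks below (e.g. dmabuf->file):

	static int example_mmap(struct file *filp, struct vm_area_struct *vma)
	{
		struct file *backing_file = filp->private_data; /* stand-in */

		/* Replaces the open-coded pattern:
		 *	fput(vma->vm_file);
		 *	vma->vm_file = get_file(backing_file);
		 *
		 * vma_set_file() takes the new reference before dropping the
		 * old one, so the ordering is safe even when the old and the
		 * new file happen to be the same.
		 */
		vma_set_file(vma, backing_file);
		vma->vm_pgoff = 0;

		return 0;
	}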
Signed-off-by: Christian König Reviewed-by: Daniel Vetter (v2) Reviewed-by: Jason Gunthorpe Link: https://patchwork.freedesktop.org/patch/394773/ --- drivers/dma-buf/dma-buf.c | 3 +-- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 4 +--- drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 3 +-- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 5 +++-- drivers/gpu/drm/msm/msm_gem.c | 4 +--- drivers/gpu/drm/omapdrm/omap_gem.c | 3 +-- drivers/gpu/drm/vgem/vgem_drv.c | 3 +-- drivers/staging/android/ashmem.c | 6 +++--- include/linux/mm.h | 2 ++ mm/mmap.c | 12 ++++++++++++ 10 files changed, 26 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 3d22dff1dab9..696b4ce4cfd9 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1183,8 +1183,7 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma, return -EINVAL; /* readjust the vma */ - fput(vma->vm_file); - vma->vm_file = get_file(dmabuf->file); + vma_set_file(vma, dmabuf->file); vma->vm_pgoff = pgoff; return dmabuf->ops->mmap(dmabuf, vma); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 67d9a2b9ea6a..4132acfa11be 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -145,10 +145,8 @@ static int etnaviv_gem_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ - fput(vma->vm_file); - get_file(etnaviv_obj->base.filp); vma->vm_pgoff = 0; - vma->vm_file = etnaviv_obj->base.filp; + vma_set_file(vma, etnaviv_obj->base.filp); vma->vm_page_prot = vm_page_prot; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index 0dd477e56573..04e9c04545ad 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -114,8 +114,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct * if (ret) return ret; - fput(vma->vm_file); - vma->vm_file = get_file(obj->base.filp); + vma_set_file(vma, obj->base.filp); return 0; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 3d69e51f3e4d..ec28a6cde49b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -893,8 +893,9 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) * requires avoiding extraneous references to their filp, hence why * we prefer to use an anonymous file for their mmaps. */ - fput(vma->vm_file); - vma->vm_file = anon; + vma_set_file(vma, anon); + /* Drop the initial creation reference, the vma is now holding one. 
*/ + fput(anon); switch (mmo->mmap_type) { case I915_MMAP_TYPE_WC: diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 2e1bce7c0b19..311721ceee50 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -212,10 +212,8 @@ int msm_gem_mmap_obj(struct drm_gem_object *obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ - fput(vma->vm_file); - get_file(obj->filp); vma->vm_pgoff = 0; - vma->vm_file = obj->filp; + vma_set_file(vma, obj->filp); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index d8e09792793a..f063f5a04fb0 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -564,9 +564,8 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ - fput(vma->vm_file); vma->vm_pgoff = 0; - vma->vm_file = get_file(obj->filp); + vma_set_file(vma, obj->filp); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index fa54a6d1403d..ea0eecae5153 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -397,8 +397,7 @@ static int vgem_prime_mmap(struct drm_gem_object *obj, if (ret) return ret; - fput(vma->vm_file); - vma->vm_file = get_file(obj->filp); + vma_set_file(vma, obj->filp); vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 10b4be1f3e78..4789d36ddfd3 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -450,9 +450,9 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) vma_set_anonymous(vma); } - if (vma->vm_file) - fput(vma->vm_file); - vma->vm_file = asma->file; + vma_set_file(vma, asma->file); + /* XXX: merge this with the get_file() above if possible */ + fput(asma->file); out: mutex_unlock(&ashmem_mutex); diff --git a/include/linux/mm.h b/include/linux/mm.h index ef360fe70aaf..2b7ac36c42dd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2719,6 +2719,8 @@ static inline void vma_set_page_prot(struct vm_area_struct *vma) } #endif +void vma_set_file(struct vm_area_struct *vma, struct file *file); + #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/mm/mmap.c b/mm/mmap.c index 30a4e8412a58..b0093cc69240 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -136,6 +136,18 @@ void vma_set_page_prot(struct vm_area_struct *vma) WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } +/* + * Change backing file, only valid to use during initial VMA setup. 
+ */ +void vma_set_file(struct vm_area_struct *vma, struct file *file) +{ + /* Changing an anonymous vma with this is illegal */ + get_file(file); + swap(vma->vm_file, file); + fput(file); +} +EXPORT_SYMBOL(vma_set_file); + /* * Requires inode->i_mapping->i_mmap_rwsem */ -- cgit From a4da45dda6475816f4c8b9e0d512261991ba31e5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 28 Oct 2020 15:51:17 +0100 Subject: pinctrl: Remove hole in pinctrl_gpio_range On 64-bit platforms, pointer size and alignment are 64-bit, hence two 4-byte holes are present before the pins and gc members of the pinctrl_gpio_range structure. Get rid of these holes by moving the pins pointer. This reduces kernel size of an arm64 Rockchip kernel by ca. 512 bytes. Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20201028145117.1731876-1-geert+renesas@glider.be Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinctrl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 2aef59df93d7..70b45d28e7a9 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -51,8 +51,8 @@ struct pinctrl_pin_desc { * @id: an ID number for the chip in this range * @base: base offset of the GPIO range * @pin_base: base pin number of the GPIO range if pins == NULL - * @pins: enumeration of pins in GPIO range or NULL * @npins: number of pins in the GPIO range, including the base number + * @pins: enumeration of pins in GPIO range or NULL * @gc: an optional pointer to a gpio_chip */ struct pinctrl_gpio_range { @@ -61,8 +61,8 @@ struct pinctrl_gpio_range { unsigned int id; unsigned int base; unsigned int pin_base; - unsigned const *pins; unsigned int npins; + unsigned const *pins; struct gpio_chip *gc; }; -- cgit From e40b0b56ffdc16edce207904a7782da6a1a47162 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 5 Nov 2020 17:05:35 +0100 Subject: Revert "mm: introduce vma_set_file function v4" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel test robot is not happy with that. This reverts commit 2b5b95b1ff3d70a95013a45e3b5b90f1daf42348. 
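To illustrate the padding argument in the pinctrl_gpio_range commit above, a pahole-style sketch of the layout on a common 64-bit ABI (assumptions: 4-byte int, 8-byte pointers, struct list_head being two pointers; exact offsets may differ per architecture):

	/* Before the reordering: each pointer needs 8-byte alignment, so a
	 * 4-byte hole follows 'pin_base' and another follows 'npins'.
	 *
	 *	struct list_head node;		// offset  0, size 16
	 *	const char *name;		// offset 16, size  8
	 *	unsigned int id;		// offset 24, size  4
	 *	unsigned int base;		// offset 28, size  4
	 *	unsigned int pin_base;		// offset 32, size  4
	 *					// offset 36: 4-byte hole
	 *	unsigned const *pins;		// offset 40, size  8
	 *	unsigned int npins;		// offset 48, size  4
	 *					// offset 52: 4-byte hole
	 *	struct gpio_chip *gc;		// offset 56, size  8
	 *					// total: 64 bytes
	 *
	 * After: 'pin_base' and 'npins' pack into one naturally aligned
	 * 8-byte slot, both holes vanish, and the struct shrinks to 56
	 * bytes, which is where the quoted ~512 byte saving comes from.
	 */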
Signed-off-by: Christian König Acked-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/394773/ --- drivers/dma-buf/dma-buf.c | 3 ++- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 4 +++- drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 3 ++- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 5 ++--- drivers/gpu/drm/msm/msm_gem.c | 4 +++- drivers/gpu/drm/omapdrm/omap_gem.c | 3 ++- drivers/gpu/drm/vgem/vgem_drv.c | 3 ++- drivers/staging/android/ashmem.c | 6 +++--- include/linux/mm.h | 2 -- mm/mmap.c | 12 ------------ 10 files changed, 19 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index e63684d4cd90..282bd8b84170 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1183,7 +1183,8 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma, return -EINVAL; /* readjust the vma */ - vma_set_file(vma, dmabuf->file); + fput(vma->vm_file); + vma->vm_file = get_file(dmabuf->file); vma->vm_pgoff = pgoff; return dmabuf->ops->mmap(dmabuf, vma); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 4132acfa11be..67d9a2b9ea6a 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -145,8 +145,10 @@ static int etnaviv_gem_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ + fput(vma->vm_file); + get_file(etnaviv_obj->base.filp); vma->vm_pgoff = 0; - vma_set_file(vma, etnaviv_obj->base.filp); + vma->vm_file = etnaviv_obj->base.filp; vma->vm_page_prot = vm_page_prot; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index 04e9c04545ad..0dd477e56573 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -114,7 +114,8 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct * if (ret) return ret; - vma_set_file(vma, obj->base.filp); + fput(vma->vm_file); + vma->vm_file = get_file(obj->base.filp); return 0; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index ec28a6cde49b..3d69e51f3e4d 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -893,9 +893,8 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) * requires avoiding extraneous references to their filp, hence why * we prefer to use an anonymous file for their mmaps. */ - vma_set_file(vma, anon); - /* Drop the initial creation reference, the vma is now holding one. 
*/ - fput(anon); + fput(vma->vm_file); + vma->vm_file = anon; switch (mmo->mmap_type) { case I915_MMAP_TYPE_WC: diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 311721ceee50..2e1bce7c0b19 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -212,8 +212,10 @@ int msm_gem_mmap_obj(struct drm_gem_object *obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ + fput(vma->vm_file); + get_file(obj->filp); vma->vm_pgoff = 0; - vma_set_file(vma, obj->filp); + vma->vm_file = obj->filp; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index f063f5a04fb0..d8e09792793a 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -564,8 +564,9 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ + fput(vma->vm_file); vma->vm_pgoff = 0; - vma_set_file(vma, obj->filp); + vma->vm_file = get_file(obj->filp); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index ea0eecae5153..fa54a6d1403d 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -397,7 +397,8 @@ static int vgem_prime_mmap(struct drm_gem_object *obj, if (ret) return ret; - vma_set_file(vma, obj->filp); + fput(vma->vm_file); + vma->vm_file = get_file(obj->filp); vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 4789d36ddfd3..10b4be1f3e78 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -450,9 +450,9 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) vma_set_anonymous(vma); } - vma_set_file(vma, asma->file); - /* XXX: merge this with the get_file() above if possible */ - fput(asma->file); + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = asma->file; out: mutex_unlock(&ashmem_mutex); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2b7ac36c42dd..ef360fe70aaf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2719,8 +2719,6 @@ static inline void vma_set_page_prot(struct vm_area_struct *vma) } #endif -void vma_set_file(struct vm_area_struct *vma, struct file *file); - #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/mm/mmap.c b/mm/mmap.c index b0093cc69240..30a4e8412a58 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -136,18 +136,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } -/* - * Change backing file, only valid to use during initial VMA setup. - */ -void vma_set_file(struct vm_area_struct *vma, struct file *file) -{ - /* Changing an anonymous vma with this is illegal */ - get_file(file); - swap(vma->vm_file, file); - fput(file); -} -EXPORT_SYMBOL(vma_set_file); - /* * Requires inode->i_mapping->i_mmap_rwsem */ -- cgit From 3b20369313a486246582c8ef6ff5d1d0b9c34613 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 5 Nov 2020 15:48:51 +0800 Subject: EDAC: Add three new memory types There are {Low-Power DDR3/4, WIO2} types of memory. 
Add new entries to 'enum mem_type' and new strings to 'edac_mem_types[]' for the new types. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck --- drivers/edac/edac_mc.c | 3 +++ include/linux/edac.h | 9 +++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 01ff71f7b645..eef8724faae0 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -158,10 +158,13 @@ const char * const edac_mem_types[] = { [MEM_DDR3] = "Unbuffered-DDR3", [MEM_RDDR3] = "Registered-DDR3", [MEM_LRDDR3] = "Load-Reduced-DDR3-RAM", + [MEM_LPDDR3] = "Low-Power-DDR3-RAM", [MEM_DDR4] = "Unbuffered-DDR4", [MEM_RDDR4] = "Registered-DDR4", + [MEM_LPDDR4] = "Low-Power-DDR4-RAM", [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", [MEM_NVDIMM] = "Non-volatile-RAM", + [MEM_WIO2] = "Wide-IO-2", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index 15e8f3d8a895..8f63245f7f7c 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -175,11 +175,14 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_RDDR3: Registered DDR3 RAM * This is a variant of the DDR3 memories. * @MEM_LRDDR3: Load-Reduced DDR3 memory. + * @MEM_LPDDR3: Low-Power DDR3 memory. * @MEM_DDR4: Unbuffered DDR4 RAM * @MEM_RDDR4: Registered DDR4 RAM * This is a variant of the DDR4 memories. * @MEM_LRDDR4: Load-Reduced DDR4 memory. + * @MEM_LPDDR4: Low-Power DDR4 memory. * @MEM_NVDIMM: Non-volatile RAM + * @MEM_WIO2: Wide I/O 2. */ enum mem_type { MEM_EMPTY = 0, @@ -200,10 +203,13 @@ enum mem_type { MEM_DDR3, MEM_RDDR3, MEM_LRDDR3, + MEM_LPDDR3, MEM_DDR4, MEM_RDDR4, MEM_LRDDR4, + MEM_LPDDR4, MEM_NVDIMM, + MEM_WIO2, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -223,10 +229,13 @@ enum mem_type { #define MEM_FLAG_XDR BIT(MEM_XDR) #define MEM_FLAG_DDR3 BIT(MEM_DDR3) #define MEM_FLAG_RDDR3 BIT(MEM_RDDR3) +#define MEM_FLAG_LPDDR3 BIT(MEM_LPDDR3) #define MEM_FLAG_DDR4 BIT(MEM_DDR4) #define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) +#define MEM_FLAG_LPDDR4 BIT(MEM_LPDDR4) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) +#define MEM_FLAG_WIO2 BIT(MEM_WIO2) /** * enum edac_type - Error Detection and Correction capabilities and mode -- cgit From e4b27ebc780fa7fa951d81d241912755532568ff Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 3 Nov 2020 08:10:56 +0100 Subject: net: dsa: Add DSA driver for Hirschmann Hellcreek switches Add a basic DSA driver for Hirschmann Hellcreek switches. Those switches implement features needed for Time Sensitive Networking (TSN), such as support for the Precision Time Protocol and various shapers like the Time Aware Shaper.
This driver includes basic support for networking: * VLAN handling * FDB handling * Port statistics * STP * Phylink Signed-off-by: Kurt Kanzenbach Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/Kconfig | 2 + drivers/net/dsa/Makefile | 1 + drivers/net/dsa/hirschmann/Kconfig | 8 + drivers/net/dsa/hirschmann/Makefile | 2 + drivers/net/dsa/hirschmann/hellcreek.c | 1253 ++++++++++++++++++++ drivers/net/dsa/hirschmann/hellcreek.h | 249 ++++ include/linux/platform_data/hirschmann-hellcreek.h | 23 + 7 files changed, 1538 insertions(+) create mode 100644 drivers/net/dsa/hirschmann/Kconfig create mode 100644 drivers/net/dsa/hirschmann/Makefile create mode 100644 drivers/net/dsa/hirschmann/hellcreek.c create mode 100644 drivers/net/dsa/hirschmann/hellcreek.h create mode 100644 include/linux/platform_data/hirschmann-hellcreek.h (limited to 'include/linux') diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig index 2451f61a38e4..f6a0488589fc 100644 --- a/drivers/net/dsa/Kconfig +++ b/drivers/net/dsa/Kconfig @@ -24,6 +24,8 @@ config NET_DSA_LOOP This enables support for a fake mock-up switch chip which exercises the DSA APIs. +source "drivers/net/dsa/hirschmann/Kconfig" + config NET_DSA_LANTIQ_GSWIP tristate "Lantiq / Intel GSWIP" depends on HAS_IOMEM && NET_DSA diff --git a/drivers/net/dsa/Makefile b/drivers/net/dsa/Makefile index 4a943ccc2ca4..a84adb140a04 100644 --- a/drivers/net/dsa/Makefile +++ b/drivers/net/dsa/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_NET_DSA_VITESSE_VSC73XX) += vitesse-vsc73xx-core.o obj-$(CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM) += vitesse-vsc73xx-platform.o obj-$(CONFIG_NET_DSA_VITESSE_VSC73XX_SPI) += vitesse-vsc73xx-spi.o obj-y += b53/ +obj-y += hirschmann/ obj-y += microchip/ obj-y += mv88e6xxx/ obj-y += ocelot/ diff --git a/drivers/net/dsa/hirschmann/Kconfig b/drivers/net/dsa/hirschmann/Kconfig new file mode 100644 index 000000000000..7d189cb936e3 --- /dev/null +++ b/drivers/net/dsa/hirschmann/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +config NET_DSA_HIRSCHMANN_HELLCREEK + tristate "Hirschmann Hellcreek TSN Switch support" + depends on HAS_IOMEM + depends on NET_DSA + select NET_DSA_TAG_HELLCREEK + help + This driver adds support for Hirschmann Hellcreek TSN switches. diff --git a/drivers/net/dsa/hirschmann/Makefile b/drivers/net/dsa/hirschmann/Makefile new file mode 100644 index 000000000000..0e12e149e40f --- /dev/null +++ b/drivers/net/dsa/hirschmann/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_NET_DSA_HIRSCHMANN_HELLCREEK) += hellcreek.o diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c new file mode 100644 index 000000000000..a4965a23db20 --- /dev/null +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -0,0 +1,1253 @@ +// SPDX-License-Identifier: (GPL-2.0 or MIT) +/* + * DSA driver for: + * Hirschmann Hellcreek TSN switch. 
+ * + * Copyright (C) 2019,2020 Linutronix GmbH + * Author Kurt Kanzenbach + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hellcreek.h" + +static const struct hellcreek_counter hellcreek_counter[] = { + { 0x00, "RxFiltered", }, + { 0x01, "RxOctets1k", }, + { 0x02, "RxVTAG", }, + { 0x03, "RxL2BAD", }, + { 0x04, "RxOverloadDrop", }, + { 0x05, "RxUC", }, + { 0x06, "RxMC", }, + { 0x07, "RxBC", }, + { 0x08, "RxRS<64", }, + { 0x09, "RxRS64", }, + { 0x0a, "RxRS65_127", }, + { 0x0b, "RxRS128_255", }, + { 0x0c, "RxRS256_511", }, + { 0x0d, "RxRS512_1023", }, + { 0x0e, "RxRS1024_1518", }, + { 0x0f, "RxRS>1518", }, + { 0x10, "TxTailDropQueue0", }, + { 0x11, "TxTailDropQueue1", }, + { 0x12, "TxTailDropQueue2", }, + { 0x13, "TxTailDropQueue3", }, + { 0x14, "TxTailDropQueue4", }, + { 0x15, "TxTailDropQueue5", }, + { 0x16, "TxTailDropQueue6", }, + { 0x17, "TxTailDropQueue7", }, + { 0x18, "RxTrafficClass0", }, + { 0x19, "RxTrafficClass1", }, + { 0x1a, "RxTrafficClass2", }, + { 0x1b, "RxTrafficClass3", }, + { 0x1c, "RxTrafficClass4", }, + { 0x1d, "RxTrafficClass5", }, + { 0x1e, "RxTrafficClass6", }, + { 0x1f, "RxTrafficClass7", }, + { 0x21, "TxOctets1k", }, + { 0x22, "TxVTAG", }, + { 0x23, "TxL2BAD", }, + { 0x25, "TxUC", }, + { 0x26, "TxMC", }, + { 0x27, "TxBC", }, + { 0x28, "TxTS<64", }, + { 0x29, "TxTS64", }, + { 0x2a, "TxTS65_127", }, + { 0x2b, "TxTS128_255", }, + { 0x2c, "TxTS256_511", }, + { 0x2d, "TxTS512_1023", }, + { 0x2e, "TxTS1024_1518", }, + { 0x2f, "TxTS>1518", }, + { 0x30, "TxTrafficClassOverrun0", }, + { 0x31, "TxTrafficClassOverrun1", }, + { 0x32, "TxTrafficClassOverrun2", }, + { 0x33, "TxTrafficClassOverrun3", }, + { 0x34, "TxTrafficClassOverrun4", }, + { 0x35, "TxTrafficClassOverrun5", }, + { 0x36, "TxTrafficClassOverrun6", }, + { 0x37, "TxTrafficClassOverrun7", }, + { 0x38, "TxTrafficClass0", }, + { 0x39, "TxTrafficClass1", }, + { 0x3a, "TxTrafficClass2", }, + { 0x3b, "TxTrafficClass3", }, + { 0x3c, "TxTrafficClass4", }, + { 0x3d, "TxTrafficClass5", }, + { 0x3e, "TxTrafficClass6", }, + { 0x3f, "TxTrafficClass7", }, +}; + +static u16 hellcreek_read(struct hellcreek *hellcreek, unsigned int offset) +{ + return readw(hellcreek->base + offset); +} + +static u16 hellcreek_read_ctrl(struct hellcreek *hellcreek) +{ + return readw(hellcreek->base + HR_CTRL_C); +} + +static u16 hellcreek_read_stat(struct hellcreek *hellcreek) +{ + return readw(hellcreek->base + HR_SWSTAT); +} + +static void hellcreek_write(struct hellcreek *hellcreek, u16 data, + unsigned int offset) +{ + writew(data, hellcreek->base + offset); +} + +static void hellcreek_select_port(struct hellcreek *hellcreek, int port) +{ + u16 val = port << HR_PSEL_PTWSEL_SHIFT; + + hellcreek_write(hellcreek, val, HR_PSEL); +} + +static void hellcreek_select_prio(struct hellcreek *hellcreek, int prio) +{ + u16 val = prio << HR_PSEL_PRTCWSEL_SHIFT; + + hellcreek_write(hellcreek, val, HR_PSEL); +} + +static void hellcreek_select_counter(struct hellcreek *hellcreek, int counter) +{ + u16 val = counter << HR_CSEL_SHIFT; + + hellcreek_write(hellcreek, val, HR_CSEL); + + /* Data sheet states to wait at least 20 internal clock cycles */ + ndelay(200); +} + +static void hellcreek_select_vlan(struct hellcreek *hellcreek, int vid, + bool pvid) +{ + u16 val = 0; + + /* Set pvid bit first */ + if (pvid) + val |= HR_VIDCFG_PVID; + hellcreek_write(hellcreek, val, HR_VIDCFG); + + /* Set vlan */ + val |= vid << 
HR_VIDCFG_VID_SHIFT; + hellcreek_write(hellcreek, val, HR_VIDCFG); +} + +static int hellcreek_wait_until_ready(struct hellcreek *hellcreek) +{ + u16 val; + + /* Wait up to 1ms, although 3 us should be enough */ + return readx_poll_timeout(hellcreek_read_ctrl, hellcreek, + val, val & HR_CTRL_C_READY, + 3, 1000); +} + +static int hellcreek_wait_until_transitioned(struct hellcreek *hellcreek) +{ + u16 val; + + return readx_poll_timeout_atomic(hellcreek_read_ctrl, hellcreek, + val, !(val & HR_CTRL_C_TRANSITION), + 1, 1000); +} + +static int hellcreek_wait_fdb_ready(struct hellcreek *hellcreek) +{ + u16 val; + + return readx_poll_timeout_atomic(hellcreek_read_stat, hellcreek, + val, !(val & HR_SWSTAT_BUSY), + 1, 1000); +} + +static int hellcreek_detect(struct hellcreek *hellcreek) +{ + u16 id, rel_low, rel_high, date_low, date_high, tgd_ver; + u8 tgd_maj, tgd_min; + u32 rel, date; + + id = hellcreek_read(hellcreek, HR_MODID_C); + rel_low = hellcreek_read(hellcreek, HR_REL_L_C); + rel_high = hellcreek_read(hellcreek, HR_REL_H_C); + date_low = hellcreek_read(hellcreek, HR_BLD_L_C); + date_high = hellcreek_read(hellcreek, HR_BLD_H_C); + tgd_ver = hellcreek_read(hellcreek, TR_TGDVER); + + if (id != hellcreek->pdata->module_id) + return -ENODEV; + + rel = rel_low | (rel_high << 16); + date = date_low | (date_high << 16); + tgd_maj = (tgd_ver & TR_TGDVER_REV_MAJ_MASK) >> TR_TGDVER_REV_MAJ_SHIFT; + tgd_min = (tgd_ver & TR_TGDVER_REV_MIN_MASK) >> TR_TGDVER_REV_MIN_SHIFT; + + dev_info(hellcreek->dev, "Module ID=%02x Release=%04x Date=%04x TGD Version=%02x.%02x\n", + id, rel, date, tgd_maj, tgd_min); + + return 0; +} + +static void hellcreek_feature_detect(struct hellcreek *hellcreek) +{ + u16 features; + + features = hellcreek_read(hellcreek, HR_FEABITS0); + + /* Currently we only detect the size of the FDB table */ + hellcreek->fdb_entries = ((features & HR_FEABITS0_FDBBINS_MASK) >> + HR_FEABITS0_FDBBINS_SHIFT) * 32; + + dev_info(hellcreek->dev, "Feature detect: FDB entries=%zu\n", + hellcreek->fdb_entries); +} + +static enum dsa_tag_protocol hellcreek_get_tag_protocol(struct dsa_switch *ds, + int port, + enum dsa_tag_protocol mp) +{ + return DSA_TAG_PROTO_HELLCREEK; +} + +static int hellcreek_port_enable(struct dsa_switch *ds, int port, + struct phy_device *phy) +{ + struct hellcreek *hellcreek = ds->priv; + struct hellcreek_port *hellcreek_port; + u16 val; + + hellcreek_port = &hellcreek->ports[port]; + + dev_dbg(hellcreek->dev, "Enable port %d\n", port); + + mutex_lock(&hellcreek->reg_lock); + + hellcreek_select_port(hellcreek, port); + val = hellcreek_port->ptcfg; + val |= HR_PTCFG_ADMIN_EN; + hellcreek_write(hellcreek, val, HR_PTCFG); + hellcreek_port->ptcfg = val; + + mutex_unlock(&hellcreek->reg_lock); + + return 0; +} + +static void hellcreek_port_disable(struct dsa_switch *ds, int port) +{ + struct hellcreek *hellcreek = ds->priv; + struct hellcreek_port *hellcreek_port; + u16 val; + + hellcreek_port = &hellcreek->ports[port]; + + dev_dbg(hellcreek->dev, "Disable port %d\n", port); + + mutex_lock(&hellcreek->reg_lock); + + hellcreek_select_port(hellcreek, port); + val = hellcreek_port->ptcfg; + val &= ~HR_PTCFG_ADMIN_EN; + hellcreek_write(hellcreek, val, HR_PTCFG); + hellcreek_port->ptcfg = val; + + mutex_unlock(&hellcreek->reg_lock); +} + +static void hellcreek_get_strings(struct dsa_switch *ds, int port, + u32 stringset, uint8_t *data) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(hellcreek_counter); ++i) { + const struct hellcreek_counter *counter = &hellcreek_counter[i]; + + strlcpy(data + i 
* ETH_GSTRING_LEN, + counter->name, ETH_GSTRING_LEN); + } +} + +static int hellcreek_get_sset_count(struct dsa_switch *ds, int port, int sset) +{ + if (sset != ETH_SS_STATS) + return 0; + + return ARRAY_SIZE(hellcreek_counter); +} + +static void hellcreek_get_ethtool_stats(struct dsa_switch *ds, int port, + uint64_t *data) +{ + struct hellcreek *hellcreek = ds->priv; + struct hellcreek_port *hellcreek_port; + int i; + + hellcreek_port = &hellcreek->ports[port]; + + for (i = 0; i < ARRAY_SIZE(hellcreek_counter); ++i) { + const struct hellcreek_counter *counter = &hellcreek_counter[i]; + u8 offset = counter->offset + port * 64; + u16 high, low; + u64 value = 0; + + mutex_lock(&hellcreek->reg_lock); + + hellcreek_select_counter(hellcreek, offset); + + /* The registers are locked internally by selecting the + * counter. So low and high can be read without reading high + * again. + */ + high = hellcreek_read(hellcreek, HR_CRDH); + low = hellcreek_read(hellcreek, HR_CRDL); + value = (high << 16) | low; + + hellcreek_port->counter_values[i] += value; + data[i] = hellcreek_port->counter_values[i]; + + mutex_unlock(&hellcreek->reg_lock); + } +} + +static u16 hellcreek_private_vid(int port) +{ + return VLAN_N_VID - port + 1; +} + +static int hellcreek_vlan_prepare(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) +{ + struct hellcreek *hellcreek = ds->priv; + int i; + + dev_dbg(hellcreek->dev, "VLAN prepare for port %d\n", port); + + /* Restriction: Make sure that nobody uses the "private" VLANs. These + * VLANs are internally used by the driver to ensure port + * separation. Thus, they cannot be used by someone else. + */ + for (i = 0; i < hellcreek->pdata->num_ports; ++i) { + const u16 restricted_vid = hellcreek_private_vid(i); + u16 vid; + + if (!dsa_is_user_port(ds, i)) + continue; + + for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) + if (vid == restricted_vid) + return -EBUSY; + } + + return 0; +} + +static void hellcreek_select_vlan_params(struct hellcreek *hellcreek, int port, + int *shift, int *mask) +{ + switch (port) { + case 0: + *shift = HR_VIDMBRCFG_P0MBR_SHIFT; + *mask = HR_VIDMBRCFG_P0MBR_MASK; + break; + case 1: + *shift = HR_VIDMBRCFG_P1MBR_SHIFT; + *mask = HR_VIDMBRCFG_P1MBR_MASK; + break; + case 2: + *shift = HR_VIDMBRCFG_P2MBR_SHIFT; + *mask = HR_VIDMBRCFG_P2MBR_MASK; + break; + case 3: + *shift = HR_VIDMBRCFG_P3MBR_SHIFT; + *mask = HR_VIDMBRCFG_P3MBR_MASK; + break; + default: + *shift = *mask = 0; + dev_err(hellcreek->dev, "Unknown port %d selected!\n", port); + } +} + +static void hellcreek_apply_vlan(struct hellcreek *hellcreek, int port, u16 vid, + bool pvid, bool untagged) +{ + int shift, mask; + u16 val; + + dev_dbg(hellcreek->dev, "Apply VLAN: port=%d vid=%u pvid=%d untagged=%d", + port, vid, pvid, untagged); + + mutex_lock(&hellcreek->reg_lock); + + hellcreek_select_port(hellcreek, port); + hellcreek_select_vlan(hellcreek, vid, pvid); + + /* Setup port vlan membership */ + hellcreek_select_vlan_params(hellcreek, port, &shift, &mask); + val = hellcreek->vidmbrcfg[vid]; + val &= ~mask; + if (untagged) + val |= HELLCREEK_VLAN_UNTAGGED_MEMBER << shift; + else + val |= HELLCREEK_VLAN_TAGGED_MEMBER << shift; + + hellcreek_write(hellcreek, val, HR_VIDMBRCFG); + hellcreek->vidmbrcfg[vid] = val; + + mutex_unlock(&hellcreek->reg_lock); +} + +static void hellcreek_unapply_vlan(struct hellcreek *hellcreek, int port, + u16 vid) +{ + int shift, mask; + u16 val; + + dev_dbg(hellcreek->dev, "Unapply VLAN: port=%d vid=%u\n", port, vid); + + 
mutex_lock(&hellcreek->reg_lock); + + hellcreek_select_vlan(hellcreek, vid, 0); + + /* Setup port vlan membership */ + hellcreek_select_vlan_params(hellcreek, port, &shift, &mask); + val = hellcreek->vidmbrcfg[vid]; + val &= ~mask; + val |= HELLCREEK_VLAN_NO_MEMBER << shift; + + hellcreek_write(hellcreek, val, HR_VIDMBRCFG); + hellcreek->vidmbrcfg[vid] = val; + + mutex_unlock(&hellcreek->reg_lock); +} + +static void hellcreek_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) +{ + bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; + bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; + struct hellcreek *hellcreek = ds->priv; + u16 vid; + + dev_dbg(hellcreek->dev, "Add VLANs (%d -- %d) on port %d, %s, %s\n", + vlan->vid_begin, vlan->vid_end, port, + untagged ? "untagged" : "tagged", + pvid ? "PVID" : "no PVID"); + + for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) + hellcreek_apply_vlan(hellcreek, port, vid, pvid, untagged); +} + +static int hellcreek_vlan_del(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) +{ + struct hellcreek *hellcreek = ds->priv; + u16 vid; + + dev_dbg(hellcreek->dev, "Remove VLANs (%d -- %d) on port %d\n", + vlan->vid_begin, vlan->vid_end, port); + + for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) + hellcreek_unapply_vlan(hellcreek, port, vid); + + return 0; +} + +static void hellcreek_port_stp_state_set(struct dsa_switch *ds, int port, + u8 state) +{ + struct hellcreek *hellcreek = ds->priv; + struct hellcreek_port *hellcreek_port; + const char *new_state; + u16 val; + + mutex_lock(&hellcreek->reg_lock); + + hellcreek_port = &hellcreek->ports[port]; + val = hellcreek_port->ptcfg; + + switch (state) { + case BR_STATE_DISABLED: + new_state = "DISABLED"; + val |= HR_PTCFG_BLOCKED; + val &= ~HR_PTCFG_LEARNING_EN; + break; + case BR_STATE_BLOCKING: + new_state = "BLOCKING"; + val |= HR_PTCFG_BLOCKED; + val &= ~HR_PTCFG_LEARNING_EN; + break; + case BR_STATE_LISTENING: + new_state = "LISTENING"; + val |= HR_PTCFG_BLOCKED; + val &= ~HR_PTCFG_LEARNING_EN; + break; + case BR_STATE_LEARNING: + new_state = "LEARNING"; + val |= HR_PTCFG_BLOCKED; + val |= HR_PTCFG_LEARNING_EN; + break; + case BR_STATE_FORWARDING: + new_state = "FORWARDING"; + val &= ~HR_PTCFG_BLOCKED; + val |= HR_PTCFG_LEARNING_EN; + break; + default: + new_state = "UNKNOWN"; + } + + hellcreek_select_port(hellcreek, port); + hellcreek_write(hellcreek, val, HR_PTCFG); + hellcreek_port->ptcfg = val; + + mutex_unlock(&hellcreek->reg_lock); + + dev_dbg(hellcreek->dev, "Configured STP state for port %d: %s\n", + port, new_state); +} + +static void hellcreek_setup_ingressflt(struct hellcreek *hellcreek, int port, + bool enable) +{ + struct hellcreek_port *hellcreek_port = &hellcreek->ports[port]; + u16 ptcfg; + + mutex_lock(&hellcreek->reg_lock); + + ptcfg = hellcreek_port->ptcfg; + + if (enable) + ptcfg |= HR_PTCFG_INGRESSFLT; + else + ptcfg &= ~HR_PTCFG_INGRESSFLT; + + hellcreek_select_port(hellcreek, port); + hellcreek_write(hellcreek, ptcfg, HR_PTCFG); + hellcreek_port->ptcfg = ptcfg; + + mutex_unlock(&hellcreek->reg_lock); +} + +static void hellcreek_setup_vlan_awareness(struct hellcreek *hellcreek, + bool enable) +{ + u16 swcfg; + + mutex_lock(&hellcreek->reg_lock); + + swcfg = hellcreek->swcfg; + + if (enable) + swcfg |= HR_SWCFG_VLAN_UNAWARE; + else + swcfg &= ~HR_SWCFG_VLAN_UNAWARE; + + hellcreek_write(hellcreek, swcfg, HR_SWCFG); + + mutex_unlock(&hellcreek->reg_lock); +} + +/* Default setup for DSA: VLAN : CPU and Port 
egress untagged. */ +static void hellcreek_setup_vlan_membership(struct dsa_switch *ds, int port, + bool enabled) +{ + const u16 vid = hellcreek_private_vid(port); + int upstream = dsa_upstream_port(ds, port); + struct hellcreek *hellcreek = ds->priv; + + /* Apply vid to port as egress untagged and port vlan id */ + if (enabled) + hellcreek_apply_vlan(hellcreek, port, vid, true, true); + else + hellcreek_unapply_vlan(hellcreek, port, vid); + + /* Apply vid to cpu port as well */ + if (enabled) + hellcreek_apply_vlan(hellcreek, upstream, vid, false, true); + else + hellcreek_unapply_vlan(hellcreek, upstream, vid); +} + +static int hellcreek_port_bridge_join(struct dsa_switch *ds, int port, + struct net_device *br) +{ + struct hellcreek *hellcreek = ds->priv; + + dev_dbg(hellcreek->dev, "Port %d joins a bridge\n", port); + + /* When joining a vlan_filtering bridge, keep the switch VLAN aware */ + if (!ds->vlan_filtering) + hellcreek_setup_vlan_awareness(hellcreek, false); + + /* Drop private vlans */ + hellcreek_setup_vlan_membership(ds, port, false); + + return 0; +} + +static void hellcreek_port_bridge_leave(struct dsa_switch *ds, int port, + struct net_device *br) +{ + struct hellcreek *hellcreek = ds->priv; + + dev_dbg(hellcreek->dev, "Port %d leaves a bridge\n", port); + + /* Enable VLAN awareness */ + hellcreek_setup_vlan_awareness(hellcreek, true); + + /* Enable private vlans */ + hellcreek_setup_vlan_membership(ds, port, true); +} + +static int __hellcreek_fdb_add(struct hellcreek *hellcreek, + const struct hellcreek_fdb_entry *entry) +{ + u16 meta = 0; + + dev_dbg(hellcreek->dev, "Add static FDB entry: MAC=%pM, MASK=0x%02x, " + "OBT=%d, REPRIO_EN=%d, PRIO=%d\n", entry->mac, entry->portmask, + entry->is_obt, entry->reprio_en, entry->reprio_tc); + + /* Add mac address */ + hellcreek_write(hellcreek, entry->mac[1] | (entry->mac[0] << 8), HR_FDBWDH); + hellcreek_write(hellcreek, entry->mac[3] | (entry->mac[2] << 8), HR_FDBWDM); + hellcreek_write(hellcreek, entry->mac[5] | (entry->mac[4] << 8), HR_FDBWDL); + + /* Meta data */ + meta |= entry->portmask << HR_FDBWRM0_PORTMASK_SHIFT; + if (entry->is_obt) + meta |= HR_FDBWRM0_OBT; + if (entry->reprio_en) { + meta |= HR_FDBWRM0_REPRIO_EN; + meta |= entry->reprio_tc << HR_FDBWRM0_REPRIO_TC_SHIFT; + } + hellcreek_write(hellcreek, meta, HR_FDBWRM0); + + /* Commit */ + hellcreek_write(hellcreek, 0x00, HR_FDBWRCMD); + + /* Wait until done */ + return hellcreek_wait_fdb_ready(hellcreek); +} + +static int __hellcreek_fdb_del(struct hellcreek *hellcreek, + const struct hellcreek_fdb_entry *entry) +{ + dev_dbg(hellcreek->dev, "Delete FDB entry: MAC=%pM!\n", entry->mac); + + /* Delete by matching idx */ + hellcreek_write(hellcreek, entry->idx | HR_FDBWRCMD_FDBDEL, HR_FDBWRCMD); + + /* Wait until done */ + return hellcreek_wait_fdb_ready(hellcreek); +} + +/* Retrieve the index of a FDB entry by mac address. Currently we search through + * the complete table in hardware. If that's too slow, we might have to cache + * the complete FDB table in software. + */ +static int hellcreek_fdb_get(struct hellcreek *hellcreek, + const unsigned char *dest, + struct hellcreek_fdb_entry *entry) +{ + size_t i; + + /* Set read pointer to zero: The read of HR_FDBMAX (read-only register) + * should reset the internal pointer. But, that doesn't work. The vendor + * suggested a subsequent write as workaround. Same for HR_FDBRDH below. 
+ */ + hellcreek_read(hellcreek, HR_FDBMAX); + hellcreek_write(hellcreek, 0x00, HR_FDBMAX); + + /* We have to read the complete table, because the switch/driver might + * enter new entries anywhere. + */ + for (i = 0; i < hellcreek->fdb_entries; ++i) { + unsigned char addr[ETH_ALEN]; + u16 meta, mac; + + meta = hellcreek_read(hellcreek, HR_FDBMDRD); + mac = hellcreek_read(hellcreek, HR_FDBRDL); + addr[5] = mac & 0xff; + addr[4] = (mac & 0xff00) >> 8; + mac = hellcreek_read(hellcreek, HR_FDBRDM); + addr[3] = mac & 0xff; + addr[2] = (mac & 0xff00) >> 8; + mac = hellcreek_read(hellcreek, HR_FDBRDH); + addr[1] = mac & 0xff; + addr[0] = (mac & 0xff00) >> 8; + + /* Force next entry */ + hellcreek_write(hellcreek, 0x00, HR_FDBRDH); + + if (memcmp(addr, dest, ETH_ALEN)) + continue; + + /* Match found */ + entry->idx = i; + entry->portmask = (meta & HR_FDBMDRD_PORTMASK_MASK) >> + HR_FDBMDRD_PORTMASK_SHIFT; + entry->age = (meta & HR_FDBMDRD_AGE_MASK) >> + HR_FDBMDRD_AGE_SHIFT; + entry->is_obt = !!(meta & HR_FDBMDRD_OBT); + entry->pass_blocked = !!(meta & HR_FDBMDRD_PASS_BLOCKED); + entry->is_static = !!(meta & HR_FDBMDRD_STATIC); + entry->reprio_tc = (meta & HR_FDBMDRD_REPRIO_TC_MASK) >> + HR_FDBMDRD_REPRIO_TC_SHIFT; + entry->reprio_en = !!(meta & HR_FDBMDRD_REPRIO_EN); + memcpy(entry->mac, addr, sizeof(addr)); + + return 0; + } + + return -ENOENT; +} + +static int hellcreek_fdb_add(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) +{ + struct hellcreek_fdb_entry entry = { 0 }; + struct hellcreek *hellcreek = ds->priv; + int ret; + + dev_dbg(hellcreek->dev, "Add FDB entry for MAC=%pM\n", addr); + + mutex_lock(&hellcreek->reg_lock); + + ret = hellcreek_fdb_get(hellcreek, addr, &entry); + if (ret) { + /* Not found */ + memcpy(entry.mac, addr, sizeof(entry.mac)); + entry.portmask = BIT(port); + + ret = __hellcreek_fdb_add(hellcreek, &entry); + if (ret) { + dev_err(hellcreek->dev, "Failed to add FDB entry!\n"); + goto out; + } + } else { + /* Found */ + ret = __hellcreek_fdb_del(hellcreek, &entry); + if (ret) { + dev_err(hellcreek->dev, "Failed to delete FDB entry!\n"); + goto out; + } + + entry.portmask |= BIT(port); + + ret = __hellcreek_fdb_add(hellcreek, &entry); + if (ret) { + dev_err(hellcreek->dev, "Failed to add FDB entry!\n"); + goto out; + } + } + +out: + mutex_unlock(&hellcreek->reg_lock); + + return ret; +} + +static int hellcreek_fdb_del(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) +{ + struct hellcreek_fdb_entry entry = { 0 }; + struct hellcreek *hellcreek = ds->priv; + int ret; + + dev_dbg(hellcreek->dev, "Delete FDB entry for MAC=%pM\n", addr); + + mutex_lock(&hellcreek->reg_lock); + + ret = hellcreek_fdb_get(hellcreek, addr, &entry); + if (ret) { + /* Not found */ + dev_err(hellcreek->dev, "FDB entry for deletion not found!\n"); + } else { + /* Found */ + ret = __hellcreek_fdb_del(hellcreek, &entry); + if (ret) { + dev_err(hellcreek->dev, "Failed to delete FDB entry!\n"); + goto out; + } + + entry.portmask &= ~BIT(port); + + if (entry.portmask != 0x00) { + ret = __hellcreek_fdb_add(hellcreek, &entry); + if (ret) { + dev_err(hellcreek->dev, "Failed to add FDB entry!\n"); + goto out; + } + } + } + +out: + mutex_unlock(&hellcreek->reg_lock); + + return ret; +} + +static int hellcreek_fdb_dump(struct dsa_switch *ds, int port, + dsa_fdb_dump_cb_t *cb, void *data) +{ + struct hellcreek *hellcreek = ds->priv; + u16 entries; + size_t i; + + mutex_lock(&hellcreek->reg_lock); + + /* Set read pointer to zero: The read of HR_FDBMAX (read-only register) 
+ * should reset the internal pointer. But, that doesn't work. The vendor + * suggested a subsequent write as workaround. Same for HR_FDBRDH below. + */ + entries = hellcreek_read(hellcreek, HR_FDBMAX); + hellcreek_write(hellcreek, 0x00, HR_FDBMAX); + + dev_dbg(hellcreek->dev, "FDB dump for port %d, entries=%d!\n", port, entries); + + /* Read table */ + for (i = 0; i < hellcreek->fdb_entries; ++i) { + unsigned char null_addr[ETH_ALEN] = { 0 }; + struct hellcreek_fdb_entry entry = { 0 }; + u16 meta, mac; + + meta = hellcreek_read(hellcreek, HR_FDBMDRD); + mac = hellcreek_read(hellcreek, HR_FDBRDL); + entry.mac[5] = mac & 0xff; + entry.mac[4] = (mac & 0xff00) >> 8; + mac = hellcreek_read(hellcreek, HR_FDBRDM); + entry.mac[3] = mac & 0xff; + entry.mac[2] = (mac & 0xff00) >> 8; + mac = hellcreek_read(hellcreek, HR_FDBRDH); + entry.mac[1] = mac & 0xff; + entry.mac[0] = (mac & 0xff00) >> 8; + + /* Force next entry */ + hellcreek_write(hellcreek, 0x00, HR_FDBRDH); + + /* Check valid */ + if (!memcmp(entry.mac, null_addr, ETH_ALEN)) + continue; + + entry.portmask = (meta & HR_FDBMDRD_PORTMASK_MASK) >> + HR_FDBMDRD_PORTMASK_SHIFT; + entry.is_static = !!(meta & HR_FDBMDRD_STATIC); + + /* Check port mask */ + if (!(entry.portmask & BIT(port))) + continue; + + cb(entry.mac, 0, entry.is_static, data); + } + + mutex_unlock(&hellcreek->reg_lock); + + return 0; +} + +static int hellcreek_vlan_filtering(struct dsa_switch *ds, int port, + bool vlan_filtering, + struct switchdev_trans *trans) +{ + struct hellcreek *hellcreek = ds->priv; + + if (switchdev_trans_ph_prepare(trans)) + return 0; + + dev_dbg(hellcreek->dev, "%s VLAN filtering on port %d\n", + vlan_filtering ? "Enable" : "Disable", port); + + /* Configure the port to drop packets with unknown VIDs */ + hellcreek_setup_ingressflt(hellcreek, port, vlan_filtering); + + /* Enable VLAN awareness on the switch. This is safe due to + * ds->vlan_filtering_is_global. + */ + hellcreek_setup_vlan_awareness(hellcreek, vlan_filtering); + + return 0; +} + +static int hellcreek_enable_ip_core(struct hellcreek *hellcreek) +{ + int ret; + u16 val; + + mutex_lock(&hellcreek->reg_lock); + + val = hellcreek_read(hellcreek, HR_CTRL_C); + val |= HR_CTRL_C_ENABLE; + hellcreek_write(hellcreek, val, HR_CTRL_C); + ret = hellcreek_wait_until_transitioned(hellcreek); + + mutex_unlock(&hellcreek->reg_lock); + + return ret; +} + +static void hellcreek_setup_cpu_and_tunnel_port(struct hellcreek *hellcreek) +{ + struct hellcreek_port *tunnel_port = &hellcreek->ports[TUNNEL_PORT]; + struct hellcreek_port *cpu_port = &hellcreek->ports[CPU_PORT]; + u16 ptcfg = 0; + + ptcfg |= HR_PTCFG_LEARNING_EN | HR_PTCFG_ADMIN_EN; + + mutex_lock(&hellcreek->reg_lock); + + hellcreek_select_port(hellcreek, CPU_PORT); + hellcreek_write(hellcreek, ptcfg, HR_PTCFG); + + hellcreek_select_port(hellcreek, TUNNEL_PORT); + hellcreek_write(hellcreek, ptcfg, HR_PTCFG); + + cpu_port->ptcfg = ptcfg; + tunnel_port->ptcfg = ptcfg; + + mutex_unlock(&hellcreek->reg_lock); +} + +static void hellcreek_setup_tc_identity_mapping(struct hellcreek *hellcreek) +{ + int i; + + /* The switch has multiple egress queues per port. The queue is selected + * via the PCP field in the VLAN header. The switch internally deals + * with traffic classes instead of PCP values and this mapping is + * configurable. + * + * The default mapping is (PCP - TC): + * 7 - 7 + * 6 - 6 + * 5 - 5 + * 4 - 4 + * 3 - 3 + * 2 - 1 + * 1 - 0 + * 0 - 2 + * + * As shown, the default is not an identity mapping; the loop below + * reprograms it to one. 
+ */ + + for (i = 0; i < 8; ++i) { + mutex_lock(&hellcreek->reg_lock); + + hellcreek_select_prio(hellcreek, i); + hellcreek_write(hellcreek, + i << HR_PRTCCFG_PCP_TC_MAP_SHIFT, + HR_PRTCCFG); + + mutex_unlock(&hellcreek->reg_lock); + } +} + +static int hellcreek_setup(struct dsa_switch *ds) +{ + struct hellcreek *hellcreek = ds->priv; + u16 swcfg = 0; + int ret, i; + + dev_dbg(hellcreek->dev, "Set up the switch\n"); + + /* Let's go */ + ret = hellcreek_enable_ip_core(hellcreek); + if (ret) { + dev_err(hellcreek->dev, "Failed to enable IP core!\n"); + return ret; + } + + /* Enable CPU/Tunnel ports */ + hellcreek_setup_cpu_and_tunnel_port(hellcreek); + + /* Switch config: Keep defaults, enable FDB aging and learning and tag + * each frame from/to cpu port for DSA tagging. Also enable the length + * aware shaping mode. This eliminates the need for Qbv guard bands. + */ + swcfg |= HR_SWCFG_FDBAGE_EN | + HR_SWCFG_FDBLRN_EN | + HR_SWCFG_ALWAYS_OBT | + (HR_SWCFG_LAS_ON << HR_SWCFG_LAS_MODE_SHIFT); + hellcreek->swcfg = swcfg; + hellcreek_write(hellcreek, swcfg, HR_SWCFG); + + /* Initial vlan membership to reflect port separation */ + for (i = 0; i < ds->num_ports; ++i) { + if (!dsa_is_user_port(ds, i)) + continue; + + hellcreek_setup_vlan_membership(ds, i, true); + } + + /* Configure PCP <-> TC mapping */ + hellcreek_setup_tc_identity_mapping(hellcreek); + + /* Allow VLAN configurations while not filtering, which is the default + * for new DSA drivers. + */ + ds->configure_vlan_while_not_filtering = true; + + /* The VLAN awareness is a global switch setting. Therefore, mixed vlan + * filtering setups are not supported. + */ + ds->vlan_filtering_is_global = true; + + return 0; +} + +static void hellcreek_phylink_validate(struct dsa_switch *ds, int port, + unsigned long *supported, + struct phylink_link_state *state) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, }; + struct hellcreek *hellcreek = ds->priv; + + dev_dbg(hellcreek->dev, "Phylink validate for port %d\n", port); + + /* The MAC settings are a hardware configuration option and cannot be + * changed at run time or by strapping. Therefore the attached PHYs + * should be programmed to only advertise settings which are supported + * by the hardware. + */ + if (hellcreek->pdata->is_100_mbits) + phylink_set(mask, 100baseT_Full); + else + phylink_set(mask, 1000baseT_Full); + + bitmap_and(supported, supported, mask, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_and(state->advertising, state->advertising, mask, + __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static int +hellcreek_port_prechangeupper(struct dsa_switch *ds, int port, + struct netdev_notifier_changeupper_info *info) +{ + struct hellcreek *hellcreek = ds->priv; + bool used = true; + int ret = -EBUSY; + u16 vid; + int i; + + dev_dbg(hellcreek->dev, "Pre change upper for port %d\n", port); + + /* + * Deny VLAN devices on top of lan ports with the same VLAN ids, because + * it breaks the port separation due to the private VLANs. Example: + * + * lan0.100 *and* lan1.100 cannot be used in parallel. However, lan0.99 + * and lan1.100 work. 
+ */ + + if (!is_vlan_dev(info->upper_dev)) + return 0; + + vid = vlan_dev_vlan_id(info->upper_dev); + + /* For all ports, check bitmaps */ + mutex_lock(&hellcreek->vlan_lock); + for (i = 0; i < hellcreek->pdata->num_ports; ++i) { + if (!dsa_is_user_port(ds, i)) + continue; + + if (port == i) + continue; + + used = used && test_bit(vid, hellcreek->ports[i].vlan_dev_bitmap); + } + + if (used) + goto out; + + /* Update bitmap */ + set_bit(vid, hellcreek->ports[port].vlan_dev_bitmap); + + ret = 0; + +out: + mutex_unlock(&hellcreek->vlan_lock); + + return ret; +} + +static const struct dsa_switch_ops hellcreek_ds_ops = { + .get_ethtool_stats = hellcreek_get_ethtool_stats, + .get_sset_count = hellcreek_get_sset_count, + .get_strings = hellcreek_get_strings, + .get_tag_protocol = hellcreek_get_tag_protocol, + .phylink_validate = hellcreek_phylink_validate, + .port_bridge_join = hellcreek_port_bridge_join, + .port_bridge_leave = hellcreek_port_bridge_leave, + .port_disable = hellcreek_port_disable, + .port_enable = hellcreek_port_enable, + .port_fdb_add = hellcreek_fdb_add, + .port_fdb_del = hellcreek_fdb_del, + .port_fdb_dump = hellcreek_fdb_dump, + .port_prechangeupper = hellcreek_port_prechangeupper, + .port_stp_state_set = hellcreek_port_stp_state_set, + .port_vlan_add = hellcreek_vlan_add, + .port_vlan_del = hellcreek_vlan_del, + .port_vlan_filtering = hellcreek_vlan_filtering, + .port_vlan_prepare = hellcreek_vlan_prepare, + .setup = hellcreek_setup, +}; + +static int hellcreek_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct hellcreek *hellcreek; + struct resource *res; + int ret, i; + + hellcreek = devm_kzalloc(dev, sizeof(*hellcreek), GFP_KERNEL); + if (!hellcreek) + return -ENOMEM; + + hellcreek->vidmbrcfg = devm_kcalloc(dev, VLAN_N_VID, + sizeof(*hellcreek->vidmbrcfg), + GFP_KERNEL); + if (!hellcreek->vidmbrcfg) + return -ENOMEM; + + hellcreek->pdata = of_device_get_match_data(dev); + + hellcreek->ports = devm_kcalloc(dev, hellcreek->pdata->num_ports, + sizeof(*hellcreek->ports), + GFP_KERNEL); + if (!hellcreek->ports) + return -ENOMEM; + + for (i = 0; i < hellcreek->pdata->num_ports; ++i) { + struct hellcreek_port *port = &hellcreek->ports[i]; + + port->counter_values = + devm_kcalloc(dev, + ARRAY_SIZE(hellcreek_counter), + sizeof(*port->counter_values), + GFP_KERNEL); + if (!port->counter_values) + return -ENOMEM; + + port->vlan_dev_bitmap = + devm_kcalloc(dev, + BITS_TO_LONGS(VLAN_N_VID), + sizeof(unsigned long), + GFP_KERNEL); + if (!port->vlan_dev_bitmap) + return -ENOMEM; + + port->hellcreek = hellcreek; + port->port = i; + } + + mutex_init(&hellcreek->reg_lock); + mutex_init(&hellcreek->vlan_lock); + + hellcreek->dev = dev; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "tsn"); + if (!res) { + dev_err(dev, "No memory region provided!\n"); + return -ENODEV; + } + + hellcreek->base = devm_ioremap_resource(dev, res); + if (IS_ERR(hellcreek->base)) { + dev_err(dev, "No memory available!\n"); + return PTR_ERR(hellcreek->base); + } + + ret = hellcreek_detect(hellcreek); + if (ret) { + dev_err(dev, "No (known) chip found!\n"); + return ret; + } + + ret = hellcreek_wait_until_ready(hellcreek); + if (ret) { + dev_err(dev, "Switch didn't become ready!\n"); + return ret; + } + + hellcreek_feature_detect(hellcreek); + + hellcreek->ds = devm_kzalloc(dev, sizeof(*hellcreek->ds), GFP_KERNEL); + if (!hellcreek->ds) + return -ENOMEM; + + hellcreek->ds->dev = dev; + hellcreek->ds->priv = hellcreek; + hellcreek->ds->ops = &hellcreek_ds_ops; + 
hellcreek->ds->num_ports = hellcreek->pdata->num_ports; + hellcreek->ds->num_tx_queues = HELLCREEK_NUM_EGRESS_QUEUES; + + ret = dsa_register_switch(hellcreek->ds); + if (ret) { + dev_err(dev, "Unable to register switch\n"); + return ret; + } + + platform_set_drvdata(pdev, hellcreek); + + return 0; +} + +static int hellcreek_remove(struct platform_device *pdev) +{ + struct hellcreek *hellcreek = platform_get_drvdata(pdev); + + dsa_unregister_switch(hellcreek->ds); + platform_set_drvdata(pdev, NULL); + + return 0; +} + +static const struct hellcreek_platform_data de1soc_r1_pdata = { + .num_ports = 4, + .is_100_mbits = 1, + .qbv_support = 1, + .qbv_on_cpu_port = 1, + .qbu_support = 0, + .module_id = 0x4c30, +}; + +static const struct of_device_id hellcreek_of_match[] = { + { + .compatible = "hirschmann,hellcreek-de1soc-r1", + .data = &de1soc_r1_pdata, + }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, hellcreek_of_match); + +static struct platform_driver hellcreek_driver = { + .probe = hellcreek_probe, + .remove = hellcreek_remove, + .driver = { + .name = "hellcreek", + .of_match_table = hellcreek_of_match, + }, +}; +module_platform_driver(hellcreek_driver); + +MODULE_AUTHOR("Kurt Kanzenbach "); +MODULE_DESCRIPTION("Hirschmann Hellcreek driver"); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/net/dsa/hirschmann/hellcreek.h b/drivers/net/dsa/hirschmann/hellcreek.h new file mode 100644 index 000000000000..762bf9734630 --- /dev/null +++ b/drivers/net/dsa/hirschmann/hellcreek.h @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: (GPL-2.0 or MIT) */ +/* + * DSA driver for: + * Hirschmann Hellcreek TSN switch. + * + * Copyright (C) 2019,2020 Linutronix GmbH + * Author Kurt Kanzenbach + */ + +#ifndef _HELLCREEK_H_ +#define _HELLCREEK_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Ports: + * - 0: CPU + * - 1: Tunnel + * - 2: TSN front port 1 + * - 3: TSN front port 2 + * - ... 
+ */ +#define CPU_PORT 0 +#define TUNNEL_PORT 1 + +#define HELLCREEK_VLAN_NO_MEMBER 0x0 +#define HELLCREEK_VLAN_UNTAGGED_MEMBER 0x1 +#define HELLCREEK_VLAN_TAGGED_MEMBER 0x3 +#define HELLCREEK_NUM_EGRESS_QUEUES 8 + +/* Register definitions */ +#define HR_MODID_C (0 * 2) +#define HR_REL_L_C (1 * 2) +#define HR_REL_H_C (2 * 2) +#define HR_BLD_L_C (3 * 2) +#define HR_BLD_H_C (4 * 2) +#define HR_CTRL_C (5 * 2) +#define HR_CTRL_C_READY BIT(14) +#define HR_CTRL_C_TRANSITION BIT(13) +#define HR_CTRL_C_ENABLE BIT(0) + +#define HR_PSEL (0xa6 * 2) +#define HR_PSEL_PTWSEL_SHIFT 4 +#define HR_PSEL_PTWSEL_MASK GENMASK(5, 4) +#define HR_PSEL_PRTCWSEL_SHIFT 0 +#define HR_PSEL_PRTCWSEL_MASK GENMASK(2, 0) + +#define HR_PTCFG (0xa7 * 2) +#define HR_PTCFG_MLIMIT_EN BIT(13) +#define HR_PTCFG_UMC_FLT BIT(10) +#define HR_PTCFG_UUC_FLT BIT(9) +#define HR_PTCFG_UNTRUST BIT(8) +#define HR_PTCFG_TAG_REQUIRED BIT(7) +#define HR_PTCFG_PPRIO_SHIFT 4 +#define HR_PTCFG_PPRIO_MASK GENMASK(6, 4) +#define HR_PTCFG_INGRESSFLT BIT(3) +#define HR_PTCFG_BLOCKED BIT(2) +#define HR_PTCFG_LEARNING_EN BIT(1) +#define HR_PTCFG_ADMIN_EN BIT(0) + +#define HR_PRTCCFG (0xa8 * 2) +#define HR_PRTCCFG_PCP_TC_MAP_SHIFT 0 +#define HR_PRTCCFG_PCP_TC_MAP_MASK GENMASK(2, 0) + +#define HR_CSEL (0x8d * 2) +#define HR_CSEL_SHIFT 0 +#define HR_CSEL_MASK GENMASK(7, 0) +#define HR_CRDL (0x8e * 2) +#define HR_CRDH (0x8f * 2) + +#define HR_SWTRC_CFG (0x90 * 2) +#define HR_SWTRC0 (0x91 * 2) +#define HR_SWTRC1 (0x92 * 2) +#define HR_PFREE (0x93 * 2) +#define HR_MFREE (0x94 * 2) + +#define HR_FDBAGE (0x97 * 2) +#define HR_FDBMAX (0x98 * 2) +#define HR_FDBRDL (0x99 * 2) +#define HR_FDBRDM (0x9a * 2) +#define HR_FDBRDH (0x9b * 2) + +#define HR_FDBMDRD (0x9c * 2) +#define HR_FDBMDRD_PORTMASK_SHIFT 0 +#define HR_FDBMDRD_PORTMASK_MASK GENMASK(3, 0) +#define HR_FDBMDRD_AGE_SHIFT 4 +#define HR_FDBMDRD_AGE_MASK GENMASK(7, 4) +#define HR_FDBMDRD_OBT BIT(8) +#define HR_FDBMDRD_PASS_BLOCKED BIT(9) +#define HR_FDBMDRD_STATIC BIT(11) +#define HR_FDBMDRD_REPRIO_TC_SHIFT 12 +#define HR_FDBMDRD_REPRIO_TC_MASK GENMASK(14, 12) +#define HR_FDBMDRD_REPRIO_EN BIT(15) + +#define HR_FDBWDL (0x9d * 2) +#define HR_FDBWDM (0x9e * 2) +#define HR_FDBWDH (0x9f * 2) +#define HR_FDBWRM0 (0xa0 * 2) +#define HR_FDBWRM0_PORTMASK_SHIFT 0 +#define HR_FDBWRM0_PORTMASK_MASK GENMASK(3, 0) +#define HR_FDBWRM0_OBT BIT(8) +#define HR_FDBWRM0_PASS_BLOCKED BIT(9) +#define HR_FDBWRM0_REPRIO_TC_SHIFT 12 +#define HR_FDBWRM0_REPRIO_TC_MASK GENMASK(14, 12) +#define HR_FDBWRM0_REPRIO_EN BIT(15) +#define HR_FDBWRM1 (0xa1 * 2) + +#define HR_FDBWRCMD (0xa2 * 2) +#define HR_FDBWRCMD_FDBDEL BIT(9) + +#define HR_SWCFG (0xa3 * 2) +#define HR_SWCFG_GM_STATEMD BIT(15) +#define HR_SWCFG_LAS_MODE_SHIFT 12 +#define HR_SWCFG_LAS_MODE_MASK GENMASK(13, 12) +#define HR_SWCFG_LAS_OFF (0x00) +#define HR_SWCFG_LAS_ON (0x01) +#define HR_SWCFG_LAS_STATIC (0x10) +#define HR_SWCFG_CT_EN BIT(11) +#define HR_SWCFG_VLAN_UNAWARE BIT(10) +#define HR_SWCFG_ALWAYS_OBT BIT(9) +#define HR_SWCFG_FDBAGE_EN BIT(5) +#define HR_SWCFG_FDBLRN_EN BIT(4) + +#define HR_SWSTAT (0xa4 * 2) +#define HR_SWSTAT_FAIL BIT(4) +#define HR_SWSTAT_BUSY BIT(0) + +#define HR_SWCMD (0xa5 * 2) +#define HW_SWCMD_FLUSH BIT(0) + +#define HR_VIDCFG (0xaa * 2) +#define HR_VIDCFG_VID_SHIFT 0 +#define HR_VIDCFG_VID_MASK GENMASK(11, 0) +#define HR_VIDCFG_PVID BIT(12) + +#define HR_VIDMBRCFG (0xab * 2) +#define HR_VIDMBRCFG_P0MBR_SHIFT 0 +#define HR_VIDMBRCFG_P0MBR_MASK GENMASK(1, 0) +#define HR_VIDMBRCFG_P1MBR_SHIFT 2 +#define HR_VIDMBRCFG_P1MBR_MASK GENMASK(3, 2) 
+#define HR_VIDMBRCFG_P2MBR_SHIFT 4
+#define HR_VIDMBRCFG_P2MBR_MASK GENMASK(5, 4)
+#define HR_VIDMBRCFG_P3MBR_SHIFT 6
+#define HR_VIDMBRCFG_P3MBR_MASK GENMASK(7, 6)
+
+#define HR_FEABITS0 (0xac * 2)
+#define HR_FEABITS0_FDBBINS_SHIFT 4
+#define HR_FEABITS0_FDBBINS_MASK GENMASK(7, 4)
+#define HR_FEABITS0_PCNT_SHIFT 8
+#define HR_FEABITS0_PCNT_MASK GENMASK(11, 8)
+#define HR_FEABITS0_MCNT_SHIFT 12
+#define HR_FEABITS0_MCNT_MASK GENMASK(15, 12)
+
+#define TR_QTRACK (0xb1 * 2)
+#define TR_TGDVER (0xb3 * 2)
+#define TR_TGDVER_REV_MIN_MASK GENMASK(7, 0)
+#define TR_TGDVER_REV_MIN_SHIFT 0
+#define TR_TGDVER_REV_MAJ_MASK GENMASK(15, 8)
+#define TR_TGDVER_REV_MAJ_SHIFT 8
+#define TR_TGDSEL (0xb4 * 2)
+#define TR_TGDSEL_TDGSEL_MASK GENMASK(1, 0)
+#define TR_TGDSEL_TDGSEL_SHIFT 0
+#define TR_TGDCTRL (0xb5 * 2)
+#define TR_TGDCTRL_GATE_EN BIT(0)
+#define TR_TGDCTRL_CYC_SNAP BIT(4)
+#define TR_TGDCTRL_SNAP_EST BIT(5)
+#define TR_TGDCTRL_ADMINGATESTATES_MASK GENMASK(15, 8)
+#define TR_TGDCTRL_ADMINGATESTATES_SHIFT 8
+#define TR_TGDSTAT0 (0xb6 * 2)
+#define TR_TGDSTAT1 (0xb7 * 2)
+#define TR_ESTWRL (0xb8 * 2)
+#define TR_ESTWRH (0xb9 * 2)
+#define TR_ESTCMD (0xba * 2)
+#define TR_ESTCMD_ESTSEC_MASK GENMASK(2, 0)
+#define TR_ESTCMD_ESTSEC_SHIFT 0
+#define TR_ESTCMD_ESTARM BIT(4)
+#define TR_ESTCMD_ESTSWCFG BIT(5)
+#define TR_EETWRL (0xbb * 2)
+#define TR_EETWRH (0xbc * 2)
+#define TR_EETCMD (0xbd * 2)
+#define TR_EETCMD_EETSEC_MASK GENMASK(2, 0)
+#define TR_EETCMD_EETSEC_SHIFT 0
+#define TR_EETCMD_EETARM BIT(4)
+#define TR_CTWRL (0xbe * 2)
+#define TR_CTWRH (0xbf * 2)
+#define TR_LCNSL (0xc1 * 2)
+#define TR_LCNSH (0xc2 * 2)
+#define TR_LCS (0xc3 * 2)
+#define TR_GCLDAT (0xc4 * 2)
+#define TR_GCLDAT_GCLWRGATES_MASK GENMASK(7, 0)
+#define TR_GCLDAT_GCLWRGATES_SHIFT 0
+#define TR_GCLDAT_GCLWRLAST BIT(8)
+#define TR_GCLDAT_GCLOVRI BIT(9)
+#define TR_GCLTIL (0xc5 * 2)
+#define TR_GCLTIH (0xc6 * 2)
+#define TR_GCLCMD (0xc7 * 2)
+#define TR_GCLCMD_GCLWRADR_MASK GENMASK(7, 0)
+#define TR_GCLCMD_GCLWRADR_SHIFT 0
+#define TR_GCLCMD_INIT_GATE_STATES_MASK GENMASK(15, 8)
+#define TR_GCLCMD_INIT_GATE_STATES_SHIFT 8
+
+struct hellcreek_counter {
+ u8 offset;
+ const char *name;
+};
+
+struct hellcreek;
+
+struct hellcreek_port {
+ struct hellcreek *hellcreek;
+ unsigned long *vlan_dev_bitmap;
+ int port;
+ u16 ptcfg; /* ptcfg shadow */
+ u64 *counter_values;
+};
+
+struct hellcreek_fdb_entry {
+ size_t idx;
+ unsigned char mac[ETH_ALEN];
+ u8 portmask;
+ u8 age;
+ u8 is_obt;
+ u8 pass_blocked;
+ u8 is_static;
+ u8 reprio_tc;
+ u8 reprio_en;
+};
+
+struct hellcreek {
+ const struct hellcreek_platform_data *pdata;
+ struct device *dev;
+ struct dsa_switch *ds;
+ struct hellcreek_port *ports;
+ struct mutex reg_lock; /* Switch IP register lock */
+ struct mutex vlan_lock; /* VLAN bitmaps lock */
+ void __iomem *base;
+ u16 swcfg; /* swcfg shadow */
+ u8 *vidmbrcfg; /* vidmbrcfg shadow */
+ size_t fdb_entries;
+};
+
+#endif /* _HELLCREEK_H_ */
diff --git a/include/linux/platform_data/hirschmann-hellcreek.h b/include/linux/platform_data/hirschmann-hellcreek.h
new file mode 100644
index 000000000000..388846766bb2
--- /dev/null
+++ b/include/linux/platform_data/hirschmann-hellcreek.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: (GPL-2.0 or MIT) */
+/*
+ * Hirschmann Hellcreek TSN switch platform data.
+ *
+ * Copyright (C) 2020 Linutronix GmbH
+ * Author Kurt Kanzenbach
+ */
+
+#ifndef _HIRSCHMANN_HELLCREEK_H_
+#define _HIRSCHMANN_HELLCREEK_H_
+
+#include
+
+struct hellcreek_platform_data {
+ int num_ports; /* Amount of switch ports */
+ int is_100_mbits; /* Is it configured to 100 or 1000 mbit/s */
+ int qbv_support; /* Qbv support on front TSN ports */
+ int qbv_on_cpu_port; /* Qbv support on the CPU port */
+ int qbu_support; /* Qbu support on front TSN ports */
+ u16 module_id; /* Module identification */
+};
+
+#endif /* _HIRSCHMANN_HELLCREEK_H_ */
-- cgit
From 293e9a3d950dfebc76d9fa6931e6f91ef856b9ab Mon Sep 17 00:00:00 2001
From: Ioana Ciornei
Date: Sun, 1 Nov 2020 14:50:56 +0200
Subject: net: phy: export phy_error and phy_trigger_machine

These functions are currently used by phy_interrupt() to either signal an error condition or to trigger the link state machine. In an attempt to actually support shared PHY IRQs, export these two functions so that the actual PHY drivers can use them.

Cc: Alexandru Ardelean
Cc: Andre Edich
Cc: Antoine Tenart
Cc: Baruch Siach
Cc: Christophe Leroy
Cc: Dan Murphy
Cc: Divya Koppera
Cc: Florian Fainelli
Cc: Hauke Mehrtens
Cc: Heiner Kallweit
Cc: Jerome Brunet
Cc: Kavya Sree Kotagiri
Cc: Linus Walleij
Cc: Marco Felsch
Cc: Marek Vasut
Cc: Martin Blumenstingl
Cc: Mathias Kresin
Cc: Maxim Kochetkov
Cc: Michael Walle
Cc: Neil Armstrong
Cc: Nisar Sayed
Cc: Oleksij Rempel
Cc: Philippe Schenker
Cc: Willy Liu
Cc: Yuiko Oshino
Signed-off-by: Ioana Ciornei
Signed-off-by: Jakub Kicinski
---
 drivers/net/phy/phy.c | 6 ++++--
 include/linux/phy.h | 2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 35525a671400..477bdf2f94df 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -493,10 +493,11 @@ EXPORT_SYMBOL(phy_queue_state_machine);
 *
 * @phydev: the phy_device struct
 */
-static void phy_trigger_machine(struct phy_device *phydev)
+void phy_trigger_machine(struct phy_device *phydev)
 {
 phy_queue_state_machine(phydev, 0);
 }
+EXPORT_SYMBOL(phy_trigger_machine);

 static void phy_abort_cable_test(struct phy_device *phydev)
 {
@@ -924,7 +925,7 @@ void phy_stop_machine(struct phy_device *phydev)
 * Must not be called from interrupt context, or while the
 * phydev->lock is held.
 */
-static void phy_error(struct phy_device *phydev)
+void phy_error(struct phy_device *phydev)
 {
 WARN_ON(1);
@@ -934,6 +935,7 @@ static void phy_error(struct phy_device *phydev)
 phy_trigger_machine(phydev);
 }
+EXPORT_SYMBOL(phy_error);

 /**
 * phy_disable_interrupts - Disable the PHY interrupts from the PHY side
diff --git a/include/linux/phy.h b/include/linux/phy.h
index eb3cb1a98b45..566b39f6cd64 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1570,8 +1570,10 @@ void phy_drivers_unregister(struct phy_driver *drv, int n);
 int phy_driver_register(struct phy_driver *new_driver, struct module *owner);
 int phy_drivers_register(struct phy_driver *new_driver, int n,
 struct module *owner);
+void phy_error(struct phy_device *phydev);
 void phy_state_machine(struct work_struct *work);
 void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies);
+void phy_trigger_machine(struct phy_device *phydev);
 void phy_mac_interrupt(struct phy_device *phydev);
 void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
-- cgit
From 87de1f058aacc8ce4be94e36a38f77b860914a76 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei
Date: Sun, 1 Nov 2020 14:51:12 +0200
Subject: net: phy: add genphy_handle_interrupt_no_ack()

It seems there are cases where the interrupts are handled by another entity (i.e. an IRQ controller embedded inside the PHY) and do not need any other interaction from phylib. For this kind of PHY, like the RTL8366RB, add the genphy_handle_interrupt_no_ack() function which just triggers the link state machine.

Signed-off-by: Ioana Ciornei
Signed-off-by: Jakub Kicinski
---
 drivers/net/phy/phy_device.c | 13 +++++++++++++
 include/linux/phy.h | 1 +
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index f54f483d7fd6..e13a46c25437 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2463,6 +2463,19 @@ int genphy_soft_reset(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(genphy_soft_reset);

+irqreturn_t genphy_handle_interrupt_no_ack(struct phy_device *phydev)
+{
+ /* It seems there are cases where the interrupts are handled by another
+ * entity (i.e. an IRQ controller embedded inside the PHY) and do not
+ * need any other interaction from phylib. In this case, just trigger
+ * the state machine directly.
+ */
+ phy_trigger_machine(phydev);
+
+ return 0;
+}
+EXPORT_SYMBOL(genphy_handle_interrupt_no_ack);
+
 /**
 * genphy_read_abilities - read PHY abilities from Clause 22 registers
 * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 566b39f6cd64..4f158d6352ae 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1510,6 +1510,7 @@ int genphy_suspend(struct phy_device *phydev);
 int genphy_resume(struct phy_device *phydev);
 int genphy_loopback(struct phy_device *phydev, bool enable);
 int genphy_soft_reset(struct phy_device *phydev);
+irqreturn_t genphy_handle_interrupt_no_ack(struct phy_device *phydev);

 static inline int genphy_config_aneg(struct phy_device *phydev)
 {
-- cgit
From d8c4a22363853e196497b06a8ee9be9773873a14 Mon Sep 17 00:00:00 2001
From: Loic Poulain
Date: Tue, 3 Nov 2020 18:23:53 +0100
Subject: bus: mhi: Add mhi_queue_is_full function

This function can be used by a client driver to determine whether it's possible to queue new elements in a channel ring.
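As a rough sketch of the intended use (illustrative only; the names example_priv and example_xmit below are made up for this example and are not part of the patch), a network client driver could pair the helper with its TX path for flow control:

	#include <linux/mhi.h>
	#include <linux/netdevice.h>
	#include <linux/skbuff.h>

	struct example_priv {
		struct mhi_device *mhi_dev;	/* hypothetical client state */
	};

	static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *ndev)
	{
		struct example_priv *priv = netdev_priv(ndev);

		if (mhi_queue_skb(priv->mhi_dev, DMA_TO_DEVICE, skb, skb->len,
				  MHI_EOT)) {
			/* Could not queue the buffer, drop it */
			kfree_skb(skb);
			ndev->stats.tx_dropped++;
			return NETDEV_TX_OK;
		}

		/* No room for another element? Stop the queue until TX
		 * completions free up space in the channel ring again.
		 */
		if (mhi_queue_is_full(priv->mhi_dev, DMA_TO_DEVICE))
			netif_stop_queue(ndev);

		return NETDEV_TX_OK;
	}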
Signed-off-by: Loic Poulain
Reviewed-by: Manivannan Sadhasivam
Link: https://lore.kernel.org/r/1604424234-24446-1-git-send-email-loic.poulain@linaro.org
Signed-off-by: Jakub Kicinski
---
 drivers/bus/mhi/core/main.c | 11 +++++++++++
 include/linux/mhi.h | 7 +++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c
index 2cff5ddff225..ba9e721d61b7 100644
--- a/drivers/bus/mhi/core/main.c
+++ b/drivers/bus/mhi/core/main.c
@@ -1165,6 +1165,17 @@ int mhi_queue_buf(struct mhi_device *mhi_dev, enum dma_data_direction dir,
 }
 EXPORT_SYMBOL_GPL(mhi_queue_buf);

+bool mhi_queue_is_full(struct mhi_device *mhi_dev, enum dma_data_direction dir)
+{
+ struct mhi_controller *mhi_cntrl = mhi_dev->mhi_cntrl;
+ struct mhi_chan *mhi_chan = (dir == DMA_TO_DEVICE) ?
+ mhi_dev->ul_chan : mhi_dev->dl_chan;
+ struct mhi_ring *tre_ring = &mhi_chan->tre_ring;
+
+ return mhi_is_ring_full(mhi_cntrl, tre_ring);
+}
+EXPORT_SYMBOL_GPL(mhi_queue_is_full);
+
 int mhi_send_cmd(struct mhi_controller *mhi_cntrl, struct mhi_chan *mhi_chan,
 enum mhi_cmd_type cmd)
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index d4841e5a5f45..b9bb7dff32e2 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -737,4 +737,11 @@ int mhi_queue_buf(struct mhi_device *mhi_dev, enum dma_data_direction dir,
 int mhi_queue_skb(struct mhi_device *mhi_dev, enum dma_data_direction dir,
 struct sk_buff *skb, size_t len, enum mhi_flags mflags);

+/**
+ * mhi_queue_is_full - Determine whether queueing new elements is possible
+ * @mhi_dev: Device associated with the channels
+ * @dir: DMA direction for the channel
+ */
+bool mhi_queue_is_full(struct mhi_device *mhi_dev, enum dma_data_direction dir);
+
 #endif /* _MHI_H_ */
-- cgit
From c1aedf015ebdd0232757a66e2daccf1246bd609c Mon Sep 17 00:00:00 2001
From: Hayes Wang
Date: Wed, 4 Nov 2020 10:19:22 +0800
Subject: net/usb/r8153_ecm: support ECM mode for RTL8153

Support ECM mode based on cdc_ether with the relevant MII functions, when CONFIG_USB_RTL8152 is not set, or the device is not supported by the r8152 driver.

Both r8152 and r8153_ecm check the return value of rtl8152_get_version() in probe(). If rtl8152_get_version() returns a non-zero value, the r8152 driver is used and the device runs in vendor mode. Otherwise, the r8153_ecm driver is used and the device runs in ECM mode.
Signed-off-by: Hayes Wang Link: https://lore.kernel.org/r/1394712342-15778-392-Taiwan-albertk@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/Makefile | 2 +- drivers/net/usb/r8152.c | 30 ++------ drivers/net/usb/r8153_ecm.c | 162 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/usb/r8152.h | 37 ++++++++++ 4 files changed, 204 insertions(+), 27 deletions(-) create mode 100644 drivers/net/usb/r8153_ecm.c create mode 100644 include/linux/usb/r8152.h (limited to 'include/linux') diff --git a/drivers/net/usb/Makefile b/drivers/net/usb/Makefile index 99fd12be2111..99381e6bea78 100644 --- a/drivers/net/usb/Makefile +++ b/drivers/net/usb/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_USB_LAN78XX) += lan78xx.o obj-$(CONFIG_USB_NET_AX8817X) += asix.o asix-y := asix_devices.o asix_common.o ax88172a.o obj-$(CONFIG_USB_NET_AX88179_178A) += ax88179_178a.o -obj-$(CONFIG_USB_NET_CDCETHER) += cdc_ether.o +obj-$(CONFIG_USB_NET_CDCETHER) += cdc_ether.o r8153_ecm.o obj-$(CONFIG_USB_NET_CDC_EEM) += cdc_eem.o obj-$(CONFIG_USB_NET_DM9601) += dm9601.o obj-$(CONFIG_USB_NET_SR9700) += sr9700.o diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index b9b3d19a2e98..c448d6089821 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -26,6 +26,7 @@ #include #include #include +#include /* Information for net-next */ #define NETNEXT_VERSION "11" @@ -653,18 +654,6 @@ enum rtl_register_content { #define INTR_LINK 0x0004 -#define RTL8152_REQT_READ 0xc0 -#define RTL8152_REQT_WRITE 0x40 -#define RTL8152_REQ_GET_REGS 0x05 -#define RTL8152_REQ_SET_REGS 0x05 - -#define BYTE_EN_DWORD 0xff -#define BYTE_EN_WORD 0x33 -#define BYTE_EN_BYTE 0x11 -#define BYTE_EN_SIX_BYTES 0x3f -#define BYTE_EN_START_MASK 0x0f -#define BYTE_EN_END_MASK 0xf0 - #define RTL8153_MAX_PACKET 9216 /* 9K */ #define RTL8153_MAX_MTU (RTL8153_MAX_PACKET - VLAN_ETH_HLEN - \ ETH_FCS_LEN) @@ -689,21 +678,9 @@ enum rtl8152_flags { LENOVO_MACPASSTHRU, }; -/* Define these values to match your device */ -#define VENDOR_ID_REALTEK 0x0bda -#define VENDOR_ID_MICROSOFT 0x045e -#define VENDOR_ID_SAMSUNG 0x04e8 -#define VENDOR_ID_LENOVO 0x17ef -#define VENDOR_ID_LINKSYS 0x13b1 -#define VENDOR_ID_NVIDIA 0x0955 -#define VENDOR_ID_TPLINK 0x2357 - #define DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2 0x3082 #define DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2 0xa387 -#define MCU_TYPE_PLA 0x0100 -#define MCU_TYPE_USB 0x0000 - struct tally_counter { __le64 tx_packets; __le64 rx_packets; @@ -6621,7 +6598,7 @@ static int rtl_fw_init(struct r8152 *tp) return 0; } -static u8 rtl_get_version(struct usb_interface *intf) +u8 rtl8152_get_version(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); u32 ocp_data = 0; @@ -6679,12 +6656,13 @@ static u8 rtl_get_version(struct usb_interface *intf) return version; } +EXPORT_SYMBOL_GPL(rtl8152_get_version); static int rtl8152_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); - u8 version = rtl_get_version(intf); + u8 version = rtl8152_get_version(intf); struct r8152 *tp; struct net_device *netdev; int ret; diff --git a/drivers/net/usb/r8153_ecm.c b/drivers/net/usb/r8153_ecm.c new file mode 100644 index 000000000000..2c3fabd38b16 --- /dev/null +++ b/drivers/net/usb/r8153_ecm.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include +#include + +#define OCP_BASE 0xe86c + +static int pla_read_word(struct usbnet *dev, u16 index) +{ + u16 byen = 
BYTE_EN_WORD; + u8 shift = index & 2; + __le32 tmp; + int ret; + + if (shift) + byen <<= shift; + + index &= ~3; + + ret = usbnet_read_cmd(dev, RTL8152_REQ_GET_REGS, RTL8152_REQT_READ, index, + MCU_TYPE_PLA | byen, &tmp, sizeof(tmp)); + if (ret < 0) + goto out; + + ret = __le32_to_cpu(tmp); + ret >>= (shift * 8); + ret &= 0xffff; + +out: + return ret; +} + +static int pla_write_word(struct usbnet *dev, u16 index, u32 data) +{ + u32 mask = 0xffff; + u16 byen = BYTE_EN_WORD; + u8 shift = index & 2; + __le32 tmp; + int ret; + + data &= mask; + + if (shift) { + byen <<= shift; + mask <<= (shift * 8); + data <<= (shift * 8); + } + + index &= ~3; + + ret = usbnet_read_cmd(dev, RTL8152_REQ_GET_REGS, RTL8152_REQT_READ, index, + MCU_TYPE_PLA | byen, &tmp, sizeof(tmp)); + + if (ret < 0) + goto out; + + data |= __le32_to_cpu(tmp) & ~mask; + tmp = __cpu_to_le32(data); + + ret = usbnet_write_cmd(dev, RTL8152_REQ_SET_REGS, RTL8152_REQT_WRITE, index, + MCU_TYPE_PLA | byen, &tmp, sizeof(tmp)); + +out: + return ret; +} + +static int r8153_ecm_mdio_read(struct net_device *netdev, int phy_id, int reg) +{ + struct usbnet *dev = netdev_priv(netdev); + int ret; + + ret = pla_write_word(dev, OCP_BASE, 0xa000); + if (ret < 0) + goto out; + + ret = pla_read_word(dev, 0xb400 + reg * 2); + +out: + return ret; +} + +static void r8153_ecm_mdio_write(struct net_device *netdev, int phy_id, int reg, int val) +{ + struct usbnet *dev = netdev_priv(netdev); + int ret; + + ret = pla_write_word(dev, OCP_BASE, 0xa000); + if (ret < 0) + return; + + ret = pla_write_word(dev, 0xb400 + reg * 2, val); +} + +static int r8153_bind(struct usbnet *dev, struct usb_interface *intf) +{ + int status; + + status = usbnet_cdc_bind(dev, intf); + if (status < 0) + return status; + + dev->mii.dev = dev->net; + dev->mii.mdio_read = r8153_ecm_mdio_read; + dev->mii.mdio_write = r8153_ecm_mdio_write; + dev->mii.reg_num_mask = 0x1f; + dev->mii.supports_gmii = 1; + + return status; +} + +static const struct driver_info r8153_info = { + .description = "RTL8153 ECM Device", + .flags = FLAG_ETHER, + .bind = r8153_bind, + .unbind = usbnet_cdc_unbind, + .status = usbnet_cdc_status, + .manage_power = usbnet_manage_power, +}; + +static const struct usb_device_id products[] = { +{ + USB_DEVICE_AND_INTERFACE_INFO(VENDOR_ID_REALTEK, 0x8153, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&r8153_info, +}, + + { }, /* END */ +}; +MODULE_DEVICE_TABLE(usb, products); + +static int rtl8153_ecm_probe(struct usb_interface *intf, + const struct usb_device_id *id) +{ +#if IS_REACHABLE(CONFIG_USB_RTL8152) + if (rtl8152_get_version(intf)) + return -ENODEV; +#endif + + return usbnet_probe(intf, id); +} + +static struct usb_driver r8153_ecm_driver = { + .name = "r8153_ecm", + .id_table = products, + .probe = rtl8153_ecm_probe, + .disconnect = usbnet_disconnect, + .suspend = usbnet_suspend, + .resume = usbnet_resume, + .reset_resume = usbnet_resume, + .supports_autosuspend = 1, + .disable_hub_initiated_lpm = 1, +}; + +module_usb_driver(r8153_ecm_driver); + +MODULE_AUTHOR("Hayes Wang"); +MODULE_DESCRIPTION("Realtek USB ECM device"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h new file mode 100644 index 000000000000..20d88b1defc3 --- /dev/null +++ b/include/linux/usb/r8152.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2020 Realtek Semiconductor Corp. All rights reserved. 
+ */ + +#ifndef __LINUX_R8152_H +#define __LINUX_R8152_H + +#define RTL8152_REQT_READ 0xc0 +#define RTL8152_REQT_WRITE 0x40 +#define RTL8152_REQ_GET_REGS 0x05 +#define RTL8152_REQ_SET_REGS 0x05 + +#define BYTE_EN_DWORD 0xff +#define BYTE_EN_WORD 0x33 +#define BYTE_EN_BYTE 0x11 +#define BYTE_EN_SIX_BYTES 0x3f +#define BYTE_EN_START_MASK 0x0f +#define BYTE_EN_END_MASK 0xf0 + +#define MCU_TYPE_PLA 0x0100 +#define MCU_TYPE_USB 0x0000 + +/* Define these values to match your device */ +#define VENDOR_ID_REALTEK 0x0bda +#define VENDOR_ID_MICROSOFT 0x045e +#define VENDOR_ID_SAMSUNG 0x04e8 +#define VENDOR_ID_LENOVO 0x17ef +#define VENDOR_ID_LINKSYS 0x13b1 +#define VENDOR_ID_NVIDIA 0x0955 +#define VENDOR_ID_TPLINK 0x2357 + +#if IS_REACHABLE(CONFIG_USB_RTL8152) +extern u8 rtl8152_get_version(struct usb_interface *intf); +#endif + +#endif /* __LINUX_R8152_H */ -- cgit From 8280c07e0762ba753876c427584a792e86f3f7e7 Mon Sep 17 00:00:00 2001 From: Kurt Lee Date: Mon, 12 Oct 2020 03:43:46 -0500 Subject: ieee80211: Add definition for WFA DPP Add Wi-Fi Alliance definition for DPP (Device Provisioning Protocol). Signed-off-by: Kurt Lee Signed-off-by: Wright Feng Link: https://lore.kernel.org/r/20201012084347.121557-2-wright.feng@cypress.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 770408b2fdaf..5e8cc9c3d45a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3417,6 +3417,8 @@ struct ieee80211_multiple_bssid_configuration { #define WLAN_AKM_SUITE_FT_PSK_SHA384 SUITE(0x000FAC, 19) #define WLAN_AKM_SUITE_PSK_SHA384 SUITE(0x000FAC, 20) +#define WLAN_AKM_SUITE_WFA_DPP SUITE(WLAN_OUI_WFA, 2) + #define WLAN_MAX_KEY_LEN 32 #define WLAN_PMK_NAME_LEN 16 @@ -3427,6 +3429,7 @@ struct ieee80211_multiple_bssid_configuration { #define WLAN_OUI_WFA 0x506f9a #define WLAN_OUI_TYPE_WFA_P2P 9 +#define WLAN_OUI_TYPE_WFA_DPP 0x1A #define WLAN_OUI_MICROSOFT 0x0050f2 #define WLAN_OUI_TYPE_MICROSOFT_WPA 1 #define WLAN_OUI_TYPE_MICROSOFT_WMM 2 -- cgit From aec51036a166a0aecc26cba101bfbbdc53d64139 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 4 Nov 2020 19:35:17 +0000 Subject: tty: tty_io: Move 'tty_sysctl_init's prototype to shared space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 kernel build warning(s): drivers/tty/tty_ldisc.c:883:6: warning: no previous prototype for ‘tty_sysctl_init’ [-Wmissing-prototypes] 883 | void tty_sysctl_init(void) | ^~~~~~~~~~~~~~~ Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Nick Holloway Cc: -- Cc: Marko Kohtala Cc: Bill Hawes Cc: "C. 
Scott Ananian" Cc: Russell King Cc: Andrew Morton Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20201104193549.4026187-5-lee.jones@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 2 -- include/linux/tty.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 7a4c02548fb3..88b00c47b606 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -514,8 +514,6 @@ static const struct file_operations hung_up_tty_fops = { static DEFINE_SPINLOCK(redirect_lock); static struct file *redirect; -extern void tty_sysctl_init(void); - /** * tty_wakeup - request more data * @tty: terminal diff --git a/include/linux/tty.h b/include/linux/tty.h index a99e9b8e4e31..10212c6e4345 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -716,6 +716,7 @@ extern int __must_check tty_ldisc_init(struct tty_struct *tty); extern void tty_ldisc_deinit(struct tty_struct *tty); extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p, char *f, int count); +extern void tty_sysctl_init(void); /* n_tty.c */ extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops); -- cgit From 0264c8c9e1b53e9dbb41fae5e54756e84644bc60 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:36 -0500 Subject: ftrace: Move the recursion testing into global headers Currently, if a callback is registered to a ftrace function and its ftrace_ops does not have the RECURSION flag set, it is encapsulated in a helper function that does the recursion for it. Really, all the callbacks should have their own recursion protection for performance reasons. But they should not all implement their own. Move the recursion helpers to global headers, so that all callbacks can use them. Link: https://lkml.kernel.org/r/20201028115612.460535535@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.166456258@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 1 + include/linux/trace_recursion.h | 187 ++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 177 ------------------------------------- 3 files changed, 188 insertions(+), 177 deletions(-) create mode 100644 include/linux/trace_recursion.h (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1bd3a0356ae4..0e4164a7f56d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -7,6 +7,7 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H +#include #include #include #include diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h new file mode 100644 index 000000000000..dbb7b6d4c94c --- /dev/null +++ b/include/linux/trace_recursion.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TRACE_RECURSION_H +#define _LINUX_TRACE_RECURSION_H + +#include +#include + +#ifdef CONFIG_TRACING + +/* Only current can touch trace_recursion */ + +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. 
+ * + * Now if the arch does not support a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ +enum { + /* Function recursion bits */ + TRACE_FTRACE_BIT, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + + /* INTERNAL_BITs must be greater than FTRACE_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_BRANCH_BIT, +/* + * Abuse of the trace_recursion. + * As we need a way to maintain state if we are tracing the function + * graph in irq because we want to trace a particular function that + * was called in irq context but we have irq tracing off. Since this + * can only be modified by current, we can reuse trace_recursion. + */ + TRACE_IRQ_BIT, + + /* Set if the function is in the set_graph_function file */ + TRACE_GRAPH_BIT, + + /* + * In the very unlikely case that an interrupt came in + * at a start of graph tracing, and we want to trace + * the function in that interrupt, the depth can be greater + * than zero, because of the preempted start of a previous + * trace. In an even more unlikely case, depth could be 2 + * if a softirq interrupted the start of graph tracing, + * followed by an interrupt preempting a start of graph + * tracing in the softirq, and depth can even be 3 + * if an NMI came in at the start of an interrupt function + * that preempted a softirq start of a function that + * preempted normal context!!!! Luckily, it can't be + * greater than 3, so the next two bits are a mask + * of what the depth is when we set TRACE_GRAPH_BIT + */ + + TRACE_GRAPH_DEPTH_START_BIT, + TRACE_GRAPH_DEPTH_END_BIT, + + /* + * To implement set_graph_notrace, if this bit is set, we ignore + * function graph tracing of called functions, until the return + * function is called to clear it. + */ + TRACE_GRAPH_NOTRACE_BIT, + + /* + * When transitioning between context, the preempt_count() may + * not be correct. Allow for a single recursion to cover this case. 
+ */ + TRACE_TRANSITION_BIT, +}; + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) + +#define trace_recursion_depth() \ + (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) +#define trace_recursion_set_depth(depth) \ + do { \ + current->trace_recursion &= \ + ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ + current->trace_recursion |= \ + ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ + } while (0) + +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) { + /* + * It could be that preempt_count has not been updated during + * a switch between contexts. Allow for a single recursion. + */ + bit = TRACE_TRANSITION_BIT; + if (trace_recursion_test(bit)) + return -1; + trace_recursion_set(bit); + barrier(); + return bit + 1; + } + + /* Normal check passed, clear the transition to allow it again */ + trace_recursion_clear(TRACE_TRANSITION_BIT); + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit + 1; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; + + if (!bit) + return; + + bit--; + bit = 1 << bit; + val &= ~bit; + + barrier(); + current->trace_recursion = val; +} + +#endif /* CONFIG_TRACING */ +#endif /* _LINUX_TRACE_RECURSION_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1dadef445cd1..9462251cab92 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -558,183 +558,6 @@ struct tracer { bool noboot; }; - -/* Only current can touch trace_recursion */ - -/* - * For function tracing recursion: - * The order of these bits are important. - * - * When function tracing occurs, the following steps are made: - * If arch does not support a ftrace feature: - * call internal function (uses INTERNAL bits) which calls... - * If callback is registered to the "global" list, the list - * function is called and recursion checks the GLOBAL bits. - * then this function calls... - * The function callback, which can use the FTRACE bits to - * check for recursion. - * - * Now if the arch does not support a feature, and it calls - * the global list function which calls the ftrace callback - * all three of these steps will do a recursion protection. - * There's no reason to do one if the previous caller already - * did. The recursion that we are protecting against will - * go through the same steps again. 
- * - * To prevent the multiple recursion checks, if a recursion - * bit is set that is higher than the MAX bit of the current - * check, then we know that the check was made by the previous - * caller, and we can skip the current check. - */ -enum { - /* Function recursion bits */ - TRACE_FTRACE_BIT, - TRACE_FTRACE_NMI_BIT, - TRACE_FTRACE_IRQ_BIT, - TRACE_FTRACE_SIRQ_BIT, - - /* INTERNAL_BITs must be greater than FTRACE_BITs */ - TRACE_INTERNAL_BIT, - TRACE_INTERNAL_NMI_BIT, - TRACE_INTERNAL_IRQ_BIT, - TRACE_INTERNAL_SIRQ_BIT, - - TRACE_BRANCH_BIT, -/* - * Abuse of the trace_recursion. - * As we need a way to maintain state if we are tracing the function - * graph in irq because we want to trace a particular function that - * was called in irq context but we have irq tracing off. Since this - * can only be modified by current, we can reuse trace_recursion. - */ - TRACE_IRQ_BIT, - - /* Set if the function is in the set_graph_function file */ - TRACE_GRAPH_BIT, - - /* - * In the very unlikely case that an interrupt came in - * at a start of graph tracing, and we want to trace - * the function in that interrupt, the depth can be greater - * than zero, because of the preempted start of a previous - * trace. In an even more unlikely case, depth could be 2 - * if a softirq interrupted the start of graph tracing, - * followed by an interrupt preempting a start of graph - * tracing in the softirq, and depth can even be 3 - * if an NMI came in at the start of an interrupt function - * that preempted a softirq start of a function that - * preempted normal context!!!! Luckily, it can't be - * greater than 3, so the next two bits are a mask - * of what the depth is when we set TRACE_GRAPH_BIT - */ - - TRACE_GRAPH_DEPTH_START_BIT, - TRACE_GRAPH_DEPTH_END_BIT, - - /* - * To implement set_graph_notrace, if this bit is set, we ignore - * function graph tracing of called functions, until the return - * function is called to clear it. - */ - TRACE_GRAPH_NOTRACE_BIT, - - /* - * When transitioning between context, the preempt_count() may - * not be correct. Allow for a single recursion to cover this case. 
- */ - TRACE_TRANSITION_BIT, -}; - -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) - -#define trace_recursion_depth() \ - (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) -#define trace_recursion_set_depth(depth) \ - do { \ - current->trace_recursion &= \ - ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ - current->trace_recursion |= \ - ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ - } while (0) - -#define TRACE_CONTEXT_BITS 4 - -#define TRACE_FTRACE_START TRACE_FTRACE_BIT -#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) - -#define TRACE_LIST_START TRACE_INTERNAL_BIT -#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) - -#define TRACE_CONTEXT_MASK TRACE_LIST_MAX - -static __always_inline int trace_get_context_bit(void) -{ - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = 0; - - else if (in_irq()) - bit = 1; - else - bit = 2; - } else - bit = 3; - - return bit; -} - -static __always_inline int trace_test_and_set_recursion(int start, int max) -{ - unsigned int val = current->trace_recursion; - int bit; - - /* A previous recursion check was made */ - if ((val & TRACE_CONTEXT_MASK) > max) - return 0; - - bit = trace_get_context_bit() + start; - if (unlikely(val & (1 << bit))) { - /* - * It could be that preempt_count has not been updated during - * a switch between contexts. Allow for a single recursion. - */ - bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) - return -1; - trace_recursion_set(bit); - barrier(); - return bit + 1; - } - - /* Normal check passed, clear the transition to allow it again */ - trace_recursion_clear(TRACE_TRANSITION_BIT); - - val |= 1 << bit; - current->trace_recursion = val; - barrier(); - - return bit + 1; -} - -static __always_inline void trace_clear_recursion(int bit) -{ - unsigned int val = current->trace_recursion; - - if (!bit) - return; - - bit--; - bit = 1 << bit; - val &= ~bit; - - barrier(); - current->trace_recursion = val; -} - static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { -- cgit From 6e4eb9cb22fc8a893cb708ed42644de5ee7c3827 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:37 -0500 Subject: ftrace: Add ftrace_test_recursion_trylock() helper function To make it easier for ftrace callbacks to have recursion protection, provide a ftrace_test_recursion_trylock() and ftrace_test_recursion_unlock() helper that tests for recursion. Link: https://lkml.kernel.org/r/20201028115612.634927593@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.378584067@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 25 +++++++++++++++++++++++++ kernel/trace/trace_functions.c | 12 +++++------- 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index dbb7b6d4c94c..f2a949dbfec7 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -183,5 +183,30 @@ static __always_inline void trace_clear_recursion(int bit) current->trace_recursion = val; } +/** + * ftrace_test_recursion_trylock - tests for recursion in same context + * + * Use this for ftrace callbacks. 
This will detect if the function
+ * tracing recursed in the same context (normal vs interrupt).
+ *
+ * Returns: -1 if a recursion happened.
+ * >= 0 if no recursion
+ */
+static __always_inline int ftrace_test_recursion_trylock(void)
+{
+ return trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
+}
+
+/**
+ * ftrace_test_recursion_unlock - called when function callback is complete
+ * @bit: The return of a successful ftrace_test_recursion_trylock()
+ *
+ * This is used at the end of a ftrace callback.
+ */
+static __always_inline void ftrace_test_recursion_unlock(int bit)
+{
+ trace_clear_recursion(bit);
+}
+
 #endif /* CONFIG_TRACING */
 #endif /* _LINUX_TRACE_RECURSION_H */
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 2c2126e1871d..943756c01190 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -141,22 +141,20 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 if (unlikely(!tr->function_enabled))
 return;

+ bit = ftrace_test_recursion_trylock();
+ if (bit < 0)
+ return;
+
 pc = preempt_count();
 preempt_disable_notrace();

- bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
- if (bit < 0)
- goto out;
-
 cpu = smp_processor_id();
 data = per_cpu_ptr(tr->array_buffer.data, cpu);
 if (!atomic_read(&data->disabled)) {
 local_save_flags(flags);
 trace_function(tr, ip, parent_ip, flags, pc);
 }
- trace_clear_recursion(bit);
-
- out:
+ ftrace_test_recursion_unlock(bit);
 preempt_enable_notrace();
 }
-- cgit
From da5afbeb1724609996ca7bb4fbce2cd104c95914 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Thu, 5 Nov 2020 21:32:38 -0500
Subject: ftrace: Optimize testing what context current is in

The preempt_count() is not a simple location in memory, it could be part of per_cpu code or more. Each access to preempt_count(), or one of its accessor functions (like in_interrupt()), takes several cycles. Reading preempt_count() once and then testing the returned value to determine the context is slightly faster than using in_nmi() and in_interrupt().

Link: https://lkml.kernel.org/r/20201028115612.780796355@goodmis.org
Link: https://lkml.kernel.org/r/20201106023546.558881845@goodmis.org
Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/trace_recursion.h | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index f2a949dbfec7..ac3d73484cb2 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -117,22 +117,29 @@ enum {

 #define TRACE_CONTEXT_MASK TRACE_LIST_MAX

+/*
+ * Used for setting context
+ * NMI = 0
+ * IRQ = 1
+ * SOFTIRQ = 2
+ * NORMAL = 3
+ */
+enum {
+ TRACE_CTX_NMI,
+ TRACE_CTX_IRQ,
+ TRACE_CTX_SOFTIRQ,
+ TRACE_CTX_NORMAL,
+};
+
 static __always_inline int trace_get_context_bit(void)
 {
- int bit;
-
- if (in_interrupt()) {
- if (in_nmi())
- bit = 0;
-
- else if (in_irq())
- bit = 1;
- else
- bit = 2;
- } else
- bit = 3;
+ unsigned long pc = preempt_count();

- return bit;
+ if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+ return TRACE_CTX_NORMAL;
+ else
+ return pc & NMI_MASK ? TRACE_CTX_NMI :
+ pc & HARDIRQ_MASK ?
TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ;
 }

 static __always_inline int trace_test_and_set_recursion(int start, int max)
-- cgit
From a25d036d939a30623ff73ecad9c8b9116b02e823 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Thu, 5 Nov 2020 21:32:45 -0500
Subject: ftrace: Reverse what the RECURSION flag means in the ftrace_ops

Now that all callbacks are recursion safe, reverse the meaning of the RECURSION flag and rename it from RECURSION_SAFE to simply RECURSION. Now only callbacks that request recursion protection will have the added trampoline that provides it.

Also remove the outdated comment about "PER_CPU" when determining to use the ftrace_ops_assist_func.

Link: https://lkml.kernel.org/r/20201028115613.742454631@goodmis.org
Link: https://lkml.kernel.org/r/20201106023547.904270143@goodmis.org
Cc: Peter Zijlstra
Cc: Ingo Molnar
Cc: Josh Poimboeuf
Cc: Jiri Kosina
Cc: Masami Hiramatsu
Cc: Andrew Morton
Cc: Jonathan Corbet
Cc: Sebastian Andrzej Siewior
Cc: Miroslav Benes
Cc: Kamalesh Babulal
Cc: Petr Mladek
Cc: linux-doc@vger.kernel.org
Signed-off-by: Steven Rostedt (VMware)
---
 Documentation/trace/ftrace-uses.rst | 82 +++++++++++++++++++++++++++----------
 include/linux/ftrace.h | 12 +++---
 kernel/trace/fgraph.c | 3 +-
 kernel/trace/ftrace.c | 20 ++++-----
 kernel/trace/trace_events.c | 1 -
 kernel/trace/trace_functions.c | 2 +-
 kernel/trace/trace_selftest.c | 7 +---
 kernel/trace/trace_stack.c | 1 -
 8 files changed, 79 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst
index a4955f7e3d19..86cd14b8e126 100644
--- a/Documentation/trace/ftrace-uses.rst
+++ b/Documentation/trace/ftrace-uses.rst
@@ -30,8 +30,8 @@ The ftrace context
 This requires extra care to what can be done inside a callback. A callback
 can be called outside the protective scope of RCU.

-The ftrace infrastructure has some protections against recursions and RCU
-but one must still be very careful how they use the callbacks.
+There are helper functions to help against recursion, and for making sure
+RCU is watching. These are explained below.

 The ftrace_ops structure
@@ -108,6 +108,50 @@ The prototype of the callback function is as follows (as of v4.14):
 at the start of the function where ftrace was tracing. Otherwise it
 either contains garbage, or NULL.

+Protect your callback
+=====================
+
+As functions can be called from anywhere, and it is possible that a function
+called by a callback may also be traced, and call that same callback,
+recursion protection must be used. There are two helper functions that
+can help in this regard. If you start your code with:
+
+ int bit;
+
+ bit = ftrace_test_recursion_trylock();
+ if (bit < 0)
+ return;
+
+and end it with:
+
+ ftrace_test_recursion_unlock(bit);
+
+The code in between will be safe to use, even if it ends up calling a
+function that the callback is tracing. Note, on success,
+ftrace_test_recursion_trylock() will disable preemption, and the
+ftrace_test_recursion_unlock() will enable it again (if it was previously
+enabled).
+
+Alternatively, if the FTRACE_OPS_FL_RECURSION flag is set on the ftrace_ops
+(as explained below), then a helper trampoline will be used to test
+for recursion for the callback and no recursion test needs to be done.
+But this is at the expense of slightly more overhead from an extra
+function call.
+
+If your callback accesses any data or critical section that requires RCU
+protection, it is best to make sure that RCU is "watching", otherwise
+that data or critical section will not be protected as expected. In this
+case add:
+
+ if (!rcu_is_watching())
+ return;
+
+Alternatively, if the FTRACE_OPS_FL_RCU flag is set on the ftrace_ops
+(as explained below), then a helper trampoline will be used to test
+for rcu_is_watching for the callback and no other test needs to be done.
+But this is at the expense of slightly more overhead from an extra
+function call.
+
 The ftrace FLAGS
 ================
@@ -128,26 +172,20 @@ FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED
 will not fail with this flag set. But the callback must check if
 regs is NULL or not to determine if the architecture supports it.

-FTRACE_OPS_FL_RECURSION_SAFE
- By default, a wrapper is added around the callback to
- make sure that recursion of the function does not occur. That is,
- if a function that is called as a result of the callback's execution
- is also traced, ftrace will prevent the callback from being called
- again. But this wrapper adds some overhead, and if the callback is
- safe from recursion, it can set this flag to disable the ftrace
- protection.
-
- Note, if this flag is set, and recursion does occur, it could cause
- the system to crash, and possibly reboot via a triple fault.
-
- It is OK if another callback traces a function that is called by a
- callback that is marked recursion safe. Recursion safe callbacks
- must never trace any function that are called by the callback
- itself or any nested functions that those functions call.
-
- If this flag is set, it is possible that the callback will also
- be called with preemption enabled (when CONFIG_PREEMPTION is set),
- but this is not guaranteed.
+FTRACE_OPS_FL_RECURSION
+ By default, it is expected that the callback can handle recursion.
+ But if the callback is not that worried about overhead, then
+ setting this bit will add the recursion protection around the
+ callback by calling a helper function that will do the recursion
+ protection and only call the callback if it did not recurse.
+
+ Note, if this flag is not set, and recursion does occur, it could
+ cause the system to crash, and possibly reboot via a triple fault.
+
+ Note, if this flag is set, then the callback will always be called
+ with preemption disabled. If it is not set, then it is possible
+ (but not guaranteed) that the callback will be called in
+ preemptable context.

 FTRACE_OPS_FL_IPMODIFY
 Requires FTRACE_OPS_FL_SAVE_REGS set. If the callback is to "hijack"
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 0e4164a7f56d..806196345c3f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -98,7 +98,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
 /*
 * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are
 * set in the flags member.
- * CONTROL, SAVE_REGS, SAVE_REGS_IF_SUPPORTED, RECURSION_SAFE, STUB and
+ * CONTROL, SAVE_REGS, SAVE_REGS_IF_SUPPORTED, RECURSION, STUB and
 * IPMODIFY are a kind of attribute flags which can be set only before
 * registering the ftrace_ops, and can not be modified while registered.
 * Changing those attribute flags after registering ftrace_ops will
@@ -121,10 +121,10 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
 * passing regs to the handler.
 * Note, if this flag is set, the SAVE_REGS flag will automatically
 * get set upon registering the ftrace_ops, if the arch supports it.
- * RECURSION_SAFE - The ftrace_ops can set this to tell the ftrace infrastructure - * that the call back has its own recursion protection. If it does - * not set this, then the ftrace infrastructure will add recursion - * protection for the caller. + * RECURSION - The ftrace_ops can set this to tell the ftrace infrastructure + * that the call back needs recursion protection. If it does + * not set this, then the ftrace infrastructure will assume + * that the callback can handle recursion on its own. * STUB - The ftrace_ops is just a place holder. * INITIALIZED - The ftrace_ops has already been initialized (first use time * register_ftrace_function() is called, it will initialized the ops) @@ -156,7 +156,7 @@ enum { FTRACE_OPS_FL_DYNAMIC = BIT(1), FTRACE_OPS_FL_SAVE_REGS = BIT(2), FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = BIT(3), - FTRACE_OPS_FL_RECURSION_SAFE = BIT(4), + FTRACE_OPS_FL_RECURSION = BIT(4), FTRACE_OPS_FL_STUB = BIT(5), FTRACE_OPS_FL_INITIALIZED = BIT(6), FTRACE_OPS_FL_DELETED = BIT(7), diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5658f13037b3..73edb9e4f354 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -334,8 +334,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, static struct ftrace_ops graph_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID | FTRACE_OPS_FL_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8185f7240095..39f2bba89b76 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -80,7 +80,7 @@ enum { struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + .flags = FTRACE_OPS_FL_STUB, INIT_OPS_HASH(ftrace_list_end) }; @@ -866,7 +866,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + .flags = FTRACE_OPS_FL_INITIALIZED, INIT_OPS_HASH(ftrace_profile_ops) }; @@ -1040,8 +1040,7 @@ struct ftrace_ops global_ops = { .local_hash.notrace_hash = EMPTY_HASH, .local_hash.filter_hash = EMPTY_HASH, INIT_OPS_HASH(global_ops) - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID, }; @@ -2382,7 +2381,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops direct_ops = { .func = call_direct_funcs, - .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, /* @@ -6864,8 +6863,7 @@ void ftrace_init_trace_array(struct trace_array *tr) struct ftrace_ops global_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID, }; @@ -7023,11 +7021,11 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func); ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { /* - * If the function does not handle recursion, needs to be RCU safe, - * or does per cpu logic, then we need to call the assist handler. + * If the function does not handle recursion or needs to be RCU safe, + * then we need to call the assist handler. 
*/ - if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || - ops->flags & FTRACE_OPS_FL_RCU) + if (ops->flags & (FTRACE_OPS_FL_RECURSION | + FTRACE_OPS_FL_RCU)) return ftrace_ops_assist_func; return ops->func; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 47a71f96e5bc..244abbcd1db5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3712,7 +3712,6 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 943756c01190..89c414ce1388 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -48,7 +48,7 @@ int ftrace_allocate_ftrace_ops(struct trace_array *tr) /* Currently only the non stack version is supported */ ops->func = function_trace_call; - ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; + ops->flags = FTRACE_OPS_FL_PID; tr->ops = ops; ops->private = tr; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 4738ad48a667..8ee3c0bb5d8a 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -150,17 +150,14 @@ static void trace_selftest_test_dyn_func(unsigned long ip, static struct ftrace_ops test_probe1 = { .func = trace_selftest_test_probe1_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe2 = { .func = trace_selftest_test_probe2_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe3 = { .func = trace_selftest_test_probe3_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static void print_counts(void) @@ -448,11 +445,11 @@ static void trace_selftest_test_recursion_safe_func(unsigned long ip, static struct ftrace_ops test_rec_probe = { .func = trace_selftest_test_recursion_func, + .flags = FTRACE_OPS_FL_RECURSION, }; static struct ftrace_ops test_recsafe_probe = { .func = trace_selftest_test_recursion_safe_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int @@ -561,7 +558,7 @@ static void trace_selftest_test_regs_func(unsigned long ip, static struct ftrace_ops test_regs_probe = { .func = trace_selftest_test_regs_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, + .flags = FTRACE_OPS_FL_SAVE_REGS, }; static int diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c408423e5d65..969db526a563 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -318,7 +318,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static ssize_t -- cgit From 773c16705058e9be7b0f4ce124e89cd231c120a2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:46 -0500 Subject: ftrace: Add recording of functions that caused recursion This adds CONFIG_FTRACE_RECORD_RECURSION that will record to a file "recursed_functions" all the functions that caused recursion while a callback to the function tracer was running. Link: https://lkml.kernel.org/r/20201106023548.102375687@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Guo Ren Cc: "James E.J. 
Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Thomas Gleixner Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: Kees Cook Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Joe Lawrence Cc: Kamalesh Babulal Cc: Mauro Carvalho Chehab Cc: Sebastian Andrzej Siewior Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-csky@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Cc: live-patching@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 6 +- arch/csky/kernel/probes/ftrace.c | 2 +- arch/parisc/kernel/ftrace.c | 2 +- arch/powerpc/kernel/kprobes-ftrace.c | 2 +- arch/s390/kernel/ftrace.c | 2 +- arch/x86/kernel/kprobes/ftrace.c | 2 +- fs/pstore/ftrace.c | 2 +- include/linux/trace_recursion.h | 29 ++++- kernel/livepatch/patch.c | 2 +- kernel/trace/Kconfig | 25 ++++ kernel/trace/Makefile | 1 + kernel/trace/ftrace.c | 4 +- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_output.c | 6 +- kernel/trace/trace_output.h | 1 + kernel/trace/trace_recursion_record.c | 236 ++++++++++++++++++++++++++++++++++ 17 files changed, 306 insertions(+), 20 deletions(-) create mode 100644 kernel/trace/trace_recursion_record.c (limited to 'include/linux') diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 86cd14b8e126..5981d5691745 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -118,7 +118,7 @@ can help in this regard. If you start your code with: int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; @@ -130,7 +130,9 @@ The code in between will be safe to use, even if it ends up calling a function that the callback is tracing. Note, on success, ftrace_test_recursion_trylock() will disable preemption, and the ftrace_test_recursion_unlock() will enable it again (if it was previously -enabled). +enabled). The instruction pointer (ip) and its parent (parent_ip) is passed to +ftrace_test_recursion_trylock() to record where the recursion happened +(if CONFIG_FTRACE_RECORD_RECURSION is set). 
Alternatively, if the FTRACE_OPS_FL_RECURSION flag is set on the ftrace_ops (as explained below), then a helper trampoline will be used to test diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index 5eb2604fdf71..f30b179924ef 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -18,7 +18,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; struct kprobe_ctlblk *kcb; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 13d85042810a..1c5d3732bda2 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -210,7 +210,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 5df8d50c65ae..fdfee39938ea 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -20,7 +20,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct kprobe_ctlblk *kcb; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 8f31c726537a..657c1ab45408 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -204,7 +204,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index a40a6cdfcca3..954d930a7127 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -20,7 +20,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 816210fc5d3a..adb0935eb062 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -41,7 +41,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, if (unlikely(oops_in_progress)) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index ac3d73484cb2..228cc56ed66e 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -91,6 +91,9 @@ enum { * not be correct. Allow for a single recursion to cover this case. */ TRACE_TRANSITION_BIT, + + /* Used to prevent recursion recording from recursing. */ + TRACE_RECORD_RECURSION_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -142,7 +145,22 @@ static __always_inline int trace_get_context_bit(void) pc & HARDIRQ_MASK ? 
TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ; } -static __always_inline int trace_test_and_set_recursion(int start, int max) +#ifdef CONFIG_FTRACE_RECORD_RECURSION +extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); +# define do_ftrace_record_recursion(ip, pip) \ + do { \ + if (!trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \ + trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \ + ftrace_record_recursion(ip, pip); \ + trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \ + } \ + } while (0) +#else +# define do_ftrace_record_recursion(ip, pip) do { } while (0) +#endif + +static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, + int start, int max) { unsigned int val = current->trace_recursion; int bit; @@ -158,8 +176,10 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) * a switch between contexts. Allow for a single recursion. */ bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) + if (trace_recursion_test(bit)) { + do_ftrace_record_recursion(ip, pip); return -1; + } trace_recursion_set(bit); barrier(); return bit + 1; @@ -199,9 +219,10 @@ static __always_inline void trace_clear_recursion(int bit) * Returns: -1 if a recursion happened. * >= 0 if no recursion */ -static __always_inline int ftrace_test_recursion_trylock(void) +static __always_inline int ftrace_test_recursion_trylock(unsigned long ip, + unsigned long parent_ip) { - return trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, TRACE_FTRACE_MAX); } /** diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 15480bf3ce88..875c5dbbdd33 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -49,7 +49,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (WARN_ON_ONCE(bit < 0)) return; /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508..9b11c096d139 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -727,6 +727,31 @@ config TRACE_EVAL_MAP_FILE If unsure, say N. +config FTRACE_RECORD_RECURSION + bool "Record functions that recurse in function tracing" + depends on FUNCTION_TRACER + help + All callbacks that attach to function tracing have some sort + of protection against recursion. Even though the protection exists, + it adds overhead. This option will create a file in the tracefs + file system called "recursed_functions" that will list the functions + that triggered a recursion. + + This will add more overhead to cases that have recursion. + + If unsure, say N. + +config FTRACE_RECORD_RECURSION_SIZE + int "Max number of recursed functions to record" + default 128 + depends on FTRACE_RECORD_RECURSION + help + This defines the limit on the number of functions that can be + listed in the "recursed_functions" file, which lists all + the functions that caused a recursion to happen. + This file can be reset, but the limit cannot be changed + at runtime.
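
As a usage sketch only: assuming tracefs is mounted at /sys/kernel/tracing (adjust the path if it is mounted elsewhere), a small userspace program can dump the recorded entries and reset the file by opening it with O_TRUNC, which is what the file_operations added later in this patch implement:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd;

		/* each entry reads as "parent_function:<tab>recursed_function" */
		fd = open("/sys/kernel/tracing/recursed_functions", O_RDONLY);
		if (fd < 0)
			return 1;
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			write(STDOUT_FILENO, buf, n);
		close(fd);

		/* opening for write with O_TRUNC clears the records */
		fd = open("/sys/kernel/tracing/recursed_functions", O_WRONLY | O_TRUNC);
		if (fd >= 0)
			close(fd);
		return 0;
	}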
+ config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e153be351548..7e44cea89fdc 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o +obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 39f2bba89b76..03aad2b5cd5e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6918,7 +6918,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; @@ -6993,7 +6993,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a2b9fddb8148..1b202e28dfaa 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -447,7 +447,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, if ((unsigned long)ops->private != smp_processor_id()) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 89c414ce1388..646eda6c44a5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -141,7 +141,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (unlikely(!tr->function_enabled)) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 000e9dc224c6..92b1575ae0ca 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -353,8 +353,8 @@ static inline const char *kretprobed(const char *name) } #endif /* CONFIG_KRETPROBES */ -static void -seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) +void +trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; @@ -420,7 +420,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); + trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); if (sym_flags & TRACE_ITER_SYM_ADDR) trace_seq_printf(s, " <" IP_FMT ">", ip); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 2f742b74e7e6..4c954636caf0 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,6 +16,7 @@ extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); +extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator 
*iter); diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c new file mode 100644 index 000000000000..b2edac1fe156 --- /dev/null +++ b/kernel/trace/trace_recursion_record.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "trace_output.h" + +struct recursed_functions { + unsigned long ip; + unsigned long parent_ip; +}; + +static struct recursed_functions recursed_functions[CONFIG_FTRACE_RECORD_RECURSION_SIZE]; +static atomic_t nr_records; + +/* + * Cache the last found function. Yes, updates to this are racy, but + * so is the memory cache ;-) + */ +static unsigned long cached_function; + +void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip) +{ + int index = 0; + int i; + unsigned long old; + + again: + /* First check the last one recorded */ + if (ip == cached_function) + return; + + i = atomic_read(&nr_records); + /* nr_records is -1 when clearing records */ + smp_mb__after_atomic(); + if (i < 0) + return; + + /* + * If there are two writers and this writer comes in second, + * the cmpxchg() below to update the ip will fail. Then this + * writer will try again. It is possible that index will now + * be greater than nr_records. This is because the writer + * that succeeded has not updated the nr_records yet. + * This writer could keep trying again until the other writer + * updates nr_records. But if the other writer takes an + * interrupt, and that interrupt locks up that CPU, we do + * not want this CPU to lock up due to the recursion protection, + * and have a bug report showing this CPU as the cause of + * locking up the computer. To not lose this record, this + * writer will simply use the next position to update the + * recursed_functions, and it will update the nr_records + * accordingly. + */ + if (index < i) + index = i; + if (index >= CONFIG_FTRACE_RECORD_RECURSION_SIZE) + return; + + for (i = index - 1; i >= 0; i--) { + if (recursed_functions[i].ip == ip) { + cached_function = ip; + return; + } + } + + cached_function = ip; + + /* + * We only want to add a function if it hasn't been added before. + * Add to the current location before incrementing the count. + * If it fails to add, then increment the index (save in i) + * and try again. + */ + old = cmpxchg(&recursed_functions[index].ip, 0, ip); + if (old != 0) { + /* Did something else already add this for us? */ + if (old == ip) + return; + /* Try the next location (use i for the next index) */ + index++; + goto again; + } + + recursed_functions[index].parent_ip = parent_ip; + + /* + * It's still possible that we could race with the clearing + * CPU0 CPU1 + * ---- ---- + * ip = func + * nr_records = -1; + * recursed_functions[0] = 0; + * i = -1 + * if (i < 0) + * nr_records = 0; + * (new recursion detected) + * recursed_functions[0] = func + * cmpxchg(recursed_functions[0], + * func, 0) + * + * But the worst that could happen is that we get a zero in + * the recursed_functions array, and it's likely that "func" will + * be recorded again.
+ */ + i = atomic_read(&nr_records); + smp_mb__after_atomic(); + if (i < 0) + cmpxchg(&recursed_functions[index].ip, ip, 0); + else if (i <= index) + atomic_cmpxchg(&nr_records, i, index + 1); +} +EXPORT_SYMBOL_GPL(ftrace_record_recursion); + +static DEFINE_MUTEX(recursed_function_lock); +static struct trace_seq *tseq; + +static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos) +{ + void *ret = NULL; + int index; + + mutex_lock(&recursed_function_lock); + index = atomic_read(&nr_records); + if (*pos < index) { + ret = &recursed_functions[*pos]; + } + + tseq = kzalloc(sizeof(*tseq), GFP_KERNEL); + if (!tseq) + return ERR_PTR(-ENOMEM); + + trace_seq_init(tseq); + + return ret; +} + +static void *recursed_function_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int index; + int p; + + index = atomic_read(&nr_records); + p = ++(*pos); + + return p < index ? &recursed_functions[p] : NULL; +} + +static void recursed_function_seq_stop(struct seq_file *m, void *v) +{ + kfree(tseq); + mutex_unlock(&recursed_function_lock); +} + +static int recursed_function_seq_show(struct seq_file *m, void *v) +{ + struct recursed_functions *record = v; + int ret = 0; + + if (record) { + trace_seq_print_sym(tseq, record->parent_ip, true); + trace_seq_puts(tseq, ":\t"); + trace_seq_print_sym(tseq, record->ip, true); + trace_seq_putc(tseq, '\n'); + ret = trace_print_seq(m, tseq); + } + + return ret; +} + +static const struct seq_operations recursed_function_seq_ops = { + .start = recursed_function_seq_start, + .next = recursed_function_seq_next, + .stop = recursed_function_seq_stop, + .show = recursed_function_seq_show +}; + +static int recursed_function_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&recursed_function_lock); + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + /* disable updating records */ + atomic_set(&nr_records, -1); + smp_mb__after_atomic(); + memset(recursed_functions, 0, sizeof(recursed_functions)); + smp_wmb(); + /* enable them again */ + atomic_set(&nr_records, 0); + } + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &recursed_function_seq_ops); + mutex_unlock(&recursed_function_lock); + + return ret; +} + +static ssize_t recursed_function_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static int recursed_function_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + +static const struct file_operations recursed_functions_fops = { + .open = recursed_function_open, + .write = recursed_function_write, + .read = seq_read, + .llseek = seq_lseek, + .release = recursed_function_release, +}; + +__init static int create_recursed_functions(void) +{ + struct dentry *dentry; + + dentry = trace_create_file("recursed_functions", 0644, NULL, NULL, + &recursed_functions_fops); + if (!dentry) + pr_warn("WARNING: Failed to create recursed_functions\n"); + return 0; +} + +fs_initcall(create_recursed_functions); -- cgit From 88b8138b240b43d5215bf7cb422692cd8db51f6f Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Fri, 6 Nov 2020 14:03:31 +0100 Subject: tty: serial: remove pnx8xxx uart driver Commit 625326ea9c84 ("MIPS: Remove PNX833x alias NXP_STB22x") removed support for PNX833x, so it's time to remove the serial driver, too.
Signed-off-by: Thomas Bogendoerfer Link: https://lore.kernel.org/r/20201106130332.103476-1-tsbogend@alpha.franken.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 16 - drivers/tty/serial/Makefile | 1 - drivers/tty/serial/pnx8xxx_uart.c | 858 -------------------------------------- include/linux/serial_pnx8xxx.h | 67 --- include/uapi/linux/serial_core.h | 2 - 5 files changed, 944 deletions(-) delete mode 100644 drivers/tty/serial/pnx8xxx_uart.c delete mode 100644 include/linux/serial_pnx8xxx.h (limited to 'include/linux') diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 1044fc387691..b146c93146ee 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -703,22 +703,6 @@ config SERIAL_SH_SCI_DMA depends on SERIAL_SH_SCI && DMA_ENGINE default ARCH_RENESAS -config SERIAL_PNX8XXX - bool "Enable PNX8XXX SoCs' UART Support" - depends on SOC_PNX833X - select SERIAL_CORE - help - If you have a MIPS-based Philips SoC such as PNX8330 and you want - to use serial ports, say Y. Otherwise, say N. - -config SERIAL_PNX8XXX_CONSOLE - bool "Enable PNX8XX0 serial console" - depends on SERIAL_PNX8XXX - select SERIAL_CORE_CONSOLE - help - If you have a MIPS-based Philips SoC such as PNX8330 and you want - to use serial console, say Y. Otherwise, say N. - config SERIAL_HS_LPC32XX tristate "LPC32XX high speed serial port support" depends on ARCH_LPC32XX || COMPILE_TEST diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index caf167f0c10a..af44b231123c 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -27,7 +27,6 @@ obj-$(CONFIG_SERIAL_AMBA_PL010) += amba-pl010.o obj-$(CONFIG_SERIAL_AMBA_PL011) += amba-pl011.o obj-$(CONFIG_SERIAL_CLPS711X) += clps711x.o obj-$(CONFIG_SERIAL_PXA_NON8250) += pxa.o -obj-$(CONFIG_SERIAL_PNX8XXX) += pnx8xxx_uart.o obj-$(CONFIG_SERIAL_SA1100) += sa1100.o obj-$(CONFIG_SERIAL_BCM63XX) += bcm63xx_uart.o obj-$(CONFIG_SERIAL_SAMSUNG) += samsung_tty.o diff --git a/drivers/tty/serial/pnx8xxx_uart.c b/drivers/tty/serial/pnx8xxx_uart.c deleted file mode 100644 index 972d94e8d32b..000000000000 --- a/drivers/tty/serial/pnx8xxx_uart.c +++ /dev/null @@ -1,858 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * UART driver for PNX8XXX SoCs - * - * Author: Per Hallsmark per.hallsmark@mvista.com - * Ported to 2.6 kernel by EmbeddedAlley - * Reworked by Vitaly Wool - * - * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. - * Copyright (C) 2000 Deep Blue Solutions Ltd. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* We'll be using StrongARM sa1100 serial port major/minor */ -#define SERIAL_PNX8XXX_MAJOR 204 -#define MINOR_START 5 - -#define NR_PORTS 2 - -#define PNX8XXX_ISR_PASS_LIMIT 256 - -/* - * Convert from ignore_status_mask or read_status_mask to FIFO - * and interrupt status bits - */ -#define SM_TO_FIFO(x) ((x) >> 10) -#define SM_TO_ISTAT(x) ((x) & 0x000001ff) -#define FIFO_TO_SM(x) ((x) << 10) -#define ISTAT_TO_SM(x) ((x) & 0x000001ff) - -/* - * This is the size of our serial port register set. - */ -#define UART_PORT_SIZE 0x1000 - -/* - * This determines how often we check the modem status signals - * for any change. They generally aren't connected to an IRQ - * so we have to poll them. We also check immediately before - * filling the TX fifo incase CTS has been dropped. 
- */ -#define MCTRL_TIMEOUT (250*HZ/1000) - -extern struct pnx8xxx_port pnx8xxx_ports[]; - -static inline int serial_in(struct pnx8xxx_port *sport, int offset) -{ - return (__raw_readl(sport->port.membase + offset)); -} - -static inline void serial_out(struct pnx8xxx_port *sport, int offset, int value) -{ - __raw_writel(value, sport->port.membase + offset); -} - -/* - * Handle any change of modem status signal since we were last called. - */ -static void pnx8xxx_mctrl_check(struct pnx8xxx_port *sport) -{ - unsigned int status, changed; - - status = sport->port.ops->get_mctrl(&sport->port); - changed = status ^ sport->old_status; - - if (changed == 0) - return; - - sport->old_status = status; - - if (changed & TIOCM_RI) - sport->port.icount.rng++; - if (changed & TIOCM_DSR) - sport->port.icount.dsr++; - if (changed & TIOCM_CAR) - uart_handle_dcd_change(&sport->port, status & TIOCM_CAR); - if (changed & TIOCM_CTS) - uart_handle_cts_change(&sport->port, status & TIOCM_CTS); - - wake_up_interruptible(&sport->port.state->port.delta_msr_wait); -} - -/* - * This is our per-port timeout handler, for checking the - * modem status signals. - */ -static void pnx8xxx_timeout(struct timer_list *t) -{ - struct pnx8xxx_port *sport = from_timer(sport, t, timer); - unsigned long flags; - - if (sport->port.state) { - spin_lock_irqsave(&sport->port.lock, flags); - pnx8xxx_mctrl_check(sport); - spin_unlock_irqrestore(&sport->port.lock, flags); - - mod_timer(&sport->timer, jiffies + MCTRL_TIMEOUT); - } -} - -/* - * interrupts disabled on entry - */ -static void pnx8xxx_stop_tx(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - u32 ien; - - /* Disable TX intr */ - ien = serial_in(sport, PNX8XXX_IEN); - serial_out(sport, PNX8XXX_IEN, ien & ~PNX8XXX_UART_INT_ALLTX); - - /* Clear all pending TX intr */ - serial_out(sport, PNX8XXX_ICLR, PNX8XXX_UART_INT_ALLTX); -} - -/* - * interrupts may not be disabled on entry - */ -static void pnx8xxx_start_tx(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - u32 ien; - - /* Clear all pending TX intr */ - serial_out(sport, PNX8XXX_ICLR, PNX8XXX_UART_INT_ALLTX); - - /* Enable TX intr */ - ien = serial_in(sport, PNX8XXX_IEN); - serial_out(sport, PNX8XXX_IEN, ien | PNX8XXX_UART_INT_ALLTX); -} - -/* - * Interrupts enabled - */ -static void pnx8xxx_stop_rx(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - u32 ien; - - /* Disable RX intr */ - ien = serial_in(sport, PNX8XXX_IEN); - serial_out(sport, PNX8XXX_IEN, ien & ~PNX8XXX_UART_INT_ALLRX); - - /* Clear all pending RX intr */ - serial_out(sport, PNX8XXX_ICLR, PNX8XXX_UART_INT_ALLRX); -} - -/* - * Set the modem control timer to fire immediately. 
- */ -static void pnx8xxx_enable_ms(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - - mod_timer(&sport->timer, jiffies); -} - -static void pnx8xxx_rx_chars(struct pnx8xxx_port *sport) -{ - unsigned int status, ch, flg; - - status = FIFO_TO_SM(serial_in(sport, PNX8XXX_FIFO)) | - ISTAT_TO_SM(serial_in(sport, PNX8XXX_ISTAT)); - while (status & FIFO_TO_SM(PNX8XXX_UART_FIFO_RXFIFO)) { - ch = serial_in(sport, PNX8XXX_FIFO) & 0xff; - - sport->port.icount.rx++; - - flg = TTY_NORMAL; - - /* - * note that the error handling code is - * out of the main execution path - */ - if (status & (FIFO_TO_SM(PNX8XXX_UART_FIFO_RXFE | - PNX8XXX_UART_FIFO_RXPAR | - PNX8XXX_UART_FIFO_RXBRK) | - ISTAT_TO_SM(PNX8XXX_UART_INT_RXOVRN))) { - if (status & FIFO_TO_SM(PNX8XXX_UART_FIFO_RXBRK)) { - status &= ~(FIFO_TO_SM(PNX8XXX_UART_FIFO_RXFE) | - FIFO_TO_SM(PNX8XXX_UART_FIFO_RXPAR)); - sport->port.icount.brk++; - if (uart_handle_break(&sport->port)) - goto ignore_char; - } else if (status & FIFO_TO_SM(PNX8XXX_UART_FIFO_RXPAR)) - sport->port.icount.parity++; - else if (status & FIFO_TO_SM(PNX8XXX_UART_FIFO_RXFE)) - sport->port.icount.frame++; - if (status & ISTAT_TO_SM(PNX8XXX_UART_INT_RXOVRN)) - sport->port.icount.overrun++; - - status &= sport->port.read_status_mask; - - if (status & FIFO_TO_SM(PNX8XXX_UART_FIFO_RXPAR)) - flg = TTY_PARITY; - else if (status & FIFO_TO_SM(PNX8XXX_UART_FIFO_RXFE)) - flg = TTY_FRAME; - - sport->port.sysrq = 0; - } - - if (uart_handle_sysrq_char(&sport->port, ch)) - goto ignore_char; - - uart_insert_char(&sport->port, status, - ISTAT_TO_SM(PNX8XXX_UART_INT_RXOVRN), ch, flg); - - ignore_char: - serial_out(sport, PNX8XXX_LCR, serial_in(sport, PNX8XXX_LCR) | - PNX8XXX_UART_LCR_RX_NEXT); - status = FIFO_TO_SM(serial_in(sport, PNX8XXX_FIFO)) | - ISTAT_TO_SM(serial_in(sport, PNX8XXX_ISTAT)); - } - - spin_unlock(&sport->port.lock); - tty_flip_buffer_push(&sport->port.state->port); - spin_lock(&sport->port.lock); -} - -static void pnx8xxx_tx_chars(struct pnx8xxx_port *sport) -{ - struct circ_buf *xmit = &sport->port.state->xmit; - - if (sport->port.x_char) { - serial_out(sport, PNX8XXX_FIFO, sport->port.x_char); - sport->port.icount.tx++; - sport->port.x_char = 0; - return; - } - - /* - * Check the modem control lines before - * transmitting anything. 
- */ - pnx8xxx_mctrl_check(sport); - - if (uart_circ_empty(xmit) || uart_tx_stopped(&sport->port)) { - pnx8xxx_stop_tx(&sport->port); - return; - } - - /* - * TX while bytes available - */ - while (((serial_in(sport, PNX8XXX_FIFO) & - PNX8XXX_UART_FIFO_TXFIFO) >> 16) < 16) { - serial_out(sport, PNX8XXX_FIFO, xmit->buf[xmit->tail]); - xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); - sport->port.icount.tx++; - if (uart_circ_empty(xmit)) - break; - } - - if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) - uart_write_wakeup(&sport->port); - - if (uart_circ_empty(xmit)) - pnx8xxx_stop_tx(&sport->port); -} - -static irqreturn_t pnx8xxx_int(int irq, void *dev_id) -{ - struct pnx8xxx_port *sport = dev_id; - unsigned int status; - - spin_lock(&sport->port.lock); - /* Get the interrupts */ - status = serial_in(sport, PNX8XXX_ISTAT) & serial_in(sport, PNX8XXX_IEN); - - /* Byte or break signal received */ - if (status & (PNX8XXX_UART_INT_RX | PNX8XXX_UART_INT_BREAK)) - pnx8xxx_rx_chars(sport); - - /* TX holding register empty - transmit a byte */ - if (status & PNX8XXX_UART_INT_TX) - pnx8xxx_tx_chars(sport); - - /* Clear the ISTAT register */ - serial_out(sport, PNX8XXX_ICLR, status); - - spin_unlock(&sport->port.lock); - return IRQ_HANDLED; -} - -/* - * Return TIOCSER_TEMT when transmitter is not busy. - */ -static unsigned int pnx8xxx_tx_empty(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - - return serial_in(sport, PNX8XXX_FIFO) & PNX8XXX_UART_FIFO_TXFIFO_STA ? 0 : TIOCSER_TEMT; -} - -static unsigned int pnx8xxx_get_mctrl(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - unsigned int mctrl = TIOCM_DSR; - unsigned int msr; - - /* REVISIT */ - - msr = serial_in(sport, PNX8XXX_MCR); - - mctrl |= msr & PNX8XXX_UART_MCR_CTS ? TIOCM_CTS : 0; - mctrl |= msr & PNX8XXX_UART_MCR_DCD ? TIOCM_CAR : 0; - - return mctrl; -} - -static void pnx8xxx_set_mctrl(struct uart_port *port, unsigned int mctrl) -{ -#if 0 /* FIXME */ - struct pnx8xxx_port *sport = (struct pnx8xxx_port *)port; - unsigned int msr; -#endif -} - -/* - * Interrupts always disabled. 
- */ -static void pnx8xxx_break_ctl(struct uart_port *port, int break_state) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - unsigned long flags; - unsigned int lcr; - - spin_lock_irqsave(&sport->port.lock, flags); - lcr = serial_in(sport, PNX8XXX_LCR); - if (break_state == -1) - lcr |= PNX8XXX_UART_LCR_TXBREAK; - else - lcr &= ~PNX8XXX_UART_LCR_TXBREAK; - serial_out(sport, PNX8XXX_LCR, lcr); - spin_unlock_irqrestore(&sport->port.lock, flags); -} - -static int pnx8xxx_startup(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - int retval; - - /* - * Allocate the IRQ - */ - retval = request_irq(sport->port.irq, pnx8xxx_int, 0, - "pnx8xxx-uart", sport); - if (retval) - return retval; - - /* - * Finally, clear and enable interrupts - */ - - serial_out(sport, PNX8XXX_ICLR, PNX8XXX_UART_INT_ALLRX | - PNX8XXX_UART_INT_ALLTX); - - serial_out(sport, PNX8XXX_IEN, serial_in(sport, PNX8XXX_IEN) | - PNX8XXX_UART_INT_ALLRX | - PNX8XXX_UART_INT_ALLTX); - - /* - * Enable modem status interrupts - */ - spin_lock_irq(&sport->port.lock); - pnx8xxx_enable_ms(&sport->port); - spin_unlock_irq(&sport->port.lock); - - return 0; -} - -static void pnx8xxx_shutdown(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - int lcr; - - /* - * Stop our timer. - */ - del_timer_sync(&sport->timer); - - /* - * Disable all interrupts - */ - serial_out(sport, PNX8XXX_IEN, 0); - - /* - * Reset the Tx and Rx FIFOS, disable the break condition - */ - lcr = serial_in(sport, PNX8XXX_LCR); - lcr &= ~PNX8XXX_UART_LCR_TXBREAK; - lcr |= PNX8XXX_UART_LCR_TX_RST | PNX8XXX_UART_LCR_RX_RST; - serial_out(sport, PNX8XXX_LCR, lcr); - - /* - * Clear all interrupts - */ - serial_out(sport, PNX8XXX_ICLR, PNX8XXX_UART_INT_ALLRX | - PNX8XXX_UART_INT_ALLTX); - - /* - * Free the interrupt - */ - free_irq(sport->port.irq, sport); -} - -static void -pnx8xxx_set_termios(struct uart_port *port, struct ktermios *termios, - struct ktermios *old) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - unsigned long flags; - unsigned int lcr_fcr, old_ien, baud, quot; - unsigned int old_csize = old ? old->c_cflag & CSIZE : CS8; - - /* - * We only support CS7 and CS8. - */ - while ((termios->c_cflag & CSIZE) != CS7 && - (termios->c_cflag & CSIZE) != CS8) { - termios->c_cflag &= ~CSIZE; - termios->c_cflag |= old_csize; - old_csize = CS8; - } - - if ((termios->c_cflag & CSIZE) == CS8) - lcr_fcr = PNX8XXX_UART_LCR_8BIT; - else - lcr_fcr = 0; - - if (termios->c_cflag & CSTOPB) - lcr_fcr |= PNX8XXX_UART_LCR_2STOPB; - if (termios->c_cflag & PARENB) { - lcr_fcr |= PNX8XXX_UART_LCR_PAREN; - if (!(termios->c_cflag & PARODD)) - lcr_fcr |= PNX8XXX_UART_LCR_PAREVN; - } - - /* - * Ask the core to calculate the divisor for us. 
- */ - baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk/16); - quot = uart_get_divisor(port, baud); - - spin_lock_irqsave(&sport->port.lock, flags); - - sport->port.read_status_mask = ISTAT_TO_SM(PNX8XXX_UART_INT_RXOVRN) | - ISTAT_TO_SM(PNX8XXX_UART_INT_EMPTY) | - ISTAT_TO_SM(PNX8XXX_UART_INT_RX); - if (termios->c_iflag & INPCK) - sport->port.read_status_mask |= - FIFO_TO_SM(PNX8XXX_UART_FIFO_RXFE) | - FIFO_TO_SM(PNX8XXX_UART_FIFO_RXPAR); - if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) - sport->port.read_status_mask |= - ISTAT_TO_SM(PNX8XXX_UART_INT_BREAK); - - /* - * Characters to ignore - */ - sport->port.ignore_status_mask = 0; - if (termios->c_iflag & IGNPAR) - sport->port.ignore_status_mask |= - FIFO_TO_SM(PNX8XXX_UART_FIFO_RXFE) | - FIFO_TO_SM(PNX8XXX_UART_FIFO_RXPAR); - if (termios->c_iflag & IGNBRK) { - sport->port.ignore_status_mask |= - ISTAT_TO_SM(PNX8XXX_UART_INT_BREAK); - /* - * If we're ignoring parity and break indicators, - * ignore overruns too (for real raw support). - */ - if (termios->c_iflag & IGNPAR) - sport->port.ignore_status_mask |= - ISTAT_TO_SM(PNX8XXX_UART_INT_RXOVRN); - } - - /* - * ignore all characters if CREAD is not set - */ - if ((termios->c_cflag & CREAD) == 0) - sport->port.ignore_status_mask |= - ISTAT_TO_SM(PNX8XXX_UART_INT_RX); - - del_timer_sync(&sport->timer); - - /* - * Update the per-port timeout. - */ - uart_update_timeout(port, termios->c_cflag, baud); - - /* - * disable interrupts and drain transmitter - */ - old_ien = serial_in(sport, PNX8XXX_IEN); - serial_out(sport, PNX8XXX_IEN, old_ien & ~(PNX8XXX_UART_INT_ALLTX | - PNX8XXX_UART_INT_ALLRX)); - - while (serial_in(sport, PNX8XXX_FIFO) & PNX8XXX_UART_FIFO_TXFIFO_STA) - barrier(); - - /* then, disable everything */ - serial_out(sport, PNX8XXX_IEN, 0); - - /* Reset the Rx and Tx FIFOs too */ - lcr_fcr |= PNX8XXX_UART_LCR_TX_RST; - lcr_fcr |= PNX8XXX_UART_LCR_RX_RST; - - /* set the parity, stop bits and data size */ - serial_out(sport, PNX8XXX_LCR, lcr_fcr); - - /* set the baud rate */ - quot -= 1; - serial_out(sport, PNX8XXX_BAUD, quot); - - serial_out(sport, PNX8XXX_ICLR, -1); - - serial_out(sport, PNX8XXX_IEN, old_ien); - - if (UART_ENABLE_MS(&sport->port, termios->c_cflag)) - pnx8xxx_enable_ms(&sport->port); - - spin_unlock_irqrestore(&sport->port.lock, flags); -} - -static const char *pnx8xxx_type(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - - return sport->port.type == PORT_PNX8XXX ? "PNX8XXX" : NULL; -} - -/* - * Release the memory region(s) being used by 'port'. - */ -static void pnx8xxx_release_port(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - - release_mem_region(sport->port.mapbase, UART_PORT_SIZE); -} - -/* - * Request the memory region(s) being used by 'port'. - */ -static int pnx8xxx_request_port(struct uart_port *port) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - return request_mem_region(sport->port.mapbase, UART_PORT_SIZE, - "pnx8xxx-uart") != NULL ? 0 : -EBUSY; -} - -/* - * Configure/autoconfigure the port. - */ -static void pnx8xxx_config_port(struct uart_port *port, int flags) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - - if (flags & UART_CONFIG_TYPE && - pnx8xxx_request_port(&sport->port) == 0) - sport->port.type = PORT_PNX8XXX; -} - -/* - * Verify the new serial_struct (for TIOCSSERIAL). 
- * The only change we allow are to the flags and type, and - * even then only between PORT_PNX8XXX and PORT_UNKNOWN - */ -static int -pnx8xxx_verify_port(struct uart_port *port, struct serial_struct *ser) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - int ret = 0; - - if (ser->type != PORT_UNKNOWN && ser->type != PORT_PNX8XXX) - ret = -EINVAL; - if (sport->port.irq != ser->irq) - ret = -EINVAL; - if (ser->io_type != SERIAL_IO_MEM) - ret = -EINVAL; - if (sport->port.uartclk / 16 != ser->baud_base) - ret = -EINVAL; - if ((void *)sport->port.mapbase != ser->iomem_base) - ret = -EINVAL; - if (sport->port.iobase != ser->port) - ret = -EINVAL; - if (ser->hub6 != 0) - ret = -EINVAL; - return ret; -} - -static const struct uart_ops pnx8xxx_pops = { - .tx_empty = pnx8xxx_tx_empty, - .set_mctrl = pnx8xxx_set_mctrl, - .get_mctrl = pnx8xxx_get_mctrl, - .stop_tx = pnx8xxx_stop_tx, - .start_tx = pnx8xxx_start_tx, - .stop_rx = pnx8xxx_stop_rx, - .enable_ms = pnx8xxx_enable_ms, - .break_ctl = pnx8xxx_break_ctl, - .startup = pnx8xxx_startup, - .shutdown = pnx8xxx_shutdown, - .set_termios = pnx8xxx_set_termios, - .type = pnx8xxx_type, - .release_port = pnx8xxx_release_port, - .request_port = pnx8xxx_request_port, - .config_port = pnx8xxx_config_port, - .verify_port = pnx8xxx_verify_port, -}; - - -/* - * Setup the PNX8XXX serial ports. - * - * Note also that we support "console=ttySx" where "x" is either 0 or 1. - */ -static void __init pnx8xxx_init_ports(void) -{ - static int first = 1; - int i; - - if (!first) - return; - first = 0; - - for (i = 0; i < NR_PORTS; i++) { - timer_setup(&pnx8xxx_ports[i].timer, pnx8xxx_timeout, 0); - pnx8xxx_ports[i].port.ops = &pnx8xxx_pops; - } -} - -#ifdef CONFIG_SERIAL_PNX8XXX_CONSOLE - -static void pnx8xxx_console_putchar(struct uart_port *port, int ch) -{ - struct pnx8xxx_port *sport = - container_of(port, struct pnx8xxx_port, port); - int status; - - do { - /* Wait for UART_TX register to empty */ - status = serial_in(sport, PNX8XXX_FIFO); - } while (status & PNX8XXX_UART_FIFO_TXFIFO); - serial_out(sport, PNX8XXX_FIFO, ch); -} - -/* - * Interrupts are disabled on entering - */static void -pnx8xxx_console_write(struct console *co, const char *s, unsigned int count) -{ - struct pnx8xxx_port *sport = &pnx8xxx_ports[co->index]; - unsigned int old_ien, status; - - /* - * First, save IEN and then disable interrupts - */ - old_ien = serial_in(sport, PNX8XXX_IEN); - serial_out(sport, PNX8XXX_IEN, old_ien & ~(PNX8XXX_UART_INT_ALLTX | - PNX8XXX_UART_INT_ALLRX)); - - uart_console_write(&sport->port, s, count, pnx8xxx_console_putchar); - - /* - * Finally, wait for transmitter to become empty - * and restore IEN - */ - do { - /* Wait for UART_TX register to empty */ - status = serial_in(sport, PNX8XXX_FIFO); - } while (status & PNX8XXX_UART_FIFO_TXFIFO); - - /* Clear TX and EMPTY interrupt */ - serial_out(sport, PNX8XXX_ICLR, PNX8XXX_UART_INT_TX | - PNX8XXX_UART_INT_EMPTY); - - serial_out(sport, PNX8XXX_IEN, old_ien); -} - -static int __init -pnx8xxx_console_setup(struct console *co, char *options) -{ - struct pnx8xxx_port *sport; - int baud = 38400; - int bits = 8; - int parity = 'n'; - int flow = 'n'; - - /* - * Check whether an invalid uart number has been specified, and - * if so, search for the first available port that does have - * console support. 
- */ - if (co->index == -1 || co->index >= NR_PORTS) - co->index = 0; - sport = &pnx8xxx_ports[co->index]; - - if (options) - uart_parse_options(options, &baud, &parity, &bits, &flow); - - return uart_set_options(&sport->port, co, baud, parity, bits, flow); -} - -static struct uart_driver pnx8xxx_reg; -static struct console pnx8xxx_console = { - .name = "ttyS", - .write = pnx8xxx_console_write, - .device = uart_console_device, - .setup = pnx8xxx_console_setup, - .flags = CON_PRINTBUFFER, - .index = -1, - .data = &pnx8xxx_reg, -}; - -static int __init pnx8xxx_rs_console_init(void) -{ - pnx8xxx_init_ports(); - register_console(&pnx8xxx_console); - return 0; -} -console_initcall(pnx8xxx_rs_console_init); - -#define PNX8XXX_CONSOLE &pnx8xxx_console -#else -#define PNX8XXX_CONSOLE NULL -#endif - -static struct uart_driver pnx8xxx_reg = { - .owner = THIS_MODULE, - .driver_name = "ttyS", - .dev_name = "ttyS", - .major = SERIAL_PNX8XXX_MAJOR, - .minor = MINOR_START, - .nr = NR_PORTS, - .cons = PNX8XXX_CONSOLE, -}; - -static int pnx8xxx_serial_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct pnx8xxx_port *sport = platform_get_drvdata(pdev); - - return uart_suspend_port(&pnx8xxx_reg, &sport->port); -} - -static int pnx8xxx_serial_resume(struct platform_device *pdev) -{ - struct pnx8xxx_port *sport = platform_get_drvdata(pdev); - - return uart_resume_port(&pnx8xxx_reg, &sport->port); -} - -static int pnx8xxx_serial_probe(struct platform_device *pdev) -{ - struct resource *res = pdev->resource; - int i; - - for (i = 0; i < pdev->num_resources; i++, res++) { - if (!(res->flags & IORESOURCE_MEM)) - continue; - - for (i = 0; i < NR_PORTS; i++) { - if (pnx8xxx_ports[i].port.mapbase != res->start) - continue; - - pnx8xxx_ports[i].port.has_sysrq = IS_ENABLED(CONFIG_SERIAL_PNX8XXX_CONSOLE); - pnx8xxx_ports[i].port.dev = &pdev->dev; - uart_add_one_port(&pnx8xxx_reg, &pnx8xxx_ports[i].port); - platform_set_drvdata(pdev, &pnx8xxx_ports[i]); - break; - } - } - - return 0; -} - -static int pnx8xxx_serial_remove(struct platform_device *pdev) -{ - struct pnx8xxx_port *sport = platform_get_drvdata(pdev); - - if (sport) - uart_remove_one_port(&pnx8xxx_reg, &sport->port); - - return 0; -} - -static struct platform_driver pnx8xxx_serial_driver = { - .driver = { - .name = "pnx8xxx-uart", - }, - .probe = pnx8xxx_serial_probe, - .remove = pnx8xxx_serial_remove, - .suspend = pnx8xxx_serial_suspend, - .resume = pnx8xxx_serial_resume, -}; - -static int __init pnx8xxx_serial_init(void) -{ - int ret; - - printk(KERN_INFO "Serial: PNX8XXX driver\n"); - - pnx8xxx_init_ports(); - - ret = uart_register_driver(&pnx8xxx_reg); - if (ret == 0) { - ret = platform_driver_register(&pnx8xxx_serial_driver); - if (ret) - uart_unregister_driver(&pnx8xxx_reg); - } - return ret; -} - -static void __exit pnx8xxx_serial_exit(void) -{ - platform_driver_unregister(&pnx8xxx_serial_driver); - uart_unregister_driver(&pnx8xxx_reg); -} - -module_init(pnx8xxx_serial_init); -module_exit(pnx8xxx_serial_exit); - -MODULE_AUTHOR("Embedded Alley Solutions, Inc."); -MODULE_DESCRIPTION("PNX8XXX SoCs serial port driver"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_CHARDEV_MAJOR(SERIAL_PNX8XXX_MAJOR); -MODULE_ALIAS("platform:pnx8xxx-uart"); diff --git a/include/linux/serial_pnx8xxx.h b/include/linux/serial_pnx8xxx.h deleted file mode 100644 index 619d748dcd44..000000000000 --- a/include/linux/serial_pnx8xxx.h +++ /dev/null @@ -1,67 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Embedded Alley Solutions, source@embeddedalley.com. 
- */ - -#ifndef _LINUX_SERIAL_PNX8XXX_H -#define _LINUX_SERIAL_PNX8XXX_H - -#include - -#define PNX8XXX_NR_PORTS 2 - -struct pnx8xxx_port { - struct uart_port port; - struct timer_list timer; - unsigned int old_status; -}; - -/* register offsets */ -#define PNX8XXX_LCR 0 -#define PNX8XXX_MCR 0x004 -#define PNX8XXX_BAUD 0x008 -#define PNX8XXX_CFG 0x00c -#define PNX8XXX_FIFO 0x028 -#define PNX8XXX_ISTAT 0xfe0 -#define PNX8XXX_IEN 0xfe4 -#define PNX8XXX_ICLR 0xfe8 -#define PNX8XXX_ISET 0xfec -#define PNX8XXX_PD 0xff4 -#define PNX8XXX_MID 0xffc - -#define PNX8XXX_UART_LCR_TXBREAK (1<<30) -#define PNX8XXX_UART_LCR_PAREVN 0x10000000 -#define PNX8XXX_UART_LCR_PAREN 0x08000000 -#define PNX8XXX_UART_LCR_2STOPB 0x04000000 -#define PNX8XXX_UART_LCR_8BIT 0x01000000 -#define PNX8XXX_UART_LCR_TX_RST 0x00040000 -#define PNX8XXX_UART_LCR_RX_RST 0x00020000 -#define PNX8XXX_UART_LCR_RX_NEXT 0x00010000 - -#define PNX8XXX_UART_MCR_SCR 0xFF000000 -#define PNX8XXX_UART_MCR_DCD 0x00800000 -#define PNX8XXX_UART_MCR_CTS 0x00100000 -#define PNX8XXX_UART_MCR_LOOP 0x00000010 -#define PNX8XXX_UART_MCR_RTS 0x00000002 -#define PNX8XXX_UART_MCR_DTR 0x00000001 - -#define PNX8XXX_UART_INT_TX 0x00000080 -#define PNX8XXX_UART_INT_EMPTY 0x00000040 -#define PNX8XXX_UART_INT_RCVTO 0x00000020 -#define PNX8XXX_UART_INT_RX 0x00000010 -#define PNX8XXX_UART_INT_RXOVRN 0x00000008 -#define PNX8XXX_UART_INT_FRERR 0x00000004 -#define PNX8XXX_UART_INT_BREAK 0x00000002 -#define PNX8XXX_UART_INT_PARITY 0x00000001 -#define PNX8XXX_UART_INT_ALLRX 0x0000003F -#define PNX8XXX_UART_INT_ALLTX 0x000000C0 - -#define PNX8XXX_UART_FIFO_TXFIFO 0x001F0000 -#define PNX8XXX_UART_FIFO_TXFIFO_STA (0x1f<<16) -#define PNX8XXX_UART_FIFO_RXBRK 0x00008000 -#define PNX8XXX_UART_FIFO_RXFE 0x00004000 -#define PNX8XXX_UART_FIFO_RXPAR 0x00002000 -#define PNX8XXX_UART_FIFO_RXFIFO 0x00001F00 -#define PNX8XXX_UART_FIFO_RBRTHR 0x000000FF - -#endif diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 851b982f8c4b..62c22045fe65 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -134,8 +134,6 @@ /*Digi jsm */ #define PORT_JSM 69 -#define PORT_PNX8XXX 70 - /* SUN4V Hypervisor Console */ #define PORT_SUNHV 72 -- cgit From 4cf1bc1f10452065a29d576fc5693fc4fab5b919 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:40 +0000 Subject: bpf: Implement task local storage Similar to bpf_local_storage for sockets and inodes, add local storage for task_struct. The life-cycle of storage is managed with the life-cycle of the task_struct, i.e. the storage is destroyed along with the owning task with a callback to bpf_task_storage_free from the task_free LSM hook. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob, which is now stackable and can co-exist with other LSMs. The userspace map operations can be done by using a pid fd as a key passed to the lookup, update and delete operations.
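
To sketch how this fits together (the map and program names below are hypothetical, and the example assumes bpf_get_current_task_btf() from the same patch series plus the usual libbpf/vmlinux.h build setup), a BPF LSM program could keep a per-task counter like this:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	struct task_stats {
		__u64 exec_cnt;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);	/* a pidfd when accessed from userspace */
		__type(value, struct task_stats);
	} task_map SEC(".maps");

	SEC("lsm/bprm_committed_creds")
	int BPF_PROG(count_exec, struct linux_binprm *bprm)
	{
		struct task_stats *stats;

		/* create the storage for the current task on first use */
		stats = bpf_task_storage_get(&task_map, bpf_get_current_task_btf(),
					     0, BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (stats)
			stats->exec_cnt++;
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";

From userspace, the same map can then be read with bpf_map_lookup_elem() using a pidfd of the target task as the key, as described above.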
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-3-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 23 +++ include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 39 +++++ kernel/bpf/Makefile | 1 + kernel/bpf/bpf_lsm.c | 4 + kernel/bpf/bpf_task_storage.c | 315 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 10 ++ security/bpf/hooks.c | 2 + tools/include/uapi/linux/bpf.h | 39 +++++ 10 files changed, 436 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_task_storage.c (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index aaacb6aafc87..73226181b744 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -7,6 +7,7 @@ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H +#include #include #include @@ -35,9 +36,21 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + if (unlikely(!task->security)) + return NULL; + + return task->security + bpf_lsm_blob_sizes.lbs_task; +} + extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -53,10 +66,20 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + return NULL; +} + static inline void bpf_inode_storage_free(struct inode *inode) { } +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2e6f568377f1..99f7fd657d87 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,6 +109,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e6ceac3f7d62..f4037b2161a6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -3742,6 +3743,42 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces that the key must be a task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**.
+ * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3900,6 +3937,8 @@ union bpf_attr { FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..f0b93ced5a7f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_i obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o +obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index cd8a617f2109..e92c51bebb47 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -63,6 +63,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c new file mode 100644 index 000000000000..39a45fba4fb0 --- /dev/null +++ b/kernel/bpf/bpf_task_storage.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Facebook + * Copyright 2020 Google LLC. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_BPF_STORAGE_CACHE(task_cache); + +static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) +{ + struct task_struct *task = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data * +task_storage_lookup(struct task_struct *task, struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *task_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + + task_storage = rcu_dereference(bsb->storage); + if (!task_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit); +} + +void bpf_task_storage_free(struct task_struct *task) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_task_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_task(task); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Neither the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_task_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_task_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_task_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = task_storage_lookup(task, map, true); + put_pid(pid); + return sdata ? sdata->data : NULL; +out: + put_pid(pid); + return ERR_PTR(err); +} + +static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. 
+ */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, map_flags); + + err = PTR_ERR_OR_ZERO(sdata); +out: + put_pid(pid); + return err; +} + +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = task_storage_lookup(task, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + err = task_storage_delete(task, map); +out: + put_pid(pid); + return err; +} + +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the task_storage_ptr is not + * NULL as task_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!task_storage_ptr(task)) + return (unsigned long)NULL; + + sdata = task_storage_lookup(task, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, + task) +{ + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. 
+ */ + return task_storage_delete(task, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); + return &smap->map; +} + +static void task_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int task_storage_map_btf_id; +const struct bpf_map_ops task_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = task_storage_map_alloc, + .map_free = task_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_pid_task_storage_lookup_elem, + .map_update_elem = bpf_pid_task_storage_update_elem, + .map_delete_elem = bpf_pid_task_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &task_storage_map_btf_id, + .map_owner_storage_ptr = task_storage_ptr, +}; + +BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct) + +const struct bpf_func_proto bpf_task_storage_get_proto = { + .func = bpf_task_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_task_storage_delete_proto = { + .func = bpf_task_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8f50c9c19f1b..f3fe9f53f93c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -773,7 +773,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f863aa84d0a2..00960f6a83ec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4469,6 +4469,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_inode_storage_delete) goto error; break; + case BPF_MAP_TYPE_TASK_STORAGE: + if (func_id != BPF_FUNC_task_storage_get && + func_id != BPF_FUNC_task_storage_delete) + goto error; + break; default: break; } @@ -4547,6 +4552,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) goto error; break; + case BPF_FUNC_task_storage_get: + case BPF_FUNC_task_storage_delete: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) + goto error; + break; default: break; } diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index 788667d582ae..e5971fa74fd7 100644 --- 
a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -12,6 +12,7 @@ static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = { #include #undef LSM_HOOK LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free), + LSM_HOOK_INIT(task_free, bpf_task_storage_free), }; static int __init bpf_lsm_init(void) @@ -23,6 +24,7 @@ static int __init bpf_lsm_init(void) struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = { .lbs_inode = sizeof(struct bpf_storage_blob), + .lbs_task = sizeof(struct bpf_storage_blob), }; DEFINE_LSM(bpf) = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e6ceac3f7d62..f4037b2161a6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -3742,6 +3743,42 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except that this + * helper enforces that the key must be a task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3900,6 +3937,8 @@ union bpf_attr { FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit From 3ca1032ab7ab010eccb107aa515598788f7d93bb Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:43 +0000 Subject: bpf: Implement get_current_task_btf and RET_PTR_TO_BTF_ID The currently available bpf_get_current_task returns an unsigned integer which can be used along with BPF_CORE_READ to read data from the task_struct but still cannot be used as an input argument to a helper that accepts an ARG_PTR_TO_BTF_ID of type task_struct. In order to implement this helper, a new return type, RET_PTR_TO_BTF_ID, is added. This is similar to RET_PTR_TO_BTF_ID_OR_NULL but does not require checking the nullness of the returned pointer.
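As a rough sketch of how the two pieces compose (illustrative only, not part of these patches: the map name, section and counter logic below are assumptions, using the usual libbpf vmlinux.h conventions), an LSM program can pass the BTF task pointer straight into the task storage helpers:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  struct {
          __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
          __uint(map_flags, BPF_F_NO_PREALLOC); /* required for local storage maps */
          __type(key, int);
          __type(value, __u64);
  } exec_cnt SEC(".maps");                      /* hypothetical map name */

  SEC("lsm/bprm_committed_creds")
  int BPF_PROG(count_exec, struct linux_binprm *bprm)
  {
          /* BTF pointer to current, valid as an ARG_PTR_TO_BTF_ID argument */
          struct task_struct *task = bpf_get_current_task_btf();
          __u64 *cnt;

          /* create on first use; zero initialized because the init value is NULL */
          cnt = bpf_task_storage_get(&exec_cnt, task, 0,
                                     BPF_LOCAL_STORAGE_GET_F_CREATE);
          if (cnt)
                  __sync_fetch_and_add(cnt, 1);
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";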
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-6-kpsingh@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/verifier.c | 7 +++++-- kernel/trace/bpf_trace.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 5 files changed, 40 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2fffd30e13ac..73d5381a5d5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -310,6 +310,7 @@ enum bpf_return_type { RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4037b2161a6..9879d6793e90 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3779,6 +3779,14 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3939,6 +3947,7 @@ union bpf_attr { FN(redirect_peer), \ FN(task_storage_get), \ FN(task_storage_delete), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 00960f6a83ec..10da26e55130 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5186,11 +5186,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? 
+ PTR_TO_BTF_ID : + PTR_TO_BTF_ID_OR_NULL; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { verbose(env, "invalid return type %d of func %s#%d\n", diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..e4515b0f62a8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1022,6 +1022,20 @@ const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_current_task_btf) +{ + return (unsigned long) current; +} + +BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct) + +static const struct bpf_func_proto bpf_get_current_task_btf_proto = { + .func = bpf_get_current_task_btf, + .gpl_only = true, + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_get_current_btf_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1265,6 +1279,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4037b2161a6..9879d6793e90 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3779,6 +3779,14 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3939,6 +3947,7 @@ union bpf_attr { FN(redirect_peer), \ FN(task_storage_get), \ FN(task_storage_delete), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit From d4d50710a8b46082224376ef119a4dbb75b25c56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Nov 2020 09:27:33 +0100 Subject: seq_file: add seq_read_iter iov_iter based variant for reading a seq_file. seq_read is reimplemented on top of the iter variant. 
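A minimal usage sketch (example_open, example_seq_ops and example_fops are placeholder names, not code from this patch): because seq_read_iter() matches the ->read_iter() prototype, a seq_file based interface can be wired up without a ->read() method at all:

  #include <linux/fs.h>
  #include <linux/seq_file.h>

  /* example_seq_ops is an assumed, pre-existing seq_operations */
  static int example_open(struct inode *inode, struct file *file)
  {
          return seq_open(file, &example_seq_ops);
  }

  static const struct file_operations example_fops = {
          .owner          = THIS_MODULE,
          .open           = example_open,
          .read_iter      = seq_read_iter,      /* iov_iter based read */
          .llseek         = seq_lseek,
          .release        = seq_release,
  };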
Signed-off-by: Christoph Hellwig Tested-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- fs/seq_file.c | 45 ++++++++++++++++++++++++++++++++------------- include/linux/seq_file.h | 1 + 2 files changed, 33 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/seq_file.c b/fs/seq_file.c index 31219c1db17d..3b20e21604e7 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -146,7 +147,28 @@ Eoverflow: */ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct seq_file *m = file->private_data; + struct iovec iov = { .iov_base = buf, .iov_len = size}; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + iov_iter_init(&iter, READ, &iov, 1, size); + + kiocb.ki_pos = *ppos; + ret = seq_read_iter(&kiocb, &iter); + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(seq_read); + +/* + * Ready-made ->f_op->read_iter() + */ +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct seq_file *m = iocb->ki_filp->private_data; + size_t size = iov_iter_count(iter); size_t copied = 0; size_t n; void *p; @@ -158,14 +180,14 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ - if (*ppos == 0) { + if (iocb->ki_pos == 0) { m->index = 0; m->count = 0; } - /* Don't assume *ppos is where we left it */ - if (unlikely(*ppos != m->read_pos)) { - while ((err = traverse(m, *ppos)) == -EAGAIN) + /* Don't assume ki_pos is where we left it */ + if (unlikely(iocb->ki_pos != m->read_pos)) { + while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN) ; if (err) { /* With prejudice... */ @@ -174,7 +196,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) m->count = 0; goto Done; } else { - m->read_pos = *ppos; + m->read_pos = iocb->ki_pos; } } @@ -187,13 +209,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* if not empty - flush it first */ if (m->count) { n = min(m->count, size); - err = copy_to_user(buf, m->buf + m->from, n); - if (err) + if (copy_to_iter(m->buf + m->from, n, iter) != n) goto Efault; m->count -= n; m->from += n; size -= n; - buf += n; copied += n; if (!size) goto Done; @@ -254,8 +274,7 @@ Fill: } m->op->stop(m, p); n = min(m->count, size); - err = copy_to_user(buf, m->buf, n); - if (err) + if (copy_to_iter(m->buf, n, iter) != n) goto Efault; copied += n; m->count -= n; @@ -264,7 +283,7 @@ Done: if (!copied) copied = err; else { - *ppos += copied; + iocb->ki_pos += copied; m->read_pos += copied; } mutex_unlock(&m->lock); @@ -276,7 +295,7 @@ Efault: err = -EFAULT; goto Done; } -EXPORT_SYMBOL(seq_read); +EXPORT_SYMBOL(seq_read_iter); /** * seq_lseek - ->llseek() method for sequential files. 
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 813614d4b71f..b83b3ae3c877 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -107,6 +107,7 @@ void seq_pad(struct seq_file *m, char c); char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); int seq_write(struct seq_file *seq, const void *data, size_t len); -- cgit From b819fd9da38508e0504624b87d9983fcc4237f3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:14 +0100 Subject: highmem: Remove unused functions Nothing uses totalhigh_pages_dec() and totalhigh_pages_set(). Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095856.732891880@linutronix.de --- include/linux/highmem.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 14e6202ce47f..f5c31338f0a3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -104,21 +104,11 @@ static inline void totalhigh_pages_inc(void) atomic_long_inc(&_totalhigh_pages); } -static inline void totalhigh_pages_dec(void) -{ - atomic_long_dec(&_totalhigh_pages); -} - static inline void totalhigh_pages_add(long count) { atomic_long_add(count, &_totalhigh_pages); } -static inline void totalhigh_pages_set(long val) -{ - atomic_long_set(&_totalhigh_pages, val); -} - void kmap_flush_unused(void); struct page *kmap_to_page(void *addr); -- cgit From 298fa1ad5571f59cb3ca5497a9455f36867f065e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:18 +0100 Subject: highmem: Provide generic variant of kmap_atomic* The kmap_atomic* interfaces in all architectures are pretty much the same except for post map operations (flush) and pre- and post unmap operations. Provide a generic variant for that. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095857.175939340@linutronix.de --- include/linux/highmem.h | 82 +++++++++++++++++++++------ mm/Kconfig | 3 + mm/highmem.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 211 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index f5c31338f0a3..f5ecee9c2576 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -31,9 +31,16 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include +/* + * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. + */ +#ifdef CONFIG_KMAP_LOCAL +void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); +void *__kmap_local_page_prot(struct page *page, pgprot_t prot); +void kunmap_local_indexed(void *vaddr); +#endif + #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -extern void kunmap_atomic_high(void *kvaddr); #include #ifndef ARCH_HAS_KMAP_FLUSH_TLB @@ -81,6 +88,11 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. 
*/ + +#ifndef CONFIG_KMAP_LOCAL +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +void kunmap_atomic_high(void *kvaddr); + static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { preempt_disable(); @@ -89,7 +101,38 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return page_address(page); return kmap_atomic_high_prot(page, prot); } -#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) + +static inline void __kunmap_atomic(void *vaddr) +{ + kunmap_atomic_high(vaddr); +} +#else /* !CONFIG_KMAP_LOCAL */ + +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + return __kmap_local_page_prot(page, prot); +} + +static inline void *kmap_atomic_pfn(unsigned long pfn) +{ + preempt_disable(); + pagefault_disable(); + return __kmap_local_pfn_prot(pfn, kmap_prot); +} + +static inline void __kunmap_atomic(void *addr) +{ + kunmap_local_indexed(addr); +} + +#endif /* CONFIG_KMAP_LOCAL */ + +static inline void *kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); @@ -147,25 +190,33 @@ static inline void *kmap_atomic(struct page *page) pagefault_disable(); return page_address(page); } -#define kmap_atomic_prot(page, prot) kmap_atomic(page) -static inline void kunmap_atomic_high(void *addr) +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + return kmap_atomic(page); +} + +static inline void *kmap_atomic_pfn(unsigned long pfn) +{ + return kmap_atomic(pfn_to_page(pfn)); +} + +static inline void __kunmap_atomic(void *addr) { /* * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() - * handles re-enabling faults + preemption + * handles re-enabling faults and preemption */ #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(addr); #endif } -#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) - #define kmap_flush_unused() do {} while(0) #endif /* CONFIG_HIGHMEM */ +#if !defined(CONFIG_KMAP_LOCAL) #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) DECLARE_PER_CPU(int, __kmap_atomic_idx); @@ -196,22 +247,21 @@ static inline void kmap_atomic_idx_pop(void) __this_cpu_dec(__kmap_atomic_idx); #endif } - +#endif #endif /* * Prevent people trying to call kunmap_atomic() as if it were kunmap() * kunmap_atomic() should get the return value of kmap_atomic, not the page. 
*/ -#define kunmap_atomic(addr) \ -do { \ - BUILD_BUG_ON(__same_type((addr), struct page *)); \ - kunmap_atomic_high(addr); \ - pagefault_enable(); \ - preempt_enable(); \ +#define kunmap_atomic(__addr) \ +do { \ + BUILD_BUG_ON(__same_type((__addr), struct page *)); \ + __kunmap_atomic(__addr); \ + pagefault_enable(); \ + preempt_enable(); \ } while (0) - /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) diff --git a/mm/Kconfig b/mm/Kconfig index d42423f884a7..a1ccf98b7333 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -872,4 +872,7 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool +config KMAP_LOCAL + bool + endmenu diff --git a/mm/highmem.c b/mm/highmem.c index 6abfd762eee7..bb4ce13ee7e7 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -31,9 +31,11 @@ #include #include +#ifndef CONFIG_KMAP_LOCAL #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) DEFINE_PER_CPU(int, __kmap_atomic_idx); #endif +#endif /* * Virtual_count is not a pure "count". @@ -365,9 +367,147 @@ void kunmap_high(struct page *page) if (need_wakeup) wake_up(pkmap_map_wait); } - EXPORT_SYMBOL(kunmap_high); -#endif /* CONFIG_HIGHMEM */ +#endif /* CONFIG_HIGHMEM */ + +#ifdef CONFIG_KMAP_LOCAL + +#include + +static DEFINE_PER_CPU(int, __kmap_local_idx); + +static inline int kmap_local_idx_push(void) +{ + int idx = __this_cpu_inc_return(__kmap_local_idx) - 1; + + WARN_ON_ONCE(in_irq() && !irqs_disabled()); + BUG_ON(idx >= KM_MAX_IDX); + return idx; +} + +static inline int kmap_local_idx(void) +{ + return __this_cpu_read(__kmap_local_idx) - 1; +} + +static inline void kmap_local_idx_pop(void) +{ + int idx = __this_cpu_dec_return(__kmap_local_idx); + + BUG_ON(idx < 0); +} + +#ifndef arch_kmap_local_post_map +# define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) +#endif +#ifndef arch_kmap_local_pre_unmap +# define arch_kmap_local_pre_unmap(vaddr) do { } while (0) +#endif + +#ifndef arch_kmap_local_post_unmap +# define arch_kmap_local_post_unmap(vaddr) do { } while (0) +#endif + +#ifndef arch_kmap_local_map_idx +#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) +#endif + +#ifndef arch_kmap_local_unmap_idx +#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) +#endif + +#ifndef arch_kmap_local_high_get +static inline void *arch_kmap_local_high_get(struct page *page) +{ + return NULL; +} +#endif + +/* Unmap a local mapping which was obtained by kmap_high_get() */ +static inline void kmap_high_unmap_local(unsigned long vaddr) +{ +#ifdef ARCH_NEEDS_KMAP_HIGH_GET + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) + kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); +#endif +} + +static inline int kmap_local_calc_idx(int idx) +{ + return idx + KM_MAX_IDX * smp_processor_id(); +} + +static pte_t *__kmap_pte; + +static pte_t *kmap_get_pte(void) +{ + if (!__kmap_pte) + __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + return __kmap_pte; +} + +void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) +{ + pte_t pteval, *kmap_pte = kmap_get_pte(); + unsigned long vaddr; + int idx; + + preempt_disable(); + idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + BUG_ON(!pte_none(*(kmap_pte - idx))); + pteval = pfn_pte(pfn, prot); + set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(vaddr, pteval); + preempt_enable(); + + return (void *)vaddr; +} 
+EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); + +void *__kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + void *kmap; + + if (!PageHighMem(page)) + return page_address(page); + + /* Try kmap_high_get() if architecture has it enabled */ + kmap = arch_kmap_local_high_get(page); + if (kmap) + return kmap; + + return __kmap_local_pfn_prot(page_to_pfn(page), prot); +} +EXPORT_SYMBOL(__kmap_local_page_prot); + +void kunmap_local_indexed(void *vaddr) +{ + unsigned long addr = (unsigned long) vaddr & PAGE_MASK; + pte_t *kmap_pte = kmap_get_pte(); + int idx; + + if (addr < __fix_to_virt(FIX_KMAP_END) || + addr > __fix_to_virt(FIX_KMAP_BEGIN)) { + WARN_ON_ONCE(addr < PAGE_OFFSET); + + /* Handle mappings which were obtained by kmap_high_get() */ + kmap_high_unmap_local(addr); + return; + } + + preempt_disable(); + idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr); + WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); + kmap_local_idx_pop(); + preempt_enable(); +} +EXPORT_SYMBOL(kunmap_local_indexed); +#endif #if defined(HASHED_PAGE_VIRTUAL) -- cgit From 157e118b55113d1e6c7f8ddfcec0a1dbf3a69511 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:20 +0100 Subject: x86/mm/highmem: Use generic kmap atomic implementation Convert X86 to the generic kmap atomic implementation and make the iomap_atomic() naming convention consistent while at it. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201103095857.375127260@linutronix.de --- arch/x86/Kconfig | 3 +- arch/x86/include/asm/fixmap.h | 5 ++- arch/x86/include/asm/highmem.h | 13 +++++--- arch/x86/include/asm/iomap.h | 18 ++++++----- arch/x86/include/asm/kmap_types.h | 13 -------- arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/mm/highmem_32.c | 59 ----------------------------------- arch/x86/mm/init_32.c | 15 --------- arch/x86/mm/iomap_32.c | 59 +++-------------------------------- include/linux/highmem.h | 2 +- include/linux/io-mapping.h | 2 +- mm/highmem.c | 2 +- 12 files changed, 31 insertions(+), 161 deletions(-) delete mode 100644 arch/x86/include/asm/kmap_types.h (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f6946b81f74a..33c273cb3023 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,10 +14,11 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS + select GENERIC_VDSO_32 select HAVE_DEBUG_STACKOVERFLOW + select KMAP_LOCAL select MODULES_USE_ELF_REL select OLD_SIGACTION - select GENERIC_VDSO_32 config X86_64 def_bool y diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 77217bd292bd..8eba66a33e39 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -31,7 +31,7 @@ #include #ifdef CONFIG_X86_32 #include -#include +#include #else #include #endif @@ -94,7 +94,7 @@ enum fixed_addresses { #endif #ifdef CONFIG_X86_32 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #ifdef CONFIG_PCI_MMCONFIG FIX_PCIE_MCFG, #endif @@ -151,7 +151,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; -extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/include/asm/highmem.h 
b/arch/x86/include/asm/highmem.h index 0f420b24e0fc..032e020853aa 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -23,7 +23,6 @@ #include #include -#include #include #include #include @@ -58,11 +57,17 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -void *kmap_atomic_pfn(unsigned long pfn); -void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); - #define flush_cache_kmaps() do { } while (0) +#define arch_kmap_local_post_map(vaddr, pteval) \ + arch_flush_lazy_mmu_mode() + +#define arch_kmap_local_post_unmap(vaddr) \ + do { \ + flush_tlb_one_kernel((vaddr)); \ + arch_flush_lazy_mmu_mode(); \ + } while (0) + extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index bacf68c4d70e..0be7a30fd6bc 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -9,19 +9,21 @@ #include #include #include +#include #include #include -void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); +void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot); -void -iounmap_atomic(void __iomem *kvaddr); +static inline void iounmap_atomic(void __iomem *vaddr) +{ + kunmap_local_indexed((void __force *)vaddr); + pagefault_enable(); + preempt_enable(); +} -int -iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); -void -iomap_free(resource_size_t base, unsigned long size); +void iomap_free(resource_size_t base, unsigned long size); #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h deleted file mode 100644 index 04ab8266e347..000000000000 --- a/arch/x86/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_KMAP_TYPES_H -#define _ASM_X86_KMAP_TYPES_H - -#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif /* _ASM_X86_KMAP_TYPES_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0fad9f61c76a..b6b02b7c19cc 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -41,7 +41,6 @@ #ifndef __ASSEMBLY__ #include -#include #include #include diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 075fe51317b0..2c54b76d8f84 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,65 +4,6 @@ #include /* for totalram_pages */ #include -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); - set_pte(kmap_pte-idx, mk_pte(page, prot)); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. 
- */ -void *kmap_atomic_pfn(unsigned long pfn) -{ - return kmap_atomic_prot_pfn(pfn, kmap_prot); -} -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr >= __fix_to_virt(FIX_KMAP_END) && - vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { - int idx, type; - - type = kmap_atomic_idx(); - idx = type + KM_TYPE_NR * smp_processor_id(); - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - arch_flush_lazy_mmu_mode(); - } -#ifdef CONFIG_DEBUG_HIGHMEM - else { - BUG_ON(vaddr < PAGE_OFFSET); - BUG_ON(vaddr >= (unsigned long)high_memory); - } -#endif -} -EXPORT_SYMBOL(kunmap_atomic_high); - void __init set_highmem_pages_init(void) { struct zone *zone; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7c055259de3a..da31c2635ee4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -394,19 +394,6 @@ repeat: return last_map_addr; } -pte_t *kmap_pte; - -static void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* - * Cache the first kmap pte: - */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = virt_to_kpte(kmap_vstart); -} - #ifdef CONFIG_HIGHMEM static void __init permanent_kmaps_init(pgd_t *pgd_base) { @@ -712,8 +699,6 @@ void __init paging_init(void) __flush_tlb_all(); - kmap_init(); - /* * NOTE: at this point the bootmem allocator is fully available. */ diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index f60398aeb644..e0a40d7cc66c 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -44,28 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size) } EXPORT_SYMBOL_GPL(iomap_free); -void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - preempt_disable(); - pagefault_disable(); - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -} - -/* - * Map 'pfn' using protections 'prot' - */ -void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, translate non-WB request to UC- just in @@ -81,36 +60,8 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) /* Filter out unsupported __PAGE_KERNEL* bits: */ pgprot_val(prot) &= __default_kernel_pte_mask; - return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); -} -EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); - -void -iounmap_atomic(void __iomem *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr >= __fix_to_virt(FIX_KMAP_END) && - vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { - int idx, type; - - type = kmap_atomic_idx(); - idx = type + KM_TYPE_NR * smp_processor_id(); - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. 
Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - } - - pagefault_enable(); - preempt_enable(); + preempt_disable(); + pagefault_disable(); + return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); } -EXPORT_SYMBOL_GPL(iounmap_atomic); +EXPORT_SYMBOL_GPL(iomap_atomic_pfn_prot); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index f5ecee9c2576..1222a31be842 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -217,7 +217,7 @@ static inline void __kunmap_atomic(void *addr) #endif /* CONFIG_HIGHMEM */ #if !defined(CONFIG_KMAP_LOCAL) -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +#if defined(CONFIG_HIGHMEM) DECLARE_PER_CPU(int, __kmap_atomic_idx); diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index c75e4d3d8833..3b0940be72e9 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -69,7 +69,7 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; - return iomap_atomic_prot_pfn(PHYS_PFN(phys_addr), mapping->prot); + return iomap_atomic_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); } static inline void diff --git a/mm/highmem.c b/mm/highmem.c index 67d2d5983cb0..77677c6844f7 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -32,7 +32,7 @@ #include #ifndef CONFIG_KMAP_LOCAL -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +#ifdef CONFIG_HIGHMEM DEFINE_PER_CPU(int, __kmap_atomic_idx); #endif #endif -- cgit From d7029e4549691ecaf1ead536d3322a00bda85659 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:30 +0100 Subject: highmem: Get rid of kmap_types.h The header is no longer used, and on alpha, ia64, openrisc, parisc and um it was completely unused anyway as these architectures have no highmem support. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: Andrew Morton Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095858.422094352@linutronix.de --- arch/alpha/include/asm/kmap_types.h | 15 --------------- arch/ia64/include/asm/kmap_types.h | 13 ------------- arch/openrisc/mm/init.c | 1 - arch/openrisc/mm/ioremap.c | 1 - arch/parisc/include/asm/kmap_types.h | 13 ------------- arch/um/include/asm/fixmap.h | 1 - arch/um/include/asm/kmap_types.h | 13 ------------- include/asm-generic/Kbuild | 1 - include/asm-generic/kmap_types.h | 11 ----------- include/linux/highmem.h | 2 -- 10 files changed, 71 deletions(-) delete mode 100644 arch/alpha/include/asm/kmap_types.h delete mode 100644 arch/ia64/include/asm/kmap_types.h delete mode 100644 arch/parisc/include/asm/kmap_types.h delete mode 100644 arch/um/include/asm/kmap_types.h delete mode 100644 include/asm-generic/kmap_types.h (limited to 'include/linux') diff --git a/arch/alpha/include/asm/kmap_types.h b/arch/alpha/include/asm/kmap_types.h deleted file mode 100644 index 651714b45729..000000000000 --- a/arch/alpha/include/asm/kmap_types.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -/* Dummy header just to define km_type.
*/ - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/ia64/include/asm/kmap_types.h b/arch/ia64/include/asm/kmap_types.h deleted file mode 100644 index 5c268cf7c2bd..000000000000 --- a/arch/ia64/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_IA64_KMAP_TYPES_H -#define _ASM_IA64_KMAP_TYPES_H - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif /* _ASM_IA64_KMAP_TYPES_H */ diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 8348feaaf46e..bf9b2310fc93 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index a978590d802d..5aed97a18bac 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/parisc/include/asm/kmap_types.h b/arch/parisc/include/asm/kmap_types.h deleted file mode 100644 index 3e70b5cd1123..000000000000 --- a/arch/parisc/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/um/include/asm/fixmap.h b/arch/um/include/asm/fixmap.h index 2c697a145ac1..2efac5827188 100644 --- a/arch/um/include/asm/fixmap.h +++ b/arch/um/include/asm/fixmap.h @@ -3,7 +3,6 @@ #define __UM_FIXMAP_H #include -#include #include #include #include diff --git a/arch/um/include/asm/kmap_types.h b/arch/um/include/asm/kmap_types.h deleted file mode 100644 index b0bd12de1d23..000000000000 --- a/arch/um/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - */ - -#ifndef __UM_KMAP_TYPES_H -#define __UM_KMAP_TYPES_H - -/* No more #include "asm/arch/kmap_types.h" ! */ - -#define KM_TYPE_NR 14 - -#endif diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index ed62d384fb79..4365b9aa3e3f 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -30,7 +30,6 @@ mandatory-y += irq.h mandatory-y += irq_regs.h mandatory-y += irq_work.h mandatory-y += kdebug.h -mandatory-y += kmap_types.h mandatory-y += kmap_size.h mandatory-y += kprobes.h mandatory-y += linkage.h diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h deleted file mode 100644 index 9f95b7b63d19..000000000000 --- a/include/asm-generic/kmap_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_GENERIC_KMAP_TYPES_H -#define _ASM_GENERIC_KMAP_TYPES_H - -#ifdef __WITH_KM_FENCE -# define KM_TYPE_NR 41 -#else -# define KM_TYPE_NR 20 -#endif - -#endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 1222a31be842..de78869454b1 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -29,8 +29,6 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) } #endif -#include - /* * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. 
*/ -- cgit From 3c1016b53c311906878c703af1e2b29855a9a962 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:31 +0100 Subject: mm/highmem: Remove the old kmap_atomic cruft All users gone. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Christoph Hellwig Cc: Andrew Morton Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095858.516281567@linutronix.de --- include/linux/highmem.h | 63 ++++--------------------------------------------- mm/highmem.c | 7 +----- 2 files changed, 5 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index de78869454b1..3180a8f7307f 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -86,31 +86,16 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. */ - -#ifndef CONFIG_KMAP_LOCAL -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -void kunmap_atomic_high(void *kvaddr); - static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { preempt_disable(); pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_atomic_high_prot(page, prot); -} - -static inline void __kunmap_atomic(void *vaddr) -{ - kunmap_atomic_high(vaddr); + return __kmap_local_page_prot(page, prot); } -#else /* !CONFIG_KMAP_LOCAL */ -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +static inline void *kmap_atomic(struct page *page) { - preempt_disable(); - pagefault_disable(); - return __kmap_local_page_prot(page, prot); + return kmap_atomic_prot(page, kmap_prot); } static inline void *kmap_atomic_pfn(unsigned long pfn) @@ -125,13 +110,6 @@ static inline void __kunmap_atomic(void *addr) kunmap_local_indexed(addr); } -#endif /* CONFIG_KMAP_LOCAL */ - -static inline void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} - /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; @@ -212,41 +190,8 @@ static inline void __kunmap_atomic(void *addr) #define kmap_flush_unused() do {} while(0) -#endif /* CONFIG_HIGHMEM */ - -#if !defined(CONFIG_KMAP_LOCAL) -#if defined(CONFIG_HIGHMEM) - -DECLARE_PER_CPU(int, __kmap_atomic_idx); - -static inline int kmap_atomic_idx_push(void) -{ - int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(in_irq() && !irqs_disabled()); - BUG_ON(idx >= KM_TYPE_NR); -#endif - return idx; -} - -static inline int kmap_atomic_idx(void) -{ - return __this_cpu_read(__kmap_atomic_idx) - 1; -} -static inline void kmap_atomic_idx_pop(void) -{ -#ifdef CONFIG_DEBUG_HIGHMEM - int idx = __this_cpu_dec_return(__kmap_atomic_idx); - - BUG_ON(idx < 0); -#else - __this_cpu_dec(__kmap_atomic_idx); -#endif -} -#endif -#endif +#endif /* CONFIG_HIGHMEM */ /* * Prevent people trying to call kunmap_atomic() as if it were kunmap() diff --git a/mm/highmem.c b/mm/highmem.c index 77677c6844f7..499dfafd36b7 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -31,12 +31,6 @@ #include #include -#ifndef CONFIG_KMAP_LOCAL -#ifdef CONFIG_HIGHMEM -DEFINE_PER_CPU(int, __kmap_atomic_idx); -#endif -#endif - /* * Virtual_count is not a pure "count". 
* 0 means that it is not mapped, and has not been mapped @@ -410,6 +404,7 @@ static inline void kmap_local_idx_pop(void) #ifndef arch_kmap_local_post_map # define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) #endif + #ifndef arch_kmap_local_pre_unmap # define arch_kmap_local_pre_unmap(vaddr) do { } while (0) #endif -- cgit From 351191ad55c8a1eccaf23e4187c62056229c0779 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:32 +0100 Subject: io-mapping: Cleanup atomic iomap Switch the atomic iomap implementation over to kmap_local and stick the preempt/pagefault mechanics into the generic code similar to the kmap_atomic variants. Rename the x86 map function in preparation for a non-atomic variant. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095858.625310005@linutronix.de --- arch/x86/include/asm/iomap.h | 9 +-------- arch/x86/mm/iomap_32.c | 6 ++---- include/linux/io-mapping.h | 8 ++++++-- 3 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 0be7a30fd6bc..e2de092fc38c 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -13,14 +13,7 @@ #include #include -void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot); - -static inline void iounmap_atomic(void __iomem *vaddr) -{ - kunmap_local_indexed((void __force *)vaddr); - pagefault_enable(); - preempt_enable(); -} +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot); int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index e0a40d7cc66c..9aaa756ddf21 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -44,7 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size) } EXPORT_SYMBOL_GPL(iomap_free); -void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot) +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, translate non-WB request to UC- just in @@ -60,8 +60,6 @@ void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot) /* Filter out unsupported __PAGE_KERNEL* bits: */ pgprot_val(prot) &= __default_kernel_pte_mask; - preempt_disable(); - pagefault_disable(); return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); } -EXPORT_SYMBOL_GPL(iomap_atomic_pfn_prot); +EXPORT_SYMBOL_GPL(__iomap_local_pfn_prot); diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 3b0940be72e9..60e7c83e4904 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -69,13 +69,17 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; - return iomap_atomic_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); + preempt_disable(); + pagefault_disable(); + return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); } static inline void io_mapping_unmap_atomic(void __iomem *vaddr) { - iounmap_atomic(vaddr); + kunmap_local_indexed((void __force *)vaddr); + pagefault_enable(); + preempt_enable(); } static inline void __iomem * -- cgit From 13f876ba77ebd5125799bb042201f22cf73df154 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:34 +0100 Subject: highmem: Hide implementation details and document API Move the gory details of kmap et al. into a private header and only document the
interfaces which are usable by drivers. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095858.827582066@linutronix.de --- include/linux/highmem-internal.h | 174 +++++++++++++++++++++++++ include/linux/highmem.h | 266 ++++++++++++++------------------------- mm/highmem.c | 11 +- 3 files changed, 274 insertions(+), 177 deletions(-) create mode 100644 include/linux/highmem-internal.h (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h new file mode 100644 index 000000000000..6ceed907b14e --- /dev/null +++ b/include/linux/highmem-internal.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_HIGHMEM_INTERNAL_H +#define _LINUX_HIGHMEM_INTERNAL_H + +/* + * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. + */ +#ifdef CONFIG_KMAP_LOCAL +void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); +void *__kmap_local_page_prot(struct page *page, pgprot_t prot); +void kunmap_local_indexed(void *vaddr); +#endif + +#ifdef CONFIG_HIGHMEM +#include + +#ifndef ARCH_HAS_KMAP_FLUSH_TLB +static inline void kmap_flush_tlb(unsigned long addr) { } +#endif + +#ifndef kmap_prot +#define kmap_prot PAGE_KERNEL +#endif + +void *kmap_high(struct page *page); +void kunmap_high(struct page *page); +void __kmap_flush_unused(void); +struct page *__kmap_to_page(void *addr); + +static inline void *kmap(struct page *page) +{ + void *addr; + + might_sleep(); + if (!PageHighMem(page)) + addr = page_address(page); + else + addr = kmap_high(page); + kmap_flush_tlb((unsigned long)addr); + return addr; +} + +static inline void kunmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + +static inline struct page *kmap_to_page(void *addr) +{ + return __kmap_to_page(addr); +} + +static inline void kmap_flush_unused(void) +{ + __kmap_flush_unused(); +} + +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + return __kmap_local_page_prot(page, prot); +} + +static inline void *kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} + +static inline void *kmap_atomic_pfn(unsigned long pfn) +{ + preempt_disable(); + pagefault_disable(); + return __kmap_local_pfn_prot(pfn, kmap_prot); +} + +static inline void __kunmap_atomic(void *addr) +{ + kunmap_local_indexed(addr); + pagefault_enable(); + preempt_enable(); +} + +unsigned int __nr_free_highpages(void); +extern atomic_long_t _totalhigh_pages; + +static inline unsigned int nr_free_highpages(void) +{ + return __nr_free_highpages(); +} + +static inline unsigned long totalhigh_pages(void) +{ + return (unsigned long)atomic_long_read(&_totalhigh_pages); +} + +static inline void totalhigh_pages_inc(void) +{ + atomic_long_inc(&_totalhigh_pages); +} + +static inline void totalhigh_pages_add(long count) +{ + atomic_long_add(count, &_totalhigh_pages); +} + +#else /* CONFIG_HIGHMEM */ + +static inline struct page *kmap_to_page(void *addr) +{ + return virt_to_page(addr); +} + +static inline void *kmap(struct page *page) +{ + might_sleep(); + return page_address(page); +} + +static inline void kunmap_high(struct page *page) { } +static inline void kmap_flush_unused(void) { } + +static inline void kunmap(struct page *page) +{ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(page_address(page)); +#endif +} + +static inline void *kmap_atomic(struct page *page) +{ + 
preempt_disable(); + pagefault_disable(); + return page_address(page); +} + +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + return kmap_atomic(page); +} + +static inline void *kmap_atomic_pfn(unsigned long pfn) +{ + return kmap_atomic(pfn_to_page(pfn)); +} + +static inline void __kunmap_atomic(void *addr) +{ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); +#endif + pagefault_enable(); + preempt_enable(); +} + +static inline unsigned int nr_free_highpages(void) { return 0; } +static inline unsigned long totalhigh_pages(void) { return 0UL; } + +#endif /* CONFIG_HIGHMEM */ + +/* + * Prevent people trying to call kunmap_atomic() as if it were kunmap() + * kunmap_atomic() should get the return value of kmap_atomic, not the page. + */ +#define kunmap_atomic(__addr) \ +do { \ + BUILD_BUG_ON(__same_type((__addr), struct page *)); \ + __kunmap_atomic(__addr); \ +} while (0) + +#endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 3180a8f7307f..7d098bd621f6 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -11,199 +11,125 @@ #include -#ifndef ARCH_HAS_FLUSH_ANON_PAGE -static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) -{ -} -#endif - -#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ -} -static inline void flush_kernel_vmap_range(void *vaddr, int size) -{ -} -static inline void invalidate_kernel_vmap_range(void *vaddr, int size) -{ -} -#endif +#include "highmem-internal.h" -/* - * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. +/** + * kmap - Map a page for long term usage + * @page: Pointer to the page to be mapped + * + * Returns: The virtual address of the mapping + * + * Can only be invoked from preemptible task context because on 32bit + * systems with CONFIG_HIGHMEM enabled this function might sleep. + * + * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area + * this returns the virtual address of the direct kernel mapping. + * + * The returned virtual address is globally visible and valid up to the + * point where it is unmapped via kunmap(). The pointer can be handed to + * other contexts. + * + * For highmem pages on 32bit systems this can be slow as the mapping space + * is limited and protected by a global lock. In case that there is no + * mapping slot available the function blocks until a slot is released via + * kunmap(). */ -#ifdef CONFIG_KMAP_LOCAL -void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); -void *__kmap_local_page_prot(struct page *page, pgprot_t prot); -void kunmap_local_indexed(void *vaddr); -#endif - -#ifdef CONFIG_HIGHMEM -#include +static inline void *kmap(struct page *page); -#ifndef ARCH_HAS_KMAP_FLUSH_TLB -static inline void kmap_flush_tlb(unsigned long addr) { } -#endif - -#ifndef kmap_prot -#define kmap_prot PAGE_KERNEL -#endif - -void *kmap_high(struct page *page); -static inline void *kmap(struct page *page) -{ - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - addr = page_address(page); - else - addr = kmap_high(page); - kmap_flush_tlb((unsigned long)addr); - return addr; -} +/** + * kunmap - Unmap the virtual address mapped by kmap() + * @addr: Virtual address to be unmapped + * + * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of + * pages in the low memory area. 
+ */ +static inline void kunmap(struct page *page); -void kunmap_high(struct page *page); +/** + * kmap_to_page - Get the page for a kmap'ed address + * @addr: The address to look up + * + * Returns: The page which is mapped to @addr. + */ +static inline struct page *kmap_to_page(void *addr); -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} +/** + * kmap_flush_unused - Flush all unused kmap mappings in order to + * remove stray mappings + */ +static inline void kmap_flush_unused(void); -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. +/** + * kmap_atomic - Atomically map a page for temporary usage + * @page: Pointer to the page to be mapped + * + * Returns: The virtual address of the mapping + * + * Side effect: On return pagefaults and preemption are disabled. + * + * Can be invoked from any context. * - * However when holding an atomic kmap it is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. + * Requires careful handling when nesting multiple mappings because the map + * management is stack based. The unmap has to be in the reverse order of + * the map operation: * - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. + * addr1 = kmap_atomic(page1); + * addr2 = kmap_atomic(page2); + * ... + * kunmap_atomic(addr2); + * kunmap_atomic(addr1); + * + * Unmapping addr1 before addr2 is invalid and causes malfunction. + * + * Contrary to kmap() mappings the mapping is only valid in the context of + * the caller and cannot be handed to other contexts. + * + * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the + * virtual address of the direct mapping. Only real highmem pages are + * temporarily mapped. + * + * While it is significantly faster than kmap() it comes with restrictions + * about the pointer validity and the side effects of disabling page faults + * and preemption. Use it only when absolutely necessary, e.g. from non + * preemptible contexts. 
*/ -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - return __kmap_local_page_prot(page, prot); -} +static inline void *kmap_atomic(struct page *page); -static inline void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} - -static inline void *kmap_atomic_pfn(unsigned long pfn) -{ - preempt_disable(); - pagefault_disable(); - return __kmap_local_pfn_prot(pfn, kmap_prot); -} - -static inline void __kunmap_atomic(void *addr) -{ - kunmap_local_indexed(addr); -} - -/* declarations for linux/mm/highmem.c */ -unsigned int nr_free_highpages(void); -extern atomic_long_t _totalhigh_pages; -static inline unsigned long totalhigh_pages(void) -{ - return (unsigned long)atomic_long_read(&_totalhigh_pages); -} - -static inline void totalhigh_pages_inc(void) -{ - atomic_long_inc(&_totalhigh_pages); -} - -static inline void totalhigh_pages_add(long count) -{ - atomic_long_add(count, &_totalhigh_pages); -} - -void kmap_flush_unused(void); - -struct page *kmap_to_page(void *addr); - -#else /* CONFIG_HIGHMEM */ - -static inline unsigned int nr_free_highpages(void) { return 0; } - -static inline struct page *kmap_to_page(void *addr) -{ - return virt_to_page(addr); -} - -static inline unsigned long totalhigh_pages(void) { return 0UL; } +/** + * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() + * @addr: Virtual address to be unmapped + * + * Counterpart to kmap_atomic(). + * + * Undoes the side effects of kmap_atomic(), i.e. reenabling pagefaults and + * preemption. + * + * Other than that a NOOP for CONFIG_HIGHMEM=n and for mappings of pages + * in the low memory area. For real highmen pages the mapping which was + * established with kmap_atomic() is destroyed. + */ -static inline void *kmap(struct page *page) -{ - might_sleep(); - return page_address(page); -} +/* Highmem related interfaces for management code */ +static inline unsigned int nr_free_highpages(void); +static inline unsigned long totalhigh_pages(void); -static inline void kunmap_high(struct page *page) +#ifndef ARCH_HAS_FLUSH_ANON_PAGE +static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } - -static inline void kunmap(struct page *page) -{ -#ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(page_address(page)); #endif -} -static inline void *kmap_atomic(struct page *page) +#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +static inline void flush_kernel_dcache_page(struct page *page) { - preempt_disable(); - pagefault_disable(); - return page_address(page); } - -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +static inline void flush_kernel_vmap_range(void *vaddr, int size) { - return kmap_atomic(page); } - -static inline void *kmap_atomic_pfn(unsigned long pfn) +static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { - return kmap_atomic(pfn_to_page(pfn)); } - -static inline void __kunmap_atomic(void *addr) -{ - /* - * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() - * handles re-enabling faults and preemption - */ -#ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); #endif -} - -#define kmap_flush_unused() do {} while(0) - - -#endif /* CONFIG_HIGHMEM */ - -/* - * Prevent people trying to call kunmap_atomic() as if it were kunmap() - * kunmap_atomic() should get the return value of kmap_atomic, not the page. 
- */ -#define kunmap_atomic(__addr) \ -do { \ - BUILD_BUG_ON(__same_type((__addr), struct page *)); \ - __kunmap_atomic(__addr); \ - pagefault_enable(); \ - preempt_enable(); \ -} while (0) /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage diff --git a/mm/highmem.c b/mm/highmem.c index 499dfafd36b7..54bd233846c9 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) atomic_long_t _totalhigh_pages __read_mostly; EXPORT_SYMBOL(_totalhigh_pages); -unsigned int nr_free_highpages (void) +unsigned int __nr_free_highpages (void) { struct zone *zone; unsigned int pages = 0; @@ -141,7 +141,7 @@ pte_t * pkmap_page_table; do { spin_unlock(&kmap_lock); (void)(flags); } while (0) #endif -struct page *kmap_to_page(void *vaddr) +struct page *__kmap_to_page(void *vaddr) { unsigned long addr = (unsigned long)vaddr; @@ -152,7 +152,7 @@ struct page *kmap_to_page(void *vaddr) return virt_to_page(addr); } -EXPORT_SYMBOL(kmap_to_page); +EXPORT_SYMBOL(__kmap_to_page); static void flush_all_zero_pkmaps(void) { @@ -194,10 +194,7 @@ static void flush_all_zero_pkmaps(void) flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } -/** - * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings - */ -void kmap_flush_unused(void) +void __kmap_flush_unused(void) { lock_kmap(); flush_all_zero_pkmaps(); -- cgit From 3fcd6a230fa7d03bffcb831a81b40435c146c12b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 Sep 2020 15:23:29 -0700 Subject: x86/cpu: Avoid cpuinfo-induced IPIing of idle CPUs Currently, accessing /proc/cpuinfo sends IPIs to idle CPUs in order to learn their clock frequency. Which is a bit strange, given that waking them from idle likely significantly changes their clock frequency. This commit therefore avoids sending /proc/cpuinfo-induced IPIs to idle CPUs. [ paulmck: Also check for idle in arch_freq_prepare_all(). ] Signed-off-by: Paul E. McKenney Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: --- arch/x86/kernel/cpu/aperfmperf.c | 6 ++++++ include/linux/rcutiny.h | 2 ++ include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 8 ++++++++ 4 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index dd3261dab0fb..22911deacb6e 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "cpu.h" @@ -93,6 +94,9 @@ unsigned int aperfmperf_get_khz(int cpu) if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) return 0; + if (rcu_is_idle_cpu(cpu)) + return 0; /* Idle CPUs are completely uninteresting. */ + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); return per_cpu(samples.khz, cpu); } @@ -112,6 +116,8 @@ void arch_freq_prepare_all(void) for_each_online_cpu(cpu) { if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) continue; + if (rcu_is_idle_cpu(cpu)) + continue; /* Idle CPUs are completely uninteresting. 
*/
		if (!aperfmperf_snapshot_cpu(cpu, now, false))
			wait = true;
	}
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 7c1ecdb356d8..2a97334eb786 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -89,6 +89,8 @@ static inline void rcu_irq_enter_irqson(void) { }
 static inline void rcu_irq_exit(void) { }
 static inline void rcu_irq_exit_preempt(void) { }
 static inline void rcu_irq_exit_check_preempt(void) { }
+#define rcu_is_idle_cpu(cpu) \
+	(is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq())
 static inline void exit_rcu(void) { }
 static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 {
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 59eb5cd567d7..df578b73960f 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -50,6 +50,7 @@ void rcu_irq_exit(void);
 void rcu_irq_exit_preempt(void);
 void rcu_irq_enter_irqson(void);
 void rcu_irq_exit_irqson(void);
+bool rcu_is_idle_cpu(int cpu);
 
 #ifdef CONFIG_PROVE_RCU
 void rcu_irq_exit_check_preempt(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 06895ef85d69..1d84c0b6a9f3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -341,6 +341,14 @@ static bool rcu_dynticks_in_eqs(int snap)
 	return !(snap & RCU_DYNTICK_CTRL_CTR);
 }
 
+/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
+bool rcu_is_idle_cpu(int cpu)
+{
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+	return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
+}
+
 /*
  * Return true if the CPU corresponding to the specified rcu_data
  * structure has spent some time in an extended quiescent state since
-- cgit

From ede7dc7fa0af619afc08995776eadb9ff3b0a711 Mon Sep 17 00:00:00 2001
From: Harshad Shirwadkar
Date: Thu, 5 Nov 2020 19:58:54 -0800
Subject: jbd2: rename j_maxlen to j_total_len and add jbd2_journal_max_txn_bufs

The on-disk superblock field sb->s_maxlen represents the total size of the
journal including the fast commit area, and is no longer the maximum
number of blocks available for a transaction. The maximum number of blocks
available to a transaction is reduced by the number of fast commit blocks.
So, this patch renames j_maxlen to j_total_len to better represent its
intent. Also, it adds a function to calculate the maximum number of
buffers available for a transaction.

Suggested-by: Jan Kara
Signed-off-by: Harshad Shirwadkar
Link: https://lore.kernel.org/r/20201106035911.1942128-6-harshadshirwadkar@gmail.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/fsmap.c      |  2 +-
 fs/ext4/super.c      |  2 +-
 fs/jbd2/commit.c     |  2 +-
 fs/jbd2/journal.c    | 12 ++++++------
 fs/jbd2/recovery.c   |  6 +++---
 fs/ocfs2/journal.c   |  2 +-
 include/linux/jbd2.h |  9 +++++++--
 7 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index b232c2767534..4c2a9fe30067 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -280,7 +280,7 @@ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys,
 
 	/* Fabricate an rmap entry for the external log device.
*/ irec.fmr_physical = journal->j_blk_offset; - irec.fmr_length = journal->j_maxlen; + irec.fmr_length = journal->j_total_len; irec.fmr_owner = EXT4_FMR_OWN_LOG; irec.fmr_flags = 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 738a6dd4957d..8a6dd433bb70 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3976,7 +3976,7 @@ int ext4_calculate_overhead(struct super_block *sb) * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev) - overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index fa688e163a80..ec516490cb35 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -801,7 +801,7 @@ start_journal_io: if (first_block < journal->j_tail) freed += journal->j_last - journal->j_first; /* Update tail only if we free significant amount of space */ - if (freed < journal->j_maxlen / 4) + if (freed < jbd2_journal_get_max_txn_bufs(journal)) update_tail = 0; } J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0c7c42bd530f..c3c768248527 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1348,7 +1348,7 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_dev = bdev; journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; - journal->j_maxlen = len; + journal->j_total_len = len; /* We need enough buffers to write out full descriptor block. */ n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; @@ -1531,7 +1531,7 @@ static int journal_reset(journal_t *journal) journal->j_commit_sequence = journal->j_transaction_sequence - 1; journal->j_commit_request = journal->j_commit_sequence; - journal->j_max_transaction_buffers = journal->j_maxlen / 4; + journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); /* * As a special case, if the on-disk copy is already marked as needing @@ -1792,15 +1792,15 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) - journal->j_maxlen = be32_to_cpu(sb->s_maxlen); - else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { printk(KERN_WARNING "JBD2: journal file too short\n"); goto out; } if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + be32_to_cpu(sb->s_first) >= journal->j_total_len) { printk(KERN_WARNING "JBD2: Invalid start block of journal: %u\n", be32_to_cpu(sb->s_first)); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index eb2606133cd8..dc0694fcfcd1 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -74,8 +74,8 @@ static int do_readahead(journal_t *journal, unsigned int start) /* Do up to 128K of readahead */ max = start + (128 * 1024 / journal->j_blocksize); - if (max > journal->j_maxlen) - max = journal->j_maxlen; + if (max > journal->j_total_len) + max = journal->j_total_len; /* Do the readahead itself. We'll submit MAXBUF buffer_heads at * a time to the block device IO layer. 
 */
@@ -134,7 +134,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
 
 	*bhp = NULL;
 
-	if (offset >= journal->j_maxlen) {
+	if (offset >= journal->j_total_len) {
 		printk(KERN_ERR "JBD2: corrupted journal superblock\n");
 		return -EFSCORRUPTED;
 	}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index b9a9d69dde7e..db52e843002a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -877,7 +877,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 		goto done;
 	}
 
-	trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
+	trace_ocfs2_journal_init_maxlen(j_journal->j_total_len);
 
 	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
 		  OCFS2_JOURNAL_DIRTY_FL);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 1d5566af48ac..e0b6b53eae64 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -988,9 +988,9 @@ struct journal_s
 	struct block_device	*j_fs_dev;
 
 	/**
-	 * @j_maxlen: Total maximum capacity of the journal region on disk.
+	 * @j_total_len: Total maximum capacity of the journal region on disk.
 	 */
-	unsigned int		j_maxlen;
+	unsigned int		j_total_len;
 
 	/**
	 * @j_reserved_credits:
@@ -1624,6 +1624,11 @@ int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
 int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
 int jbd2_fc_release_bufs(journal_t *journal);
 
+static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal)
+{
+	return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
+}
+
 /*
  * is_journal_abort
  *
-- cgit

From a1e5e465b31d6015fccb359d99053b39e5180466 Mon Sep 17 00:00:00 2001
From: Harshad Shirwadkar
Date: Thu, 5 Nov 2020 19:58:55 -0800
Subject: ext4: clean up the JBD2 API that initializes fast commits

This patch removes the jbd2_fc_init() API and its related functions to
simplify enabling fast commits. With this change, the number of fast
commit blocks to use is solely determined by the JBD2 layer. So, we move
the default value for the minimum number of fast commit blocks from
ext4/fast_commit.h to include/linux/jbd2.h. However, whether or not to use
fast commits is determined by the file system. The file system just sets
the fast commit feature using jbd2_journal_set_features(). The JBD2 layer
then determines how many blocks to use for fast commits (based on the
value found in the JBD2 superblock).

Note that the JBD2 feature flag for fast commits is just an indication
that there are fast commit blocks present on disk. It doesn't tell the
JBD2 layer whether the file system actually intends to use fast commits.
That's why we unconditionally clear the fast commit flag in
journal_reset() after the recovery is done.
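In practice the opt-in now reduces to a single feature call from the file
system at mount time, as the ext4_fill_super() hunk below shows; a minimal
sketch of that call (error handling elided):

	/* Ask JBD2 for fast commits; JBD2 sizes the fast commit area. */
	if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
				       JBD2_FEATURE_INCOMPAT_FAST_COMMIT))
		ext4_msg(sb, KERN_ERR,
			 "Failed to set fast commit journal feature");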
Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-7-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 14 -------- fs/ext4/fast_commit.h | 3 -- fs/ext4/super.c | 8 +++++ fs/jbd2/journal.c | 96 ++++++++++++++++++++++++++++++--------------------- include/linux/jbd2.h | 2 +- 5 files changed, 65 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 9399e9cccb7e..bab60c5d5095 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -2091,8 +2091,6 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, void ext4_fc_init(struct super_block *sb, journal_t *journal) { - int num_fc_blocks; - /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if @@ -2102,18 +2100,6 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; - if (!buffer_uptodate(journal->j_sb_buffer) - && ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, - true)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal"); - return; - } - num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks); - if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks : - EXT4_NUM_FC_BLKS)) { - pr_warn("Error while enabling fast commits, turning off."); - ext4_clear_feature_fast_commit(sb); - } } const char *fc_ineligible_reasons[] = { diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 140fbb6af19e..1d96e0ac8138 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -3,9 +3,6 @@ #ifndef __FAST_COMMIT_H__ #define __FAST_COMMIT_H__ -/* Number of blocks in journal area to allocate for fast commits */ -#define EXT4_NUM_FC_BLKS 256 - /* Fast commit tags */ #define EXT4_FC_TAG_ADD_RANGE 0x0001 #define EXT4_FC_TAG_DEL_RANGE 0x0002 diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8a6dd433bb70..ba02d7c86fb3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4857,6 +4857,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } + if (test_opt2(sb, JOURNAL_FAST_COMMIT) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { + ext4_msg(sb, KERN_ERR, + "Failed to set fast commit journal feature"); + goto failed_mount_wq; + } + /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c3c768248527..500152f0421a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1352,19 +1352,12 @@ static journal_t *journal_init_common(struct block_device *bdev, /* We need enough buffers to write out full descriptor block. 
*/ n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; + journal->j_fc_wbuf = NULL; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); if (!journal->j_wbuf) goto err_cleanup; - if (journal->j_fc_wbufsize > 0) { - journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, - sizeof(struct buffer_head *), - GFP_KERNEL); - if (!journal->j_fc_wbuf) - goto err_cleanup; - } - bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); if (!bh) { pr_err("%s: Cannot get buffer for journal superblock\n", @@ -1378,23 +1371,11 @@ static journal_t *journal_init_common(struct block_device *bdev, err_cleanup: kfree(journal->j_wbuf); - kfree(journal->j_fc_wbuf); jbd2_journal_destroy_revoke(journal); kfree(journal); return NULL; } -int jbd2_fc_init(journal_t *journal, int num_fc_blks) -{ - journal->j_fc_wbufsize = num_fc_blks; - journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, - sizeof(struct buffer_head *), GFP_KERNEL); - if (!journal->j_fc_wbuf) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL(jbd2_fc_init); - /* jbd2_journal_init_dev and jbd2_journal_init_inode: * * Create a journal structure assigned some fixed set of disk blocks to @@ -1512,16 +1493,7 @@ static int journal_reset(journal_t *journal) } journal->j_first = first; - - if (jbd2_has_feature_fast_commit(journal) && - journal->j_fc_wbufsize > 0) { - journal->j_fc_last = last; - journal->j_last = last - journal->j_fc_wbufsize; - journal->j_fc_first = journal->j_last + 1; - journal->j_fc_off = 0; - } else { - journal->j_last = last; - } + journal->j_last = last; journal->j_head = journal->j_first; journal->j_tail = journal->j_first; @@ -1533,6 +1505,13 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); + /* + * Now that journal recovery is done, turn fast commits off here. This + * way, if fast commit was enabled before the crash but if now FS has + * disabled it, we don't enable fast commits. 
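+	 * (A file system that still wants fast commits simply sets the
+	 * feature again via jbd2_journal_set_features(), as the
+	 * ext4_fill_super() hunk above does.)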
+ */ + jbd2_clear_feature_fast_commit(journal); + /* * As a special case, if the on-disk copy is already marked as needing * no recovery (s_start == 0), then we can safely defer the superblock @@ -1872,6 +1851,7 @@ static int load_superblock(journal_t *journal) { int err; journal_superblock_t *sb; + int num_fc_blocks; err = journal_get_superblock(journal); if (err) @@ -1883,15 +1863,17 @@ static int load_superblock(journal_t *journal) journal->j_tail = be32_to_cpu(sb->s_start); journal->j_first = be32_to_cpu(sb->s_first); journal->j_errno = be32_to_cpu(sb->s_errno); + journal->j_last = be32_to_cpu(sb->s_maxlen); - if (jbd2_has_feature_fast_commit(journal) && - journal->j_fc_wbufsize > 0) { + if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - journal->j_last = journal->j_fc_last - journal->j_fc_wbufsize; + num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); + if (!num_fc_blocks) + num_fc_blocks = JBD2_MIN_FC_BLOCKS; + if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) + journal->j_last = journal->j_fc_last - num_fc_blocks; journal->j_fc_first = journal->j_last + 1; journal->j_fc_off = 0; - } else { - journal->j_last = be32_to_cpu(sb->s_maxlen); } return 0; @@ -1954,9 +1936,6 @@ int jbd2_journal_load(journal_t *journal) */ journal->j_flags &= ~JBD2_ABORT; - if (journal->j_fc_wbufsize > 0) - jbd2_journal_set_features(journal, 0, 0, - JBD2_FEATURE_INCOMPAT_FAST_COMMIT); /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ @@ -2040,8 +2019,7 @@ int jbd2_journal_destroy(journal_t *journal) jbd2_journal_destroy_revoke(journal); if (journal->j_chksum_driver) crypto_free_shash(journal->j_chksum_driver); - if (journal->j_fc_wbufsize > 0) - kfree(journal->j_fc_wbuf); + kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); kfree(journal); @@ -2116,6 +2094,37 @@ int jbd2_journal_check_available_features(journal_t *journal, unsigned long comp return 0; } +static int +jbd2_journal_initialize_fast_commit(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned long long num_fc_blks; + + num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); + if (num_fc_blks == 0) + num_fc_blks = JBD2_MIN_FC_BLOCKS; + if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) + return -ENOSPC; + + /* Are we called twice? */ + WARN_ON(journal->j_fc_wbuf != NULL); + journal->j_fc_wbuf = kmalloc_array(num_fc_blks, + sizeof(struct buffer_head *), GFP_KERNEL); + if (!journal->j_fc_wbuf) + return -ENOMEM; + + journal->j_fc_wbufsize = num_fc_blks; + journal->j_fc_last = journal->j_last; + journal->j_last = journal->j_fc_last - num_fc_blks; + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + journal->j_free = journal->j_last - journal->j_first; + journal->j_max_transaction_buffers = + jbd2_journal_get_max_txn_bufs(journal); + + return 0; +} + /** * int jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. 
@@ -2159,6 +2168,13 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb = journal->j_superblock; + if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) { + if (jbd2_journal_initialize_fast_commit(journal)) { + pr_err("JBD2: Cannot enable fast commits.\n"); + return 0; + } + } + /* Load the checksum driver if necessary */ if ((journal->j_chksum_driver == NULL) && INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index e0b6b53eae64..b2caf7bbd8e5 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -68,6 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 +#define JBD2_MIN_FC_BLOCKS 256 #ifdef __KERNEL__ @@ -1614,7 +1615,6 @@ extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ -int jbd2_fc_init(journal_t *journal, int num_fc_blks); int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); -- cgit From c460e5edc85a063ec9cb60addff93d00ed378701 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:57 -0800 Subject: jbd2: don't use state lock during commit path Variables journal->j_fc_off, journal->j_fc_wbuf are accessed during commit path. Since today we allow only one process to perform a fast commit, there is no need take state lock before accessing these variables. This patch removes these locks and adds comments to describe this. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-9-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 6 ------ include/linux/jbd2.h | 10 ++++++---- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 500152f0421a..778ea50fc8d1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -865,7 +865,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) int fc_off; *bh_out = NULL; - write_lock(&journal->j_state_lock); if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) { fc_off = journal->j_fc_off; @@ -874,7 +873,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) } else { ret = -EINVAL; } - write_unlock(&journal->j_state_lock); if (ret) return ret; @@ -909,9 +907,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) struct buffer_head *bh; int i, j_fc_off; - read_lock(&journal->j_state_lock); j_fc_off = journal->j_fc_off; - read_unlock(&journal->j_state_lock); /* * Wait in reverse order to minimize chances of us being woken up before @@ -939,9 +935,7 @@ int jbd2_fc_release_bufs(journal_t *journal) struct buffer_head *bh; int i, j_fc_off; - read_lock(&journal->j_state_lock); j_fc_off = journal->j_fc_off; - read_unlock(&journal->j_state_lock); /* * Wait in reverse order to minimize chances of us being woken up before diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b2caf7bbd8e5..5f0ef6380b0c 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -945,8 +945,9 @@ struct journal_s /** * @j_fc_off: * - * Number of fast commit blocks currently allocated. - * [j_state_lock]. + * Number of fast commit blocks currently allocated. Accessed only + * during fast commit. 
Currently only one process can do a fast commit, so
+	 * this field is not protected by any lock.
 	 */
 	unsigned long		j_fc_off;
 
@@ -1109,8 +1110,9 @@ struct journal_s
 	struct buffer_head	**j_wbuf;
 
 	/**
-	 * @j_fc_wbuf: Array of fast commit bhs for
-	 * jbd2_journal_commit_transaction.
+	 * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only
+	 * during a fast commit. Currently only one process can do a fast
+	 * commit, so this field is not protected by any lock.
 	 */
 	struct buffer_head	**j_fc_wbuf;
 
-- cgit

From 0bce577bf9cae13ae32d391432d0030e3f67fc1d Mon Sep 17 00:00:00 2001
From: Harshad Shirwadkar
Date: Thu, 5 Nov 2020 19:58:58 -0800
Subject: jbd2: don't pass tid to jbd2_fc_end_commit_fallback()

In jbd2_fc_end_commit_fallback(), we know which tid to commit. There's no
need for the caller to pass it.

Suggested-by: Jan Kara
Signed-off-by: Harshad Shirwadkar
Link: https://lore.kernel.org/r/20201106035911.1942128-10-harshadshirwadkar@gmail.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/fast_commit.c |  2 +-
 fs/jbd2/journal.c     | 12 +++++++++---
 include/linux/jbd2.h  |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index bab60c5d5095..e69c580fa91e 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1143,7 +1143,7 @@ out:
 		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
 		nblks, reason, subtid);
 	if (reason == EXT4_FC_REASON_FC_FAILED)
-		return jbd2_fc_end_commit_fallback(journal, commit_tid);
+		return jbd2_fc_end_commit_fallback(journal);
 	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
 		reason == EXT4_FC_REASON_INELIGIBLE)
 		return jbd2_complete_transaction(journal, commit_tid);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 778ea50fc8d1..59166e299cde 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -777,13 +777,19 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
 
 int jbd2_fc_end_commit(journal_t *journal)
 {
-	return __jbd2_fc_end_commit(journal, 0, 0);
+	return __jbd2_fc_end_commit(journal, 0, false);
 }
 EXPORT_SYMBOL(jbd2_fc_end_commit);
 
-int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid)
+int jbd2_fc_end_commit_fallback(journal_t *journal)
 {
-	return __jbd2_fc_end_commit(journal, tid, 1);
+	tid_t tid;
+
+	read_lock(&journal->j_state_lock);
+	tid = journal->j_running_transaction ?
+		journal->j_running_transaction->t_tid : 0;
+	read_unlock(&journal->j_state_lock);
+	return __jbd2_fc_end_commit(journal, tid, true);
 }
 EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 5f0ef6380b0c..1c49fd62ff2e 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1619,7 +1619,7 @@ extern int jbd2_cleanup_journal_tail(journal_t *);
 /* Fast commit related APIs */
 int jbd2_fc_begin_commit(journal_t *journal, tid_t tid);
 int jbd2_fc_end_commit(journal_t *journal);
-int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid);
+int jbd2_fc_end_commit_fallback(journal_t *journal);
 int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out);
 int jbd2_submit_inode_data(struct jbd2_inode *jinode);
 int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
-- cgit

From eda2845ae5e0ae466c1aca715d642b4977311747 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Fri, 30 Oct 2020 18:59:15 +0200
Subject: irqdomain: Remove unused of_device_id forward declaration

There are no users of of_device_id in irqdomain.h. Drop it.
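Both this patch and the next one apply the same C rule to irqdomain.h; as
a quick illustration (example_create() is a made-up name), a declaration
that only uses a pointer to a struct needs nothing more than the type name
in scope, so a header should keep forward declarations for exactly the
types it references and drop the unused ones:

	struct fwnode_handle;	/* forward declaration is sufficient */

	/* Fine: only a pointer to the incomplete type is used. */
	struct irq_domain *example_create(struct fwnode_handle *fwnode);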
Signed-off-by: Andy Shevchenko
Signed-off-by: Thomas Gleixner
Reviewed-by: Rafael J. Wysocki
Link: https://lore.kernel.org/r/20201030165919.86234-2-andriy.shevchenko@linux.intel.com
---
 include/linux/irqdomain.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 71535e87109f..56642188ec21 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -38,7 +38,6 @@
 struct device_node;
 struct irq_domain;
-struct of_device_id;
 struct irq_chip;
 struct irq_data;
 struct cpumask;
-- cgit

From 08219fb1efae451c83281cb6ba4bb6c35ac88fab Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Fri, 30 Oct 2020 18:59:16 +0200
Subject: irqdomain: Add forward declaration of fwnode_handle

irqdomain.h is a user of struct fwnode_handle. Add a forward declaration
of it.

Signed-off-by: Andy Shevchenko
Signed-off-by: Thomas Gleixner
Reviewed-by: Rafael J. Wysocki
Link: https://lore.kernel.org/r/20201030165919.86234-3-andriy.shevchenko@linux.intel.com
---
 include/linux/irqdomain.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 56642188ec21..d21f75d294d7 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -37,6 +37,7 @@
 #include 
 
 struct device_node;
+struct fwnode_handle;
 struct irq_domain;
 struct irq_chip;
 struct irq_data;
-- cgit

From b6e95788fde8c9bc9da729102085dd36a5a0cda6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Fri, 30 Oct 2020 18:59:18 +0200
Subject: irqdomain: Introduce irq_domain_create_legacy() API

Introduce the irq_domain_create_legacy() API, which is functionally
equivalent to the existing irq_domain_add_legacy() but takes a pointer
to a struct fwnode_handle as a parameter. This is useful for non-OF
systems.

Signed-off-by: Andy Shevchenko
Signed-off-by: Thomas Gleixner
Reviewed-by: Rafael J. Wysocki
Link: https://lore.kernel.org/r/20201030165919.86234-5-andriy.shevchenko@linux.intel.com
---
 Documentation/core-api/irq/irq-domain.rst |  6 ++++++
 include/linux/irqdomain.h                 |  6 ++++++
 kernel/irq/irqdomain.c                    | 17 ++++++++++++++---
 3 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/irq/irq-domain.rst b/Documentation/core-api/irq/irq-domain.rst
index 096db12f32d5..a77c24c27f7b 100644
--- a/Documentation/core-api/irq/irq-domain.rst
+++ b/Documentation/core-api/irq/irq-domain.rst
@@ -147,6 +147,7 @@ Legacy
 	irq_domain_add_simple()
 	irq_domain_add_legacy()
 	irq_domain_add_legacy_isa()
+	irq_domain_create_legacy()
 
 The Legacy mapping is a special case for drivers that already have a
 range of irq_descs allocated for the hwirqs. It is used when the
@@ -185,6 +186,11 @@ that the driver using the simple domain call irq_create_mapping()
 before any irq_find_mapping() since the latter will actually work
 for the static IRQ assignment case.
 
+irq_domain_add_legacy() and irq_domain_create_legacy() are functionally
+equivalent, except that the first argument differs: the former accepts an
+Open Firmware specific 'struct device_node', while the latter accepts the
+more general abstraction 'struct fwnode_handle'.
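Concretely, for an OF-based caller the two calls below now produce the
same domain; a sketch (np, size, first_irq, first_hwirq, ops and host_data
are assumed to exist in the caller):

	d = irq_domain_add_legacy(np, size, first_irq, first_hwirq,
				  &ops, host_data);
	/* ...which, after this patch, is just a wrapper around: */
	d = irq_domain_create_legacy(of_node_to_fwnode(np), size, first_irq,
				     first_hwirq, &ops, host_data);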
+
 Hierarchy IRQ domain
 --------------------
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index d21f75d294d7..77bf7d84c673 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -271,6 +271,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 					 irq_hw_number_t first_hwirq,
 					 const struct irq_domain_ops *ops,
 					 void *host_data);
+struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
+					    unsigned int size,
+					    unsigned int first_irq,
+					    irq_hw_number_t first_hwirq,
+					    const struct irq_domain_ops *ops,
+					    void *host_data);
 extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
 						   enum irq_domain_bus_token bus_token);
 extern bool irq_domain_check_msi_remap(void);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 831526f2e728..9c9cb8829f7a 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -350,17 +350,28 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 					 irq_hw_number_t first_hwirq,
 					 const struct irq_domain_ops *ops,
 					 void *host_data)
+{
+	return irq_domain_create_legacy(of_node_to_fwnode(of_node), size,
+					first_irq, first_hwirq, ops, host_data);
+}
+EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
+
+struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
+					    unsigned int size,
+					    unsigned int first_irq,
+					    irq_hw_number_t first_hwirq,
+					    const struct irq_domain_ops *ops,
+					    void *host_data)
 {
 	struct irq_domain *domain;
 
-	domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size,
-				  first_hwirq + size, 0, ops, host_data);
+	domain = __irq_domain_add(fwnode, first_hwirq + size, first_hwirq + size, 0, ops, host_data);
 	if (domain)
 		irq_domain_associate_many(domain, first_irq, first_hwirq, size);
 
 	return domain;
 }
-EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
+EXPORT_SYMBOL_GPL(irq_domain_create_legacy);
 
 /**
  * irq_find_matching_fwspec() - Locates a domain for a given fwspec
-- cgit

From 1d4ef9b39ebecca827642b8897d2d79ea2026682 Mon Sep 17 00:00:00 2001
From: Cristian Pop
Date: Mon, 28 Sep 2020 12:09:55 +0300
Subject: iio: core: Add optional symbolic label to a device channel

If a label is defined in the device tree for this channel, add it to the
channel-specific attributes. This is useful for userspace to be able to
identify an individual channel.
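A driver opts in by implementing the new callback; a hypothetical sketch
(the my_adc_* names are invented for illustration):

	static int my_adc_read_label(struct iio_dev *indio_dev,
				     struct iio_chan_spec const *chan,
				     char *label)
	{
		return sprintf(label, "vin%d\n", chan->channel);
	}

	static const struct iio_info my_adc_info = {
		.read_label = my_adc_read_label,
	};

The core then exposes the string through a per-channel "label" sysfs
attribute and returns -EINVAL if no callback is provided.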
Signed-off-by: Cristian Pop Link: https://lore.kernel.org/r/20200928090959.88842-1-cristian.pop@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/iio/iio.h | 6 ++++++ 2 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 9955672fc16a..0402be441ffb 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -673,6 +673,19 @@ ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals) } EXPORT_SYMBOL_GPL(iio_format_value); +static ssize_t iio_read_channel_label(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct iio_dev *indio_dev = dev_to_iio_dev(dev); + struct iio_dev_attr *this_attr = to_iio_dev_attr(attr); + + if (!indio_dev->info->read_label) + return -EINVAL; + + return indio_dev->info->read_label(indio_dev, this_attr->c, buf); +} + static ssize_t iio_read_channel_info(struct device *dev, struct device_attribute *attr, char *buf) @@ -1141,6 +1154,29 @@ error_iio_dev_attr_free: return ret; } +static int iio_device_add_channel_label(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan) +{ + struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); + int ret; + + if (!indio_dev->info->read_label) + return 0; + + ret = __iio_add_chan_devattr("label", + chan, + &iio_read_channel_label, + NULL, + 0, + IIO_SEPARATE, + &indio_dev->dev, + &iio_dev_opaque->channel_attr_list); + if (ret < 0) + return ret; + + return 1; +} + static int iio_device_add_info_mask_type(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, enum iio_shared_by shared_by, @@ -1274,6 +1310,11 @@ static int iio_device_add_channel_sysfs(struct iio_dev *indio_dev, return ret; attrcount += ret; + ret = iio_device_add_channel_label(indio_dev, chan); + if (ret < 0) + return ret; + attrcount += ret; + if (chan->ext_info) { unsigned int i = 0; for (ext_info = chan->ext_info; ext_info->name; ext_info++) { diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 2e45b3ceafa7..9a3cf4815148 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -362,6 +362,8 @@ struct iio_trigger; /* forward declaration */ * and max. For lists, all possible values are enumerated. * @write_raw: function to write a value to the device. * Parameters are the same as for read_raw. + * @read_label: function to request label name for a specified label, + * for better channel identification. * @write_raw_get_fmt: callback function to query the expected * format/precision. If not set by the driver, write_raw * returns IIO_VAL_INT_PLUS_MICRO. @@ -420,6 +422,10 @@ struct iio_info { int val2, long mask); + int (*read_label)(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + char *label); + int (*write_raw_get_fmt)(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, long mask); -- cgit From 0e30f47232ab57c685258aa91adc3a3e67bd023e Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 5 Oct 2020 21:01:26 +0530 Subject: mtd: spi-nor: add support for DTR protocol Double Transfer Rate (DTR) is SPI protocol in which data is transferred on each clock edge as opposed to on each clock cycle. Make framework-level changes to allow supporting flashes in DTR mode. Right now, mixed DTR modes are not supported. So, for example a mode like 4S-4D-4D will not work. All phases need to be either DTR or STR. 
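Since mixed modes are ruled out, the helper introduced below switches
every phase of an op to DTR together; a condensed view of what
spi_nor_spimem_setup_op() does for a DTR protocol (ext is the opcode
extension byte derived from the command opcode):

	op->cmd.dtr = true;
	op->addr.dtr = true;
	op->dummy.dtr = true;
	op->data.dtr = true;
	/* Two bytes are transferred per clock cycle in DTR mode. */
	op->dummy.nbytes *= 2;
	/* The command becomes two bytes wide: opcode plus extension. */
	op->cmd.opcode = (op->cmd.opcode << 8) | ext;
	op->cmd.nbytes = 2;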
The xSPI spec says that "The program commands provide SPI backward
compatible commands for programming data...". So 8D-8D-8D page program
opcodes are populated using the 1S-1S-1S opcodes.

Signed-off-by: Pratyush Yadav
Signed-off-by: Vignesh Raghavendra
Reviewed-by: Tudor Ambarus
Link: https://lore.kernel.org/r/20201005153138.6437-4-p.yadav@ti.com
---
 drivers/mtd/spi-nor/core.c  | 319 ++++++++++++++++++++++++++++++------------
 drivers/mtd/spi-nor/core.h  |   7 +
 drivers/mtd/spi-nor/sfdp.c  |   9 +-
 include/linux/mtd/spi-nor.h |  51 +++++--
 4 files changed, 290 insertions(+), 96 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 2f24bde08c90..3cd025d79a98 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -40,6 +40,78 @@
 
 #define SPI_NOR_MAX_ADDR_WIDTH	4
 
+/**
+ * spi_nor_get_cmd_ext() - Get the command opcode extension based on the
+ *			   extension type.
+ * @nor:	pointer to a 'struct spi_nor'
+ * @op:		pointer to the 'struct spi_mem_op' whose properties
+ *		need to be initialized.
+ *
+ * Right now, only "repeat" and "invert" are supported.
+ *
+ * Return: The opcode extension.
+ */
+static u8 spi_nor_get_cmd_ext(const struct spi_nor *nor,
+			      const struct spi_mem_op *op)
+{
+	switch (nor->cmd_ext_type) {
+	case SPI_NOR_EXT_INVERT:
+		return ~op->cmd.opcode;
+
+	case SPI_NOR_EXT_REPEAT:
+		return op->cmd.opcode;
+
+	default:
+		dev_err(nor->dev, "Unknown command extension type\n");
+		return 0;
+	}
+}
+
+/**
+ * spi_nor_spimem_setup_op() - Set up common properties of a spi-mem op.
+ * @nor:	pointer to a 'struct spi_nor'
+ * @op:		pointer to the 'struct spi_mem_op' whose properties
+ *		need to be initialized.
+ * @proto:	the protocol from which the properties need to be set.
+ */
+void spi_nor_spimem_setup_op(const struct spi_nor *nor,
+			     struct spi_mem_op *op,
+			     const enum spi_nor_protocol proto)
+{
+	u8 ext;
+
+	op->cmd.buswidth = spi_nor_get_protocol_inst_nbits(proto);
+
+	if (op->addr.nbytes)
+		op->addr.buswidth = spi_nor_get_protocol_addr_nbits(proto);
+
+	if (op->dummy.nbytes)
+		op->dummy.buswidth = spi_nor_get_protocol_addr_nbits(proto);
+
+	if (op->data.nbytes)
+		op->data.buswidth = spi_nor_get_protocol_data_nbits(proto);
+
+	if (spi_nor_protocol_is_dtr(proto)) {
+		/*
+		 * SPIMEM supports mixed DTR modes, but right now we can only
+		 * have all phases either DTR or STR. IOW, SPIMEM can have
+		 * something like 4S-4D-4D, but SPI NOR can't. So, set all 4
+		 * phases to either DTR or STR.
+		 */
+		op->cmd.dtr = true;
+		op->addr.dtr = true;
+		op->dummy.dtr = true;
+		op->data.dtr = true;
+
+		/* 2 bytes per clock cycle in DTR mode.
*/ + op->dummy.nbytes *= 2; + + ext = spi_nor_get_cmd_ext(nor, op); + op->cmd.opcode = (op->cmd.opcode << 8) | ext; + op->cmd.nbytes = 2; + } +} + /** * spi_nor_spimem_bounce() - check if a bounce buffer is needed for the data * transfer @@ -85,17 +157,26 @@ static int spi_nor_spimem_exec_op(struct spi_nor *nor, struct spi_mem_op *op) static int spi_nor_controller_ops_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf, size_t len) { + if (spi_nor_protocol_is_dtr(nor->reg_proto)) + return -EOPNOTSUPP; + return nor->controller_ops->read_reg(nor, opcode, buf, len); } static int spi_nor_controller_ops_write_reg(struct spi_nor *nor, u8 opcode, const u8 *buf, size_t len) { + if (spi_nor_protocol_is_dtr(nor->reg_proto)) + return -EOPNOTSUPP; + return nor->controller_ops->write_reg(nor, opcode, buf, len); } static int spi_nor_controller_ops_erase(struct spi_nor *nor, loff_t offs) { + if (spi_nor_protocol_is_dtr(nor->write_proto)) + return -EOPNOTSUPP; + return nor->controller_ops->erase(nor, offs); } @@ -113,22 +194,20 @@ static ssize_t spi_nor_spimem_read_data(struct spi_nor *nor, loff_t from, size_t len, u8 *buf) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(nor->read_opcode, 1), - SPI_MEM_OP_ADDR(nor->addr_width, from, 1), - SPI_MEM_OP_DUMMY(nor->read_dummy, 1), - SPI_MEM_OP_DATA_IN(len, buf, 1)); + SPI_MEM_OP(SPI_MEM_OP_CMD(nor->read_opcode, 0), + SPI_MEM_OP_ADDR(nor->addr_width, from, 0), + SPI_MEM_OP_DUMMY(nor->read_dummy, 0), + SPI_MEM_OP_DATA_IN(len, buf, 0)); bool usebouncebuf; ssize_t nbytes; int error; - /* get transfer protocols. */ - op.cmd.buswidth = spi_nor_get_protocol_inst_nbits(nor->read_proto); - op.addr.buswidth = spi_nor_get_protocol_addr_nbits(nor->read_proto); - op.dummy.buswidth = op.addr.buswidth; - op.data.buswidth = spi_nor_get_protocol_data_nbits(nor->read_proto); + spi_nor_spimem_setup_op(nor, &op, nor->read_proto); /* convert the dummy cycles to the number of bytes */ op.dummy.nbytes = (nor->read_dummy * op.dummy.buswidth) / 8; + if (spi_nor_protocol_is_dtr(nor->read_proto)) + op.dummy.nbytes *= 2; usebouncebuf = spi_nor_spimem_bounce(nor, &op); @@ -179,20 +258,18 @@ static ssize_t spi_nor_spimem_write_data(struct spi_nor *nor, loff_t to, size_t len, const u8 *buf) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(nor->program_opcode, 1), - SPI_MEM_OP_ADDR(nor->addr_width, to, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(nor->program_opcode, 0), + SPI_MEM_OP_ADDR(nor->addr_width, to, 0), SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_OUT(len, buf, 1)); + SPI_MEM_OP_DATA_OUT(len, buf, 0)); ssize_t nbytes; int error; - op.cmd.buswidth = spi_nor_get_protocol_inst_nbits(nor->write_proto); - op.addr.buswidth = spi_nor_get_protocol_addr_nbits(nor->write_proto); - op.data.buswidth = spi_nor_get_protocol_data_nbits(nor->write_proto); - if (nor->program_opcode == SPINOR_OP_AAI_WP && nor->sst_write_second) op.addr.nbytes = 0; + spi_nor_spimem_setup_op(nor, &op, nor->write_proto); + if (spi_nor_spimem_bounce(nor, &op)) memcpy(nor->bouncebuf, buf, op.data.nbytes); @@ -239,11 +316,13 @@ int spi_nor_write_enable(struct spi_nor *nor) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WREN, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WREN, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, SPI_MEM_OP_NO_DATA); + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); + ret = spi_mem_exec_op(nor->spimem, &op); } else { ret = spi_nor_controller_ops_write_reg(nor, SPINOR_OP_WREN, @@ -268,11 +347,13 @@ int spi_nor_write_disable(struct spi_nor *nor) if (nor->spimem) { struct spi_mem_op 
op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WRDI, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WRDI, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, SPI_MEM_OP_NO_DATA); + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); + ret = spi_mem_exec_op(nor->spimem, &op); } else { ret = spi_nor_controller_ops_write_reg(nor, SPINOR_OP_WRDI, @@ -299,10 +380,12 @@ static int spi_nor_read_sr(struct spi_nor *nor, u8 *sr) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_RDSR, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_RDSR, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_IN(1, sr, 1)); + SPI_MEM_OP_DATA_IN(1, sr, 0)); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); ret = spi_mem_exec_op(nor->spimem, &op); } else { @@ -330,10 +413,12 @@ static int spi_nor_read_fsr(struct spi_nor *nor, u8 *fsr) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_RDFSR, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_RDFSR, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_IN(1, fsr, 1)); + SPI_MEM_OP_DATA_IN(1, fsr, 0)); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); ret = spi_mem_exec_op(nor->spimem, &op); } else { @@ -362,10 +447,12 @@ static int spi_nor_read_cr(struct spi_nor *nor, u8 *cr) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_RDCR, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_RDCR, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_IN(1, cr, 1)); + SPI_MEM_OP_DATA_IN(1, cr, 0)); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); ret = spi_mem_exec_op(nor->spimem, &op); } else { @@ -396,11 +483,13 @@ int spi_nor_set_4byte_addr_mode(struct spi_nor *nor, bool enable) SPI_MEM_OP(SPI_MEM_OP_CMD(enable ? SPINOR_OP_EN4B : SPINOR_OP_EX4B, - 1), + 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, SPI_MEM_OP_NO_DATA); + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); + ret = spi_mem_exec_op(nor->spimem, &op); } else { ret = spi_nor_controller_ops_write_reg(nor, @@ -432,10 +521,12 @@ static int spansion_set_4byte_addr_mode(struct spi_nor *nor, bool enable) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_BRWR, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_BRWR, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_OUT(1, nor->bouncebuf, 1)); + SPI_MEM_OP_DATA_OUT(1, nor->bouncebuf, 0)); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); ret = spi_mem_exec_op(nor->spimem, &op); } else { @@ -464,10 +555,12 @@ int spi_nor_write_ear(struct spi_nor *nor, u8 ear) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WREAR, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WREAR, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_OUT(1, nor->bouncebuf, 1)); + SPI_MEM_OP_DATA_OUT(1, nor->bouncebuf, 0)); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); ret = spi_mem_exec_op(nor->spimem, &op); } else { @@ -495,10 +588,12 @@ int spi_nor_xread_sr(struct spi_nor *nor, u8 *sr) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_XRDSR, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_XRDSR, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_IN(1, sr, 1)); + SPI_MEM_OP_DATA_IN(1, sr, 0)); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); ret = spi_mem_exec_op(nor->spimem, &op); } else { @@ -540,11 +635,13 @@ static void spi_nor_clear_sr(struct spi_nor *nor) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_CLSR, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_CLSR, 0), 
SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, SPI_MEM_OP_NO_DATA); + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); + ret = spi_mem_exec_op(nor->spimem, &op); } else { ret = spi_nor_controller_ops_write_reg(nor, SPINOR_OP_CLSR, @@ -604,11 +701,13 @@ static void spi_nor_clear_fsr(struct spi_nor *nor) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_CLFSR, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_CLFSR, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, SPI_MEM_OP_NO_DATA); + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); + ret = spi_mem_exec_op(nor->spimem, &op); } else { ret = spi_nor_controller_ops_write_reg(nor, SPINOR_OP_CLFSR, @@ -748,10 +847,12 @@ static int spi_nor_write_sr(struct spi_nor *nor, const u8 *sr, size_t len) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WRSR, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WRSR, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_OUT(len, sr, 1)); + SPI_MEM_OP_DATA_OUT(len, sr, 0)); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); ret = spi_mem_exec_op(nor->spimem, &op); } else { @@ -950,10 +1051,12 @@ static int spi_nor_write_sr2(struct spi_nor *nor, const u8 *sr2) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WRSR2, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WRSR2, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_OUT(1, sr2, 1)); + SPI_MEM_OP_DATA_OUT(1, sr2, 0)); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); ret = spi_mem_exec_op(nor->spimem, &op); } else { @@ -984,10 +1087,12 @@ static int spi_nor_read_sr2(struct spi_nor *nor, u8 *sr2) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_RDSR2, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_RDSR2, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_IN(1, sr2, 1)); + SPI_MEM_OP_DATA_IN(1, sr2, 0)); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); ret = spi_mem_exec_op(nor->spimem, &op); } else { @@ -1015,11 +1120,13 @@ static int spi_nor_erase_chip(struct spi_nor *nor) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_CHIP_ERASE, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_CHIP_ERASE, 0), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, SPI_MEM_OP_NO_DATA); + spi_nor_spimem_setup_op(nor, &op, nor->write_proto); + ret = spi_mem_exec_op(nor->spimem, &op); } else { ret = spi_nor_controller_ops_write_reg(nor, @@ -1158,11 +1265,13 @@ static int spi_nor_erase_sector(struct spi_nor *nor, u32 addr) if (nor->spimem) { struct spi_mem_op op = - SPI_MEM_OP(SPI_MEM_OP_CMD(nor->erase_opcode, 1), - SPI_MEM_OP_ADDR(nor->addr_width, addr, 1), + SPI_MEM_OP(SPI_MEM_OP_CMD(nor->erase_opcode, 0), + SPI_MEM_OP_ADDR(nor->addr_width, addr, 0), SPI_MEM_OP_NO_DUMMY, SPI_MEM_OP_NO_DATA); + spi_nor_spimem_setup_op(nor, &op, nor->write_proto); + return spi_mem_exec_op(nor->spimem, &op); } else if (nor->controller_ops->erase) { return spi_nor_controller_ops_erase(nor, addr); @@ -2272,6 +2381,7 @@ int spi_nor_hwcaps_read2cmd(u32 hwcaps) { SNOR_HWCAPS_READ_1_8_8, SNOR_CMD_READ_1_8_8 }, { SNOR_HWCAPS_READ_8_8_8, SNOR_CMD_READ_8_8_8 }, { SNOR_HWCAPS_READ_1_8_8_DTR, SNOR_CMD_READ_1_8_8_DTR }, + { SNOR_HWCAPS_READ_8_8_8_DTR, SNOR_CMD_READ_8_8_8_DTR }, }; return spi_nor_hwcaps2cmd(hwcaps, hwcaps_read2cmd, @@ -2288,6 +2398,7 @@ static int spi_nor_hwcaps_pp2cmd(u32 hwcaps) { SNOR_HWCAPS_PP_1_1_8, SNOR_CMD_PP_1_1_8 }, { SNOR_HWCAPS_PP_1_8_8, SNOR_CMD_PP_1_8_8 }, { SNOR_HWCAPS_PP_8_8_8, SNOR_CMD_PP_8_8_8 }, + { SNOR_HWCAPS_PP_8_8_8_DTR, 
SNOR_CMD_PP_8_8_8_DTR }, }; return spi_nor_hwcaps2cmd(hwcaps, hwcaps_pp2cmd, @@ -2336,17 +2447,17 @@ static int spi_nor_spimem_check_op(struct spi_nor *nor, static int spi_nor_spimem_check_readop(struct spi_nor *nor, const struct spi_nor_read_command *read) { - struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(read->opcode, 1), - SPI_MEM_OP_ADDR(3, 0, 1), - SPI_MEM_OP_DUMMY(0, 1), - SPI_MEM_OP_DATA_IN(0, NULL, 1)); + struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(read->opcode, 0), + SPI_MEM_OP_ADDR(3, 0, 0), + SPI_MEM_OP_DUMMY(1, 0), + SPI_MEM_OP_DATA_IN(1, NULL, 0)); - op.cmd.buswidth = spi_nor_get_protocol_inst_nbits(read->proto); - op.addr.buswidth = spi_nor_get_protocol_addr_nbits(read->proto); - op.data.buswidth = spi_nor_get_protocol_data_nbits(read->proto); - op.dummy.buswidth = op.addr.buswidth; - op.dummy.nbytes = (read->num_mode_clocks + read->num_wait_states) * - op.dummy.buswidth / 8; + spi_nor_spimem_setup_op(nor, &op, read->proto); + + /* convert the dummy cycles to the number of bytes */ + op.dummy.nbytes = (nor->read_dummy * op.dummy.buswidth) / 8; + if (spi_nor_protocol_is_dtr(nor->read_proto)) + op.dummy.nbytes *= 2; return spi_nor_spimem_check_op(nor, &op); } @@ -2362,14 +2473,12 @@ static int spi_nor_spimem_check_readop(struct spi_nor *nor, static int spi_nor_spimem_check_pp(struct spi_nor *nor, const struct spi_nor_pp_command *pp) { - struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(pp->opcode, 1), - SPI_MEM_OP_ADDR(3, 0, 1), + struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(pp->opcode, 0), + SPI_MEM_OP_ADDR(3, 0, 0), SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_OUT(0, NULL, 1)); + SPI_MEM_OP_DATA_OUT(1, NULL, 0)); - op.cmd.buswidth = spi_nor_get_protocol_inst_nbits(pp->proto); - op.addr.buswidth = spi_nor_get_protocol_addr_nbits(pp->proto); - op.data.buswidth = spi_nor_get_protocol_data_nbits(pp->proto); + spi_nor_spimem_setup_op(nor, &op, pp->proto); return spi_nor_spimem_check_op(nor, &op); } @@ -2387,12 +2496,16 @@ spi_nor_spimem_adjust_hwcaps(struct spi_nor *nor, u32 *hwcaps) struct spi_nor_flash_parameter *params = nor->params; unsigned int cap; - /* DTR modes are not supported yet, mask them all. */ - *hwcaps &= ~SNOR_HWCAPS_DTR; - /* X-X-X modes are not supported yet, mask them all. */ *hwcaps &= ~SNOR_HWCAPS_X_X_X; + /* + * If the reset line is broken, we do not want to enter a stateful + * mode. + */ + if (nor->flags & SNOR_F_BROKEN_RESET) + *hwcaps &= ~(SNOR_HWCAPS_X_X_X | SNOR_HWCAPS_X_X_X_DTR); + for (cap = 0; cap < sizeof(*hwcaps) * BITS_PER_BYTE; cap++) { int rdidx, ppidx; @@ -2647,7 +2760,7 @@ static int spi_nor_default_setup(struct spi_nor *nor, * controller directly implements the spi_nor interface. * Yet another reason to switch to spi-mem. */ - ignored_mask = SNOR_HWCAPS_X_X_X; + ignored_mask = SNOR_HWCAPS_X_X_X | SNOR_HWCAPS_X_X_X_DTR; if (shared_mask & ignored_mask) { dev_dbg(nor->dev, "SPI n-n-n protocols are not supported.\n"); @@ -2792,11 +2905,28 @@ static void spi_nor_info_init_params(struct spi_nor *nor) SNOR_PROTO_1_1_8); } + if (info->flags & SPI_NOR_OCTAL_DTR_READ) { + params->hwcaps.mask |= SNOR_HWCAPS_READ_8_8_8_DTR; + spi_nor_set_read_settings(¶ms->reads[SNOR_CMD_READ_8_8_8_DTR], + 0, 20, SPINOR_OP_READ_FAST, + SNOR_PROTO_8_8_8_DTR); + } + /* Page Program settings. 
*/ params->hwcaps.mask |= SNOR_HWCAPS_PP; spi_nor_set_pp_settings(¶ms->page_programs[SNOR_CMD_PP], SPINOR_OP_PP, SNOR_PROTO_1_1_1); + if (info->flags & SPI_NOR_OCTAL_DTR_PP) { + params->hwcaps.mask |= SNOR_HWCAPS_PP_8_8_8_DTR; + /* + * Since xSPI Page Program opcode is backward compatible with + * Legacy SPI, use Legacy SPI opcode there as well. + */ + spi_nor_set_pp_settings(¶ms->page_programs[SNOR_CMD_PP_8_8_8_DTR], + SPINOR_OP_PP, SNOR_PROTO_8_8_8_DTR); + } + /* * Sector Erase settings. Sort Erase Types in ascending order, with the * smallest erase size starting at BIT(0). @@ -2904,7 +3034,8 @@ static int spi_nor_init_params(struct spi_nor *nor) spi_nor_manufacturer_init_params(nor); - if ((nor->info->flags & (SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ)) && + if ((nor->info->flags & (SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | + SPI_NOR_OCTAL_READ | SPI_NOR_OCTAL_DTR_READ)) && !(nor->info->flags & SPI_NOR_SKIP_SFDP)) spi_nor_sfdp_init_params(nor); @@ -2966,7 +3097,9 @@ static int spi_nor_init(struct spi_nor *nor) return err; } - if (nor->addr_width == 4 && !(nor->flags & SNOR_F_4B_OPCODES)) { + if (nor->addr_width == 4 && + nor->read_proto != SNOR_PROTO_8_8_8_DTR && + !(nor->flags & SNOR_F_4B_OPCODES)) { /* * If the RESET# pin isn't hooked up properly, or the system * otherwise doesn't perform a reset command in the boot @@ -3025,6 +3158,20 @@ static int spi_nor_set_addr_width(struct spi_nor *nor) { if (nor->addr_width) { /* already configured from SFDP */ + } else if (nor->read_proto == SNOR_PROTO_8_8_8_DTR) { + /* + * In 8D-8D-8D mode, one byte takes half a cycle to transfer. So + * in this protocol an odd address width cannot be used because + * then the address phase would only span a cycle and a half. + * Half a cycle would be left over. We would then have to start + * the dummy phase in the middle of a cycle and so too the data + * phase, and we will end the transaction with half a cycle left + * over. + * + * Force all 8D-8D-8D flashes to use an address width of 4 to + * avoid this situation. + */ + nor->addr_width = 4; } else if (nor->info->addr_width) { nor->addr_width = nor->info->addr_width; } else { @@ -3255,23 +3402,28 @@ EXPORT_SYMBOL_GPL(spi_nor_scan); static int spi_nor_create_read_dirmap(struct spi_nor *nor) { struct spi_mem_dirmap_info info = { - .op_tmpl = SPI_MEM_OP(SPI_MEM_OP_CMD(nor->read_opcode, 1), - SPI_MEM_OP_ADDR(nor->addr_width, 0, 1), - SPI_MEM_OP_DUMMY(nor->read_dummy, 1), - SPI_MEM_OP_DATA_IN(0, NULL, 1)), + .op_tmpl = SPI_MEM_OP(SPI_MEM_OP_CMD(nor->read_opcode, 0), + SPI_MEM_OP_ADDR(nor->addr_width, 0, 0), + SPI_MEM_OP_DUMMY(nor->read_dummy, 0), + SPI_MEM_OP_DATA_IN(0, NULL, 0)), .offset = 0, .length = nor->mtd.size, }; struct spi_mem_op *op = &info.op_tmpl; - /* get transfer protocols. */ - op->cmd.buswidth = spi_nor_get_protocol_inst_nbits(nor->read_proto); - op->addr.buswidth = spi_nor_get_protocol_addr_nbits(nor->read_proto); - op->dummy.buswidth = op->addr.buswidth; - op->data.buswidth = spi_nor_get_protocol_data_nbits(nor->read_proto); + spi_nor_spimem_setup_op(nor, op, nor->read_proto); /* convert the dummy cycles to the number of bytes */ op->dummy.nbytes = (nor->read_dummy * op->dummy.buswidth) / 8; + if (spi_nor_protocol_is_dtr(nor->read_proto)) + op->dummy.nbytes *= 2; + + /* + * Since spi_nor_spimem_setup_op() only sets buswidth when the number + * of data bytes is non-zero, the data buswidth won't be set here. So, + * do it explicitly. 
+ */ + op->data.buswidth = spi_nor_get_protocol_data_nbits(nor->read_proto); nor->dirmap.rdesc = devm_spi_mem_dirmap_create(nor->dev, nor->spimem, &info); @@ -3281,24 +3433,27 @@ static int spi_nor_create_read_dirmap(struct spi_nor *nor) static int spi_nor_create_write_dirmap(struct spi_nor *nor) { struct spi_mem_dirmap_info info = { - .op_tmpl = SPI_MEM_OP(SPI_MEM_OP_CMD(nor->program_opcode, 1), - SPI_MEM_OP_ADDR(nor->addr_width, 0, 1), + .op_tmpl = SPI_MEM_OP(SPI_MEM_OP_CMD(nor->program_opcode, 0), + SPI_MEM_OP_ADDR(nor->addr_width, 0, 0), SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_OUT(0, NULL, 1)), + SPI_MEM_OP_DATA_OUT(0, NULL, 0)), .offset = 0, .length = nor->mtd.size, }; struct spi_mem_op *op = &info.op_tmpl; - /* get transfer protocols. */ - op->cmd.buswidth = spi_nor_get_protocol_inst_nbits(nor->write_proto); - op->addr.buswidth = spi_nor_get_protocol_addr_nbits(nor->write_proto); - op->dummy.buswidth = op->addr.buswidth; - op->data.buswidth = spi_nor_get_protocol_data_nbits(nor->write_proto); - if (nor->program_opcode == SPINOR_OP_AAI_WP && nor->sst_write_second) op->addr.nbytes = 0; + spi_nor_spimem_setup_op(nor, op, nor->write_proto); + + /* + * Since spi_nor_spimem_setup_op() only sets buswidth when the number + * of data bytes is non-zero, the data buswidth won't be set here. So, + * do it explicitly. + */ + op->data.buswidth = spi_nor_get_protocol_data_nbits(nor->write_proto); + nor->dirmap.wdesc = devm_spi_mem_dirmap_create(nor->dev, nor->spimem, &info); return PTR_ERR_OR_ZERO(nor->dirmap.wdesc); diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h index 6f2f6b27173f..5d95b4183a33 100644 --- a/drivers/mtd/spi-nor/core.h +++ b/drivers/mtd/spi-nor/core.h @@ -62,6 +62,7 @@ enum spi_nor_read_command_index { SNOR_CMD_READ_1_8_8, SNOR_CMD_READ_8_8_8, SNOR_CMD_READ_1_8_8_DTR, + SNOR_CMD_READ_8_8_8_DTR, SNOR_CMD_READ_MAX }; @@ -78,6 +79,7 @@ enum spi_nor_pp_command_index { SNOR_CMD_PP_1_1_8, SNOR_CMD_PP_1_8_8, SNOR_CMD_PP_8_8_8, + SNOR_CMD_PP_8_8_8_DTR, SNOR_CMD_PP_MAX }; @@ -311,6 +313,8 @@ struct flash_info { * BP3 is bit 6 of status register. * Must be used with SPI_NOR_4BIT_BP. */ +#define SPI_NOR_OCTAL_DTR_READ BIT(19) /* Flash supports octal DTR Read. */ +#define SPI_NOR_OCTAL_DTR_PP BIT(20) /* Flash supports Octal DTR Page Program */ /* Part specific fixup hooks. */ const struct spi_nor_fixups *fixups; @@ -399,6 +403,9 @@ extern const struct spi_nor_manufacturer spi_nor_winbond; extern const struct spi_nor_manufacturer spi_nor_xilinx; extern const struct spi_nor_manufacturer spi_nor_xmc; +void spi_nor_spimem_setup_op(const struct spi_nor *nor, + struct spi_mem_op *op, + const enum spi_nor_protocol proto); int spi_nor_write_enable(struct spi_nor *nor); int spi_nor_write_disable(struct spi_nor *nor); int spi_nor_set_4byte_addr_mode(struct spi_nor *nor, bool enable); diff --git a/drivers/mtd/spi-nor/sfdp.c b/drivers/mtd/spi-nor/sfdp.c index e2a43d39eb5f..21fa9ab78eae 100644 --- a/drivers/mtd/spi-nor/sfdp.c +++ b/drivers/mtd/spi-nor/sfdp.c @@ -1047,9 +1047,16 @@ static int spi_nor_parse_4bait(struct spi_nor *nor, } /* 4BAIT is the only SFDP table that indicates page program support. */ - if (pp_hwcaps & SNOR_HWCAPS_PP) + if (pp_hwcaps & SNOR_HWCAPS_PP) { spi_nor_set_pp_settings(¶ms_pp[SNOR_CMD_PP], SPINOR_OP_PP_4B, SNOR_PROTO_1_1_1); + /* + * Since xSPI Page Program opcode is backward compatible with + * Legacy SPI, use Legacy SPI opcode there as well. 
+ */ + spi_nor_set_pp_settings(¶ms_pp[SNOR_CMD_PP_8_8_8_DTR], + SPINOR_OP_PP_4B, SNOR_PROTO_8_8_8_DTR); + } if (pp_hwcaps & SNOR_HWCAPS_PP_1_1_4) spi_nor_set_pp_settings(¶ms_pp[SNOR_CMD_PP_1_1_4], SPINOR_OP_PP_1_1_4_4B, diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 60bac2c0ec45..cd549042c53d 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -182,6 +182,7 @@ enum spi_nor_protocol { SNOR_PROTO_1_2_2_DTR = SNOR_PROTO_DTR(1, 2, 2), SNOR_PROTO_1_4_4_DTR = SNOR_PROTO_DTR(1, 4, 4), SNOR_PROTO_1_8_8_DTR = SNOR_PROTO_DTR(1, 8, 8), + SNOR_PROTO_8_8_8_DTR = SNOR_PROTO_DTR(8, 8, 8), }; static inline bool spi_nor_protocol_is_dtr(enum spi_nor_protocol proto) @@ -228,7 +229,7 @@ struct spi_nor_hwcaps { * then Quad SPI protocols before Dual SPI protocols, Fast Read and lastly * (Slow) Read. */ -#define SNOR_HWCAPS_READ_MASK GENMASK(14, 0) +#define SNOR_HWCAPS_READ_MASK GENMASK(15, 0) #define SNOR_HWCAPS_READ BIT(0) #define SNOR_HWCAPS_READ_FAST BIT(1) #define SNOR_HWCAPS_READ_1_1_1_DTR BIT(2) @@ -245,11 +246,12 @@ struct spi_nor_hwcaps { #define SNOR_HWCAPS_READ_4_4_4 BIT(9) #define SNOR_HWCAPS_READ_1_4_4_DTR BIT(10) -#define SNOR_HWCAPS_READ_OCTAL GENMASK(14, 11) +#define SNOR_HWCAPS_READ_OCTAL GENMASK(15, 11) #define SNOR_HWCAPS_READ_1_1_8 BIT(11) #define SNOR_HWCAPS_READ_1_8_8 BIT(12) #define SNOR_HWCAPS_READ_8_8_8 BIT(13) #define SNOR_HWCAPS_READ_1_8_8_DTR BIT(14) +#define SNOR_HWCAPS_READ_8_8_8_DTR BIT(15) /* * Page Program capabilities. @@ -260,18 +262,19 @@ struct spi_nor_hwcaps { * JEDEC/SFDP standard to define them. Also at this moment no SPI flash memory * implements such commands. */ -#define SNOR_HWCAPS_PP_MASK GENMASK(22, 16) -#define SNOR_HWCAPS_PP BIT(16) +#define SNOR_HWCAPS_PP_MASK GENMASK(23, 16) +#define SNOR_HWCAPS_PP BIT(16) -#define SNOR_HWCAPS_PP_QUAD GENMASK(19, 17) -#define SNOR_HWCAPS_PP_1_1_4 BIT(17) -#define SNOR_HWCAPS_PP_1_4_4 BIT(18) -#define SNOR_HWCAPS_PP_4_4_4 BIT(19) +#define SNOR_HWCAPS_PP_QUAD GENMASK(19, 17) +#define SNOR_HWCAPS_PP_1_1_4 BIT(17) +#define SNOR_HWCAPS_PP_1_4_4 BIT(18) +#define SNOR_HWCAPS_PP_4_4_4 BIT(19) -#define SNOR_HWCAPS_PP_OCTAL GENMASK(22, 20) -#define SNOR_HWCAPS_PP_1_1_8 BIT(20) -#define SNOR_HWCAPS_PP_1_8_8 BIT(21) -#define SNOR_HWCAPS_PP_8_8_8 BIT(22) +#define SNOR_HWCAPS_PP_OCTAL GENMASK(23, 20) +#define SNOR_HWCAPS_PP_1_1_8 BIT(20) +#define SNOR_HWCAPS_PP_1_8_8 BIT(21) +#define SNOR_HWCAPS_PP_8_8_8 BIT(22) +#define SNOR_HWCAPS_PP_8_8_8_DTR BIT(23) #define SNOR_HWCAPS_X_X_X (SNOR_HWCAPS_READ_2_2_2 | \ SNOR_HWCAPS_READ_4_4_4 | \ @@ -279,10 +282,14 @@ struct spi_nor_hwcaps { SNOR_HWCAPS_PP_4_4_4 | \ SNOR_HWCAPS_PP_8_8_8) +#define SNOR_HWCAPS_X_X_X_DTR (SNOR_HWCAPS_READ_8_8_8_DTR | \ + SNOR_HWCAPS_PP_8_8_8_DTR) + #define SNOR_HWCAPS_DTR (SNOR_HWCAPS_READ_1_1_1_DTR | \ SNOR_HWCAPS_READ_1_2_2_DTR | \ SNOR_HWCAPS_READ_1_4_4_DTR | \ - SNOR_HWCAPS_READ_1_8_8_DTR) + SNOR_HWCAPS_READ_1_8_8_DTR | \ + SNOR_HWCAPS_READ_8_8_8_DTR) #define SNOR_HWCAPS_ALL (SNOR_HWCAPS_READ_MASK | \ SNOR_HWCAPS_PP_MASK) @@ -318,6 +325,22 @@ struct spi_nor_controller_ops { int (*erase)(struct spi_nor *nor, loff_t offs); }; +/** + * enum spi_nor_cmd_ext - describes the command opcode extension in DTR mode + * @SPI_NOR_EXT_NONE: no extension. This is the default, and is used in Legacy + * SPI mode + * @SPI_NOR_EXT_REPEAT: the extension is same as the opcode + * @SPI_NOR_EXT_INVERT: the extension is the bitwise inverse of the opcode + * @SPI_NOR_EXT_HEX: the extension is any hex value. 
The command and opcode + * combine to form a 16-bit opcode. + */ +enum spi_nor_cmd_ext { + SPI_NOR_EXT_NONE = 0, + SPI_NOR_EXT_REPEAT, + SPI_NOR_EXT_INVERT, + SPI_NOR_EXT_HEX, +}; + /* * Forward declarations that are used internally by the core and manufacturer * drivers. @@ -345,6 +368,7 @@ struct spi_nor_flash_parameter; * @program_opcode: the program opcode * @sst_write_second: used by the SST write operation * @flags: flag options for the current SPI NOR (SNOR_F_*) + * @cmd_ext_type: the command opcode extension type for DTR mode. * @read_proto: the SPI protocol for read operations * @write_proto: the SPI protocol for write operations * @reg_proto: the SPI protocol for read_reg/write_reg/erase operations @@ -376,6 +400,7 @@ struct spi_nor { enum spi_nor_protocol reg_proto; bool sst_write_second; u32 flags; + enum spi_nor_cmd_ext cmd_ext_type; const struct spi_nor_controller_ops *controller_ops; -- cgit From d73ee7534cc537b98b61bfd83dbce5bb12aecb80 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 5 Oct 2020 21:01:35 +0530 Subject: mtd: spi-nor: core: perform a Soft Reset on shutdown Perform a Soft Reset on shutdown on flashes that support it so that the flash can be reset to its initial state and any configurations made by spi-nor (given that they're only done in volatile registers) will be reset. This will hand back the flash in pristine state for any further operations on it. Signed-off-by: Pratyush Yadav Signed-off-by: Vignesh Raghavendra Reviewed-by: Tudor Ambarus Link: https://lore.kernel.org/r/20201005153138.6437-13-p.yadav@ti.com --- drivers/mtd/spi-nor/core.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/spi-nor.h | 2 ++ 2 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index 7b9ec02734f2..633a1af8a0fe 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -40,6 +40,9 @@ #define SPI_NOR_MAX_ADDR_WIDTH 4 +#define SPI_NOR_SRST_SLEEP_MIN 200 +#define SPI_NOR_SRST_SLEEP_MAX 400 + /** * spi_nor_get_cmd_ext() - Get the command opcode extension based on the * extension type. @@ -3174,6 +3177,45 @@ static int spi_nor_init(struct spi_nor *nor) return 0; } +static void spi_nor_soft_reset(struct spi_nor *nor) +{ + struct spi_mem_op op; + int ret; + + op = (struct spi_mem_op)SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_SRSTEN, 0), + SPI_MEM_OP_NO_DUMMY, + SPI_MEM_OP_NO_ADDR, + SPI_MEM_OP_NO_DATA); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); + + ret = spi_mem_exec_op(nor->spimem, &op); + if (ret) { + dev_warn(nor->dev, "Software reset failed: %d\n", ret); + return; + } + + op = (struct spi_mem_op)SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_SRST, 0), + SPI_MEM_OP_NO_DUMMY, + SPI_MEM_OP_NO_ADDR, + SPI_MEM_OP_NO_DATA); + + spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); + + ret = spi_mem_exec_op(nor->spimem, &op); + if (ret) { + dev_warn(nor->dev, "Software reset failed: %d\n", ret); + return; + } + + /* + * Software Reset is not instant, and the delay varies from flash to + * flash. Looking at a few flashes, most range somewhere below 100 + * microseconds. So, sleep for a range of 200-400 us. 
+ */ + usleep_range(SPI_NOR_SRST_SLEEP_MIN, SPI_NOR_SRST_SLEEP_MAX); +} + /* mtd resume handler */ static void spi_nor_resume(struct mtd_info *mtd) { @@ -3193,6 +3235,9 @@ void spi_nor_restore(struct spi_nor *nor) if (nor->addr_width == 4 && !(nor->flags & SNOR_F_4B_OPCODES) && nor->flags & SNOR_F_BROKEN_RESET) nor->params->set_4byte_addr_mode(nor, false); + + if (nor->flags & SNOR_F_SOFT_RESET) + spi_nor_soft_reset(nor); } EXPORT_SYMBOL_GPL(spi_nor_restore); diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index cd549042c53d..299685d15dc2 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -51,6 +51,8 @@ #define SPINOR_OP_CLFSR 0x50 /* Clear flag status register */ #define SPINOR_OP_RDEAR 0xc8 /* Read Extended Address Register */ #define SPINOR_OP_WREAR 0xc5 /* Write Extended Address Register */ +#define SPINOR_OP_SRSTEN 0x66 /* Software Reset Enable */ +#define SPINOR_OP_SRST 0x99 /* Software Reset */ /* 4-byte address opcodes - used on Spansion and some Macronix flashes. */ #define SPINOR_OP_READ_4B 0x13 /* Read data bytes (low frequency) */ -- cgit From 43676605f890b218e551f396a55dbaea7799acb4 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 3 Nov 2020 10:30:10 +0100 Subject: drm/ttm: Add vmap/vunmap to TTM and TTM GEM helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new functions ttm_bo_{vmap,vunmap}() map and unmap a TTM BO in kernel address space. The mapping's address is returned as struct dma_buf_map. Each function is a simplified version of TTM's existing kmap code. Both functions respect the memory's location and/or writecombine flags. On top of TTM's functions, GEM TTM helpers got drm_gem_ttm_{vmap,vunmap}(), two helpers that convert a GEM object into the TTM BO and forward the call to TTM's vmap/vunmap. These helpers can be dropped into the respective GEM object callbacks. v5: * use size_t for storing mapping size (Christian) * ignore premapped memory areas correctly in ttm_bo_vunmap() * rebase onto latest TTM interfaces (Christian) * remove BUG() from ttm_bo_vmap() (Christian) v4: * drop ttm_kmap_obj_to_dma_buf() in favor of vmap helpers (Daniel, Christian) Signed-off-by: Thomas Zimmermann Reviewed-by: Christian König Acked-by: Daniel Vetter Tested-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20201103093015.1063-6-tzimmermann@suse.de --- drivers/gpu/drm/drm_gem_ttm_helper.c | 37 ++++++++++++++++++ drivers/gpu/drm/ttm/ttm_bo_util.c | 72 ++++++++++++++++++++++++++++++++++++ include/drm/drm_gem_ttm_helper.h | 6 +++ include/drm/ttm/ttm_bo_api.h | 28 ++++++++++++++ include/linux/dma-buf-map.h | 20 ++++++++++ 5 files changed, 163 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_gem_ttm_helper.c b/drivers/gpu/drm/drm_gem_ttm_helper.c index 0e4fb9ba43ad..de28720757af 100644 --- a/drivers/gpu/drm/drm_gem_ttm_helper.c +++ b/drivers/gpu/drm/drm_gem_ttm_helper.c @@ -49,6 +49,43 @@ void drm_gem_ttm_print_info(struct drm_printer *p, unsigned int indent, } EXPORT_SYMBOL(drm_gem_ttm_print_info); +/** + * drm_gem_ttm_vmap() - vmap &ttm_buffer_object + * @gem: GEM object. + * @map: [out] returns the dma-buf mapping. + * + * Maps a GEM object with ttm_bo_vmap(). This function can be used as + * &drm_gem_object_funcs.vmap callback. + * + * Returns: + * 0 on success, or a negative errno code otherwise. 
+ */ +int drm_gem_ttm_vmap(struct drm_gem_object *gem, + struct dma_buf_map *map) +{ + struct ttm_buffer_object *bo = drm_gem_ttm_of_gem(gem); + + return ttm_bo_vmap(bo, map); +} +EXPORT_SYMBOL(drm_gem_ttm_vmap); + +/** + * drm_gem_ttm_vunmap() - vunmap &ttm_buffer_object + * @gem: GEM object. + * @map: dma-buf mapping. + * + * Unmaps a GEM object with ttm_bo_vunmap(). This function can be used as + * &drm_gem_object_funcs.vunmap callback. + */ +void drm_gem_ttm_vunmap(struct drm_gem_object *gem, + struct dma_buf_map *map) +{ + struct ttm_buffer_object *bo = drm_gem_ttm_of_gem(gem); + + ttm_bo_vunmap(bo, map); +} +EXPORT_SYMBOL(drm_gem_ttm_vunmap); + /** * drm_gem_ttm_mmap() - mmap &ttm_buffer_object * @gem: GEM object. diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index ecb54415d1ca..7ccb2295cac1 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -32,6 +32,7 @@ #include #include #include +#include <linux/dma-buf-map.h> #include #include #include @@ -471,6 +472,77 @@ void ttm_bo_kunmap(struct ttm_bo_kmap_obj *map) } EXPORT_SYMBOL(ttm_bo_kunmap); +int ttm_bo_vmap(struct ttm_buffer_object *bo, struct dma_buf_map *map) +{ + struct ttm_resource *mem = &bo->mem; + int ret; + + ret = ttm_mem_io_reserve(bo->bdev, mem); + if (ret) + return ret; + + if (mem->bus.is_iomem) { + void __iomem *vaddr_iomem; + size_t size = bo->num_pages << PAGE_SHIFT; + + if (mem->bus.addr) + vaddr_iomem = (void __iomem *)mem->bus.addr; + else if (mem->bus.caching == ttm_write_combined) + vaddr_iomem = ioremap_wc(mem->bus.offset, size); + else + vaddr_iomem = ioremap(mem->bus.offset, size); + + if (!vaddr_iomem) + return -ENOMEM; + + dma_buf_map_set_vaddr_iomem(map, vaddr_iomem); + + } else { + struct ttm_operation_ctx ctx = { + .interruptible = false, + .no_wait_gpu = false + }; + struct ttm_tt *ttm = bo->ttm; + pgprot_t prot; + void *vaddr; + + ret = ttm_tt_populate(bo->bdev, ttm, &ctx); + if (ret) + return ret; + + /* + * We need to use vmap to get the desired page protection + * or to make the buffer object look contiguous. 
+ */ + prot = ttm_io_prot(bo, mem, PAGE_KERNEL); + vaddr = vmap(ttm->pages, bo->num_pages, 0, prot); + if (!vaddr) + return -ENOMEM; + + dma_buf_map_set_vaddr(map, vaddr); + } + + return 0; +} +EXPORT_SYMBOL(ttm_bo_vmap); + +void ttm_bo_vunmap(struct ttm_buffer_object *bo, struct dma_buf_map *map) +{ + struct ttm_resource *mem = &bo->mem; + + if (dma_buf_map_is_null(map)) + return; + + if (!map->is_iomem) + vunmap(map->vaddr); + else if (!mem->bus.addr) + iounmap(map->vaddr_iomem); + dma_buf_map_clear(map); + + ttm_mem_io_free(bo->bdev, &bo->mem); +} +EXPORT_SYMBOL(ttm_bo_vunmap); + static int ttm_bo_wait_free_node(struct ttm_buffer_object *bo, bool dst_use_tt) { diff --git a/include/drm/drm_gem_ttm_helper.h b/include/drm/drm_gem_ttm_helper.h index 118cef76f84f..7c6d874910b8 100644 --- a/include/drm/drm_gem_ttm_helper.h +++ b/include/drm/drm_gem_ttm_helper.h @@ -10,11 +10,17 @@ #include #include +struct dma_buf_map; + #define drm_gem_ttm_of_gem(gem_obj) \ container_of(gem_obj, struct ttm_buffer_object, base) void drm_gem_ttm_print_info(struct drm_printer *p, unsigned int indent, const struct drm_gem_object *gem); +int drm_gem_ttm_vmap(struct drm_gem_object *gem, + struct dma_buf_map *map); +void drm_gem_ttm_vunmap(struct drm_gem_object *gem, + struct dma_buf_map *map); int drm_gem_ttm_mmap(struct drm_gem_object *gem, struct vm_area_struct *vma); diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index 5ddad88ae6ed..2564e66e67d7 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -48,6 +48,8 @@ struct ttm_bo_global; struct ttm_bo_device; +struct dma_buf_map; + struct drm_mm_node; struct ttm_placement; @@ -495,6 +497,32 @@ int ttm_bo_kmap(struct ttm_buffer_object *bo, unsigned long start_page, */ void ttm_bo_kunmap(struct ttm_bo_kmap_obj *map); +/** + * ttm_bo_vmap + * + * @bo: The buffer object. + * @map: pointer to a struct dma_buf_map representing the map. + * + * Sets up a kernel virtual mapping, using ioremap or vmap to the + * data in the buffer object. The parameter @map returns the virtual + * address as struct dma_buf_map. Unmap the buffer with ttm_bo_vunmap(). + * + * Returns + * -ENOMEM: Out of memory. + * -EINVAL: Invalid range. + */ +int ttm_bo_vmap(struct ttm_buffer_object *bo, struct dma_buf_map *map); + +/** + * ttm_bo_vunmap + * + * @bo: The buffer object. + * @map: Object describing the map to unmap. + * + * Unmaps a kernel map set up by ttm_bo_vmap(). + */ +void ttm_bo_vunmap(struct ttm_buffer_object *bo, struct dma_buf_map *map); + /** * ttm_bo_mmap_obj - mmap memory backed by a ttm buffer object. * diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h index fd1aba545fdf..2e8bbecb5091 100644 --- a/include/linux/dma-buf-map.h +++ b/include/linux/dma-buf-map.h @@ -45,6 +45,12 @@ * * dma_buf_map_set_vaddr(&map, 0xdeadbeaf); * + * To set an address in I/O memory, use dma_buf_map_set_vaddr_iomem(). + * + * .. code-block:: c + * + * dma_buf_map_set_vaddr_iomem(&map, 0xdeadbeaf); + * * Test if a mapping is valid with either dma_buf_map_is_set() or * dma_buf_map_is_null(). * @@ -118,6 +124,20 @@ static inline void dma_buf_map_set_vaddr(struct dma_buf_map *map, void *vaddr) map->is_iomem = false; } +/** + * dma_buf_map_set_vaddr_iomem - Sets a dma-buf mapping structure to an address in I/O memory + * @map: The dma-buf mapping structure + * @vaddr_iomem: An I/O-memory address + * + * Sets the address and the I/O-memory flag. 
+ */ +static inline void dma_buf_map_set_vaddr_iomem(struct dma_buf_map *map, + void __iomem *vaddr_iomem) +{ + map->vaddr_iomem = vaddr_iomem; + map->is_iomem = true; +} + /** * dma_buf_map_is_equal - Compares two dma-buf mapping structures for equality * @lhs: The dma-buf mapping structure -- cgit From b4e7090c242ec64bfa77a69ea7c3cceba21c7a65 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 3 Nov 2020 10:30:14 +0100 Subject: dma-buf-map: Add memcpy and pointer-increment interfaces To do framebuffer updates, one needs memcpy from system memory and a pointer-increment function. Add both interfaces with documentation. v5: * include <linux/string.h> to build on sparc64 (Sam) Signed-off-by: Thomas Zimmermann Reviewed-by: Sam Ravnborg Tested-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20201103093015.1063-10-tzimmermann@suse.de --- include/linux/dma-buf-map.h | 73 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h index 2e8bbecb5091..583a3a1f9447 100644 --- a/include/linux/dma-buf-map.h +++ b/include/linux/dma-buf-map.h @@ -7,6 +7,7 @@ #define __DMA_BUF_MAP_H__ #include +#include <linux/string.h> /** * DOC: overview @@ -32,6 +33,14 @@ * accessing the buffer. Use the returned instance and the helper functions * to access the buffer's memory in the correct way. * + * The type :c:type:`struct dma_buf_map ` and its helpers are + * actually independent from the dma-buf infrastructure. When sharing buffers + * among devices, drivers have to know the location of the memory to access + * the buffers in a safe way. :c:type:`struct dma_buf_map ` + * solves this problem for dma-buf and its users. If other drivers or + * sub-systems require similar functionality, the type could be generalized + * and moved to a more prominent header file. + * * Open-coding access to :c:type:`struct dma_buf_map ` is * considered bad style. Rather than accessing its fields directly, use one * of the provided helper functions, or implement your own. For example, @@ -51,6 +60,14 @@ * * dma_buf_map_set_vaddr_iomem(&map, 0xdeadbeaf); * + * Instances of struct dma_buf_map do not have to be cleaned up, but + * can be cleared to NULL with dma_buf_map_clear(). Cleared mappings + * always refer to system memory. + * + * .. code-block:: c + * + * dma_buf_map_clear(&map); + * * Test if a mapping is valid with either dma_buf_map_is_set() or * dma_buf_map_is_null(). * @@ -73,17 +90,19 @@ * if (dma_buf_map_is_equal(&sys_map, &io_map)) * // always false * - * Instances of struct dma_buf_map do not have to be cleaned up, but - * can be cleared to NULL with dma_buf_map_clear(). Cleared mappings - * always refer to system memory. + * A set up instance of struct dma_buf_map can be used to access or manipulate + * the buffer memory. Depending on the location of the memory, the provided + * helpers will pick the correct operations. Data can be copied into the memory + * with dma_buf_map_memcpy_to(). The address can be manipulated with + * dma_buf_map_incr(). * - * The type :c:type:`struct dma_buf_map ` and its helpers are - * actually independent from the dma-buf infrastructure. When sharing buffers - * among devices, drivers have to know the location of the memory to access - * the buffers in a safe way. :c:type:`struct dma_buf_map ` - * solves this problem for dma-buf and its users. 
If other drivers or - * sub-systems require similar functionality, the type could be generalized - * and moved to a more prominent header file. + * + * const void *src = ...; // source buffer + * size_t len = ...; // length of src + * + * dma_buf_map_memcpy_to(&map, src, len); + * dma_buf_map_incr(&map, len); // go to first byte after the memcpy */ /** @@ -210,4 +229,38 @@ static inline void dma_buf_map_clear(struct dma_buf_map *map) } } +/** + * dma_buf_map_memcpy_to - Memcpy into dma-buf mapping + * @dst: The dma-buf mapping structure + * @src: The source buffer + * @len: The number of bytes in src + * + * Copies data into a dma-buf mapping. The source buffer is in system + * memory. Depending on the buffer's location, the helper picks the correct + * method of accessing the memory. + */ +static inline void dma_buf_map_memcpy_to(struct dma_buf_map *dst, const void *src, size_t len) +{ + if (dst->is_iomem) + memcpy_toio(dst->vaddr_iomem, src, len); + else + memcpy(dst->vaddr, src, len); +} + +/** + * dma_buf_map_incr - Increments the address stored in a dma-buf mapping + * @map: The dma-buf mapping structure + * @incr: The number of bytes to increment + * + * Increments the address stored in a dma-buf mapping. Depending on the + * buffer's location, the correct value will be updated. + */ +static inline void dma_buf_map_incr(struct dma_buf_map *map, size_t incr) +{ + if (map->is_iomem) + map->vaddr_iomem += incr; + else + map->vaddr += incr; +} + #endif /* __DMA_BUF_MAP_H__ */ -- cgit From 22447a99c97e353bde8f90c2353873f27681d57c Mon Sep 17 00:00:00 2001 From: Pawel Czarnecki Date: Tue, 13 Oct 2020 16:45:52 +0200 Subject: drivers/soc/litex: add LiteX SoC Controller driver This commit adds a driver for the FPGA-based LiteX SoC Controller from the LiteX SoC builder. 
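As an illustration (not part of this patch), the sketch below shows how a peripheral driver might sit on top of the accessors introduced here. The "demo" device and its register offsets are made up; litex_read32() and litex_write32() are the real helpers added in include/linux/litex.h further down.

#include <linux/bits.h>
#include <linux/io.h>
#include <linux/litex.h>

#define DEMO_CTRL_OFF	0x00	/* hypothetical 32-bit control CSR */
#define DEMO_STAT_OFF	0x04	/* hypothetical 32-bit status CSR */

/* One logical 32-bit CSR write; the helper splits it into four
 * single-byte subregister writes, as described further down. */
static void demo_start(void __iomem *base)
{
	litex_write32(base + DEMO_CTRL_OFF, BIT(0));
}

/* One logical 32-bit CSR read; the helper joins the subregisters
 * back into a single value. */
static bool demo_done(void __iomem *base)
{
	return litex_read32(base + DEMO_STAT_OFF) & BIT(0);
}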
Co-developed-by: Mateusz Holenko Signed-off-by: Mateusz Holenko Signed-off-by: Pawel Czarnecki Signed-off-by: Stafford Horne --- MAINTAINERS | 2 + drivers/soc/Kconfig | 1 + drivers/soc/Makefile | 1 + drivers/soc/litex/Kconfig | 19 ++++ drivers/soc/litex/Makefile | 3 + drivers/soc/litex/litex_soc_ctrl.c | 176 +++++++++++++++++++++++++++++++++++++ include/linux/litex.h | 102 +++++++++++++++++++++ 7 files changed, 304 insertions(+) create mode 100644 drivers/soc/litex/Kconfig create mode 100644 drivers/soc/litex/Makefile create mode 100644 drivers/soc/litex/litex_soc_ctrl.c create mode 100644 include/linux/litex.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 049af639fdfc..3fde022413f9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10177,6 +10177,8 @@ M: Karol Gugala M: Mateusz Holenko S: Maintained F: Documentation/devicetree/bindings/*/litex,*.yaml +F: drivers/soc/litex/litex_soc_ctrl.c +F: include/linux/litex.h LIVE PATCHING M: Josh Poimboeuf diff --git a/drivers/soc/Kconfig b/drivers/soc/Kconfig index 425ab6f7e375..d097d070f579 100644 --- a/drivers/soc/Kconfig +++ b/drivers/soc/Kconfig @@ -9,6 +9,7 @@ source "drivers/soc/bcm/Kconfig" source "drivers/soc/fsl/Kconfig" source "drivers/soc/imx/Kconfig" source "drivers/soc/ixp4xx/Kconfig" +source "drivers/soc/litex/Kconfig" source "drivers/soc/mediatek/Kconfig" source "drivers/soc/qcom/Kconfig" source "drivers/soc/renesas/Kconfig" diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile index 36452bed86ef..0b16108823ef 100644 --- a/drivers/soc/Makefile +++ b/drivers/soc/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_ARCH_GEMINI) += gemini/ obj-y += imx/ obj-$(CONFIG_ARCH_IXP4XX) += ixp4xx/ obj-$(CONFIG_SOC_XWAY) += lantiq/ +obj-$(CONFIG_LITEX_SOC_CONTROLLER) += litex/ obj-y += mediatek/ obj-y += amlogic/ obj-y += qcom/ diff --git a/drivers/soc/litex/Kconfig b/drivers/soc/litex/Kconfig new file mode 100644 index 000000000000..7c6b009b6f6c --- /dev/null +++ b/drivers/soc/litex/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License_Identifier: GPL-2.0 + +menu "Enable LiteX SoC Builder specific drivers" + +config LITEX + bool + +config LITEX_SOC_CONTROLLER + tristate "Enable LiteX SoC Controller driver" + depends on OF || COMPILE_TEST + select LITEX + help + This option enables the SoC Controller Driver which verifies + LiteX CSR access and provides common litex_get_reg/litex_set_reg + accessors. + All drivers that use functions from litex.h must depend on + LITEX. + +endmenu diff --git a/drivers/soc/litex/Makefile b/drivers/soc/litex/Makefile new file mode 100644 index 000000000000..98ff7325b1c0 --- /dev/null +++ b/drivers/soc/litex/Makefile @@ -0,0 +1,3 @@ +# SPDX-License_Identifier: GPL-2.0 + +obj-$(CONFIG_LITEX_SOC_CONTROLLER) += litex_soc_ctrl.o diff --git a/drivers/soc/litex/litex_soc_ctrl.c b/drivers/soc/litex/litex_soc_ctrl.c new file mode 100644 index 000000000000..1217cafdfd4d --- /dev/null +++ b/drivers/soc/litex/litex_soc_ctrl.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * LiteX SoC Controller Driver + * + * Copyright (C) 2020 Antmicro + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * LiteX SoC Generator, depending on the configuration, can split a single + * logical CSR (Control&Status Register) into a series of consecutive physical + * registers. 
+ * + * For example, in the configuration with 8-bit CSR Bus, 32-bit aligned (the + * default one for 32-bit CPUs) a 32-bit logical CSR will be generated as four + * 32-bit physical registers, each one containing one byte of meaningful data. + * + * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus + * + * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic + * of writing to/reading from the LiteX CSR in a single place that can be + * then reused by all LiteX drivers. + */ + +/** + * litex_set_reg() - Writes the value to the LiteX CSR (Control&Status Register) + * @reg: Address of the CSR + * @reg_size: The width of the CSR expressed in the number of bytes + * @val: Value to be written to the CSR + * + * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned), + * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers, + * each one containing one byte of meaningful data. + * + * This function splits a single possibly multi-byte write into a series of + * single-byte writes with a proper offset. + */ +void litex_set_reg(void __iomem *reg, unsigned long reg_size, + unsigned long val) +{ + unsigned long shifted_data, shift, i; + + for (i = 0; i < reg_size; ++i) { + shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT); + shifted_data = val >> shift; + + WRITE_LITEX_SUBREGISTER(shifted_data, reg, i); + } +} +EXPORT_SYMBOL_GPL(litex_set_reg); + +/** + * litex_get_reg() - Reads the value of the LiteX CSR (Control&Status Register) + * @reg: Address of the CSR + * @reg_size: The width of the CSR expressed in the number of bytes + * + * Return: Value read from the CSR + * + * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned), + * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers, + * each one containing one byte of meaningful data. + * + * This function generates a series of single-byte reads with a proper offset + * and joins their results into a single multi-byte value. + */ +unsigned long litex_get_reg(void __iomem *reg, unsigned long reg_size) +{ + unsigned long shifted_data, shift, i; + unsigned long result = 0; + + for (i = 0; i < reg_size; ++i) { + shifted_data = READ_LITEX_SUBREGISTER(reg, i); + + shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT); + result |= (shifted_data << shift); + } + + return result; +} +EXPORT_SYMBOL_GPL(litex_get_reg); + +#define SCRATCH_REG_OFF 0x04 +#define SCRATCH_REG_VALUE 0x12345678 +#define SCRATCH_TEST_VALUE 0xdeadbeef + +/* + * Check LiteX CSR read/write access + * + * This function reads and writes a scratch register in order to verify if CSR + * access works. + * + * In case any problems are detected, the driver should panic. + * + * Access to the LiteX CSR is, by design, done in CPU native endianness. + * The driver should not dynamically configure access functions when + * the endianness mismatch is detected. Such situation indicates problems in + * the soft SoC design and should be solved at the LiteX generator level, + * not in the software. + */ +static int litex_check_csr_access(void __iomem *reg_addr) +{ + unsigned long reg; + + reg = litex_read32(reg_addr + SCRATCH_REG_OFF); + + if (reg != SCRATCH_REG_VALUE) { + panic("Scratch register read error - the system is probably broken! 
Expected: 0x%x but got: 0x%lx", + SCRATCH_REG_VALUE, reg); + return -EINVAL; + } + + litex_write32(reg_addr + SCRATCH_REG_OFF, SCRATCH_TEST_VALUE); + reg = litex_read32(reg_addr + SCRATCH_REG_OFF); + + if (reg != SCRATCH_TEST_VALUE) { + panic("Scratch register write error - the system is probably broken! Expected: 0x%x but got: 0x%lx", + SCRATCH_TEST_VALUE, reg); + return -EINVAL; + } + + /* restore original value of the SCRATCH register */ + litex_write32(reg_addr + SCRATCH_REG_OFF, SCRATCH_REG_VALUE); + + pr_info("LiteX SoC Controller driver initialized"); + + return 0; +} + +struct litex_soc_ctrl_device { + void __iomem *base; +}; + +static const struct of_device_id litex_soc_ctrl_of_match[] = { + {.compatible = "litex,soc-controller"}, + {}, +}; + +MODULE_DEVICE_TABLE(of, litex_soc_ctrl_of_match); + +static int litex_soc_ctrl_probe(struct platform_device *pdev) +{ + struct litex_soc_ctrl_device *soc_ctrl_dev; + + soc_ctrl_dev = devm_kzalloc(&pdev->dev, sizeof(*soc_ctrl_dev), GFP_KERNEL); + if (!soc_ctrl_dev) + return -ENOMEM; + + soc_ctrl_dev->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(soc_ctrl_dev->base)) + return PTR_ERR(soc_ctrl_dev->base); + + return litex_check_csr_access(soc_ctrl_dev->base); +} + +static struct platform_driver litex_soc_ctrl_driver = { + .driver = { + .name = "litex-soc-controller", + .of_match_table = of_match_ptr(litex_soc_ctrl_of_match) + }, + .probe = litex_soc_ctrl_probe, +}; + +module_platform_driver(litex_soc_ctrl_driver); +MODULE_DESCRIPTION("LiteX SoC Controller driver"); +MODULE_AUTHOR("Antmicro "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/litex.h b/include/linux/litex.h new file mode 100644 index 000000000000..40f5be503593 --- /dev/null +++ b/include/linux/litex.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common LiteX header providing + * helper functions for accessing CSRs. + * + * Implementation of the functions is provided by + * the LiteX SoC Controller driver. + * + * Copyright (C) 2019-2020 Antmicro + */ + +#ifndef _LINUX_LITEX_H +#define _LINUX_LITEX_H + +#include +#include +#include + +/* + * The parameters below are true for LiteX SoCs configured for 8-bit CSR Bus, + * 32-bit aligned. + * + * Supporting other configurations will require extending the logic in this + * header and in the LiteX SoC controller driver. 
+ */ +#define LITEX_REG_SIZE 0x4 +#define LITEX_SUBREG_SIZE 0x1 +#define LITEX_SUBREG_SIZE_BIT (LITEX_SUBREG_SIZE * 8) + +#define WRITE_LITEX_SUBREGISTER(val, base_offset, subreg_id) \ + writel((u32 __force)cpu_to_le32(val), base_offset + (LITEX_REG_SIZE * subreg_id)) + +#define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \ + le32_to_cpu((__le32 __force)readl(base_offset + (LITEX_REG_SIZE * subreg_id))) + +void litex_set_reg(void __iomem *reg, unsigned long reg_sz, unsigned long val); + +unsigned long litex_get_reg(void __iomem *reg, unsigned long reg_sz); + +static inline void litex_write8(void __iomem *reg, u8 val) +{ + WRITE_LITEX_SUBREGISTER(val, reg, 0); +} + +static inline void litex_write16(void __iomem *reg, u16 val) +{ + WRITE_LITEX_SUBREGISTER(val >> 8, reg, 0); + WRITE_LITEX_SUBREGISTER(val, reg, 1); +} + +static inline void litex_write32(void __iomem *reg, u32 val) +{ + WRITE_LITEX_SUBREGISTER(val >> 24, reg, 0); + WRITE_LITEX_SUBREGISTER(val >> 16, reg, 1); + WRITE_LITEX_SUBREGISTER(val >> 8, reg, 2); + WRITE_LITEX_SUBREGISTER(val, reg, 3); +} + +static inline void litex_write64(void __iomem *reg, u64 val) +{ + WRITE_LITEX_SUBREGISTER(val >> 56, reg, 0); + WRITE_LITEX_SUBREGISTER(val >> 48, reg, 1); + WRITE_LITEX_SUBREGISTER(val >> 40, reg, 2); + WRITE_LITEX_SUBREGISTER(val >> 32, reg, 3); + WRITE_LITEX_SUBREGISTER(val >> 24, reg, 4); + WRITE_LITEX_SUBREGISTER(val >> 16, reg, 5); + WRITE_LITEX_SUBREGISTER(val >> 8, reg, 6); + WRITE_LITEX_SUBREGISTER(val, reg, 7); +} + +static inline u8 litex_read8(void __iomem *reg) +{ + return READ_LITEX_SUBREGISTER(reg, 0); +} + +static inline u16 litex_read16(void __iomem *reg) +{ + return (READ_LITEX_SUBREGISTER(reg, 0) << 8) + | (READ_LITEX_SUBREGISTER(reg, 1)); +} + +static inline u32 litex_read32(void __iomem *reg) +{ + return (READ_LITEX_SUBREGISTER(reg, 0) << 24) + | (READ_LITEX_SUBREGISTER(reg, 1) << 16) + | (READ_LITEX_SUBREGISTER(reg, 2) << 8) + | (READ_LITEX_SUBREGISTER(reg, 3)); +} + +static inline u64 litex_read64(void __iomem *reg) +{ + return ((u64)READ_LITEX_SUBREGISTER(reg, 0) << 56) + | ((u64)READ_LITEX_SUBREGISTER(reg, 1) << 48) + | ((u64)READ_LITEX_SUBREGISTER(reg, 2) << 40) + | ((u64)READ_LITEX_SUBREGISTER(reg, 3) << 32) + | ((u64)READ_LITEX_SUBREGISTER(reg, 4) << 24) + | ((u64)READ_LITEX_SUBREGISTER(reg, 5) << 16) + | ((u64)READ_LITEX_SUBREGISTER(reg, 6) << 8) + | ((u64)READ_LITEX_SUBREGISTER(reg, 7)); +} + +#endif /* _LINUX_LITEX_H */ -- cgit From 267fb27352b6fc9fdbad753127a239f75618ecbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 15:50:32 +0100 Subject: perf: Reduce stack usage of perf_output_begin() __perf_output_begin() has an on-stack struct perf_sample_data in the unlikely case it needs to generate a LOST record. However, every call to perf_output_begin() must already have a perf_sample_data on-stack. 
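In distilled form, the calling convention after this change looks like the sketch below (illustrative only: example_log_event() and its record type are made up, while perf_event_header__init_id(), perf_output_begin(), perf_output_put(), perf_event__output_id_sample() and perf_output_end() are the existing interfaces being re-plumbed). The caller's on-stack sample data is handed down so __perf_output_begin() no longer needs a second copy for the LOST record:

static void example_log_event(struct perf_event *event)
{
	struct perf_output_handle handle;
	struct perf_sample_data sample;		/* already on the caller's stack */
	struct perf_event_header header = {
		.type = PERF_RECORD_COMM,	/* any fixed-size record type */
		.misc = 0,
		.size = sizeof(header),
	};
	int ret;

	perf_event_header__init_id(&header, &sample, event);

	/* New signature: the on-stack sample data rides along. */
	ret = perf_output_begin(&handle, &sample, event, header.size);
	if (ret)
		return;

	perf_output_put(&handle, header);
	perf_event__output_id_sample(event, &handle, &sample);
	perf_output_end(&handle);
}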
Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201030151954.985416146@infradead.org --- arch/powerpc/perf/imc-pmu.c | 2 +- arch/s390/kernel/perf_cpum_sf.c | 2 +- arch/x86/events/intel/ds.c | 4 ++-- include/linux/perf_event.h | 7 +++++-- kernel/events/core.c | 32 +++++++++++++++++--------------- kernel/events/ring_buffer.c | 20 +++++++++++--------- 6 files changed, 37 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 9ed4fcccf8a9..7b25548ec42b 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1336,7 +1336,7 @@ static void dump_trace_imc_data(struct perf_event *event) /* If this is a valid record, create the sample */ struct perf_output_handle handle; - if (perf_output_begin(&handle, event, header.size)) + if (perf_output_begin(&handle, &data, event, header.size)) return; perf_output_sample(&handle, &header, &data, event); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 4f9e4626df55..00255ae3979d 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -672,7 +672,7 @@ static void cpumsf_output_event_pid(struct perf_event *event, rcu_read_lock(); perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + if (perf_output_begin(&handle, data, event, header.size)) goto out; /* Update the process ID (see also kernel/events/core.c) */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 404315df1e16..cd2ae14a0a98 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -642,8 +642,8 @@ int intel_pmu_drain_bts_buffer(void) rcu_read_lock(); perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * - (top - base - skip))) + if (perf_output_begin(&handle, &data, event, + header.size * (top - base - skip))) goto unlock; for (at = base; at < top; at++) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0c19d279b97f..b775ae0a8c87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1400,11 +1400,14 @@ perf_event_addr_filters(struct perf_event *event) extern void perf_event_addr_filters_sync(struct perf_event *event); extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, - unsigned int size); + struct perf_sample_data *data, + struct perf_event *event, + unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a29ab09e72d..fc681c7c1e03 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7186,6 +7186,7 @@ __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, + struct perf_sample_data *, struct perf_event *, unsigned int)) { @@ -7198,7 +7199,7 @@ __perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - err = output_begin(&handle, event, header.size); + err = output_begin(&handle, data, event, header.size); if (err) goto exit; @@ -7264,7 +7265,7 @@ perf_event_read_event(struct perf_event 
*event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size); + ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; @@ -7533,7 +7534,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; @@ -7636,7 +7637,7 @@ static void perf_event_comm_output(struct perf_event *event, return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) @@ -7736,7 +7737,7 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; @@ -7863,7 +7864,7 @@ static void perf_event_cgroup_output(struct perf_event *event, void *data) perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; @@ -7989,7 +7990,7 @@ static void perf_event_mmap_output(struct perf_event *event, } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; @@ -8299,7 +8300,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, int ret; perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; @@ -8333,7 +8334,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_event_header__init_id(&lost_samples_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; @@ -8388,7 +8389,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data) perf_event_header__init_id(&se->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, se->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; @@ -8463,7 +8464,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; @@ -8506,7 +8507,7 @@ static void perf_event_ksymbol_output(struct perf_event *event, void *data) perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; @@ -8596,7 +8597,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data) perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, 
event, + ret = perf_output_begin(&handle, data, event, + bpf_event->event_id.header.size); if (ret) return; @@ -8705,7 +8706,8 @@ static void perf_event_text_poke_output(struct perf_event *event, void *data) perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, + text_poke_event->event_id.header.size); if (ret) return; @@ -8786,7 +8788,7 @@ static void perf_log_itrace_start(struct perf_event *event) rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 192b8abc6330..ef91ae75ca56 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail, static __always_inline int __perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size, bool backward) { @@ -237,18 +238,16 @@ __perf_output_begin(struct perf_output_handle *handle, handle->size = (1UL << page_shift) - offset; if (unlikely(have_lost)) { - struct perf_sample_data sample_data; - lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); - perf_event_header__init_id(&lost_event.header, - &sample_data, event); + /* XXX mostly redundant; @data is already fully initialized */ + perf_event_header__init_id(&lost_event.header, data, event); perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); + perf_event__output_id_sample(event, handle, data); } return 0; @@ -263,22 +262,25 @@ out: } int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) + struct perf_sample_data *data, + struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, false); + return __perf_output_begin(handle, data, event, size, false); } int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, true); + return __perf_output_begin(handle, data, event, size, true); } int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, + return __perf_output_begin(handle, data, event, size, unlikely(is_write_backward(event))); } -- cgit From 76a4efa80900fc40e0fdf243b42aec9fb8c35d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 12:14:21 +0100 Subject: perf/arch: Remove perf_sample_data::regs_user_copy struct perf_sample_data lives on-stack, we should be careful about its size. Furthermore, the pt_regs copy in there is only because x86_64 is a trainwreck, solve it differently. 
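The x86 hunk below replaces that copy with a per-CPU scratch buffer; stripped down, the idea is roughly the following sketch (example_get_regs_user() is a made-up name, and the NMI branch is simplified to a plain copy, whereas the real code below also handles partially-saved registers):

static DEFINE_PER_CPU(struct pt_regs, scratch_user_regs);

static void example_get_regs_user(struct perf_regs *regs_user,
				  struct pt_regs *regs)
{
	if (!in_nmi()) {
		/* Common case: point straight at the task's registers. */
		regs_user->regs = task_pt_regs(current);
		regs_user->abi = perf_reg_abi(current);
		return;
	}

	/*
	 * NMI case: sample into the per-CPU scratch area instead of an
	 * on-stack copy. The slot is written and consumed within a single
	 * NMI on one CPU, so it needs no further serialization.
	 */
	*this_cpu_ptr(&scratch_user_regs) = *task_pt_regs(current);
	regs_user->regs = this_cpu_ptr(&scratch_user_regs);
	regs_user->abi = perf_reg_abi(current);
}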
Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Steven Rostedt Link: https://lkml.kernel.org/r/20201030151955.258178461@infradead.org --- arch/arm/kernel/perf_regs.c | 3 +-- arch/arm64/kernel/perf_regs.c | 3 +-- arch/csky/kernel/perf_regs.c | 3 +-- arch/powerpc/perf/perf_regs.c | 3 +-- arch/riscv/kernel/perf_regs.c | 3 +-- arch/s390/kernel/perf_regs.c | 3 +-- arch/x86/kernel/perf_regs.c | 15 +++++++++++---- include/linux/perf_event.h | 6 ------ include/linux/perf_regs.h | 6 ++---- kernel/events/core.c | 8 +++----- 10 files changed, 22 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c index 05fe92aa7d98..0529f90395c9 100644 --- a/arch/arm/kernel/perf_regs.c +++ b/arch/arm/kernel/perf_regs.c @@ -32,8 +32,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index 94e8718e7229..f6f58e6265df 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -73,8 +73,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/csky/kernel/perf_regs.c b/arch/csky/kernel/perf_regs.c index eb32838b8210..09b7f88a2d6a 100644 --- a/arch/csky/kernel/perf_regs.c +++ b/arch/csky/kernel/perf_regs.c @@ -32,8 +32,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c index 8e53f2fc3fe0..6f681b105eec 100644 --- a/arch/powerpc/perf/perf_regs.c +++ b/arch/powerpc/perf/perf_regs.c @@ -144,8 +144,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = (regs_user->regs) ? 
perf_reg_abi(current) : diff --git a/arch/riscv/kernel/perf_regs.c b/arch/riscv/kernel/perf_regs.c index 04a38fbeb9c7..fd304a248de6 100644 --- a/arch/riscv/kernel/perf_regs.c +++ b/arch/riscv/kernel/perf_regs.c @@ -36,8 +36,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index 4352a504f235..6e9e5d5e927e 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -53,8 +53,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { /* * Use the regs from the first interruption and let diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index bb7e1132290b..f9e5352b3bef 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); @@ -129,12 +128,20 @@ u64 perf_reg_abi(struct task_struct *task) return PERF_SAMPLE_REGS_ABI_64; } +static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs); + void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { + struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs); struct pt_regs *user_regs = task_pt_regs(current); + if (!in_nmi()) { + regs_user->regs = user_regs; + regs_user->abi = perf_reg_abi(current); + return; + } + /* * If we're in an NMI that interrupted task_pt_regs setup, then * we can't sample user regs at all. This check isn't really diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b775ae0a8c87..96450f6fb1de 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1022,13 +1022,7 @@ struct perf_sample_data { struct perf_callchain_entry *callchain; u64 aux_size; - /* - * regs_user may point to task_pt_regs or to regs_user_copy, depending - * on arch details. 
- */ struct perf_regs regs_user; - struct pt_regs regs_user_copy; - struct perf_regs regs_intr; u64 stack_user_size; diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h index 2d12e97d5e7b..f632c5725f16 100644 --- a/include/linux/perf_regs.h +++ b/include/linux/perf_regs.h @@ -20,8 +20,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx); int perf_reg_validate(u64 mask); u64 perf_reg_abi(struct task_struct *task); void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy); + struct pt_regs *regs); #else #define PERF_REG_EXTENDED_MASK 0 @@ -42,8 +41,7 @@ static inline u64 perf_reg_abi(struct task_struct *task) } static inline void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/kernel/events/core.c b/kernel/events/core.c index fc681c7c1e03..d67c9cbb0f6a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6374,14 +6374,13 @@ perf_output_sample_regs(struct perf_output_handle *handle, } static void perf_sample_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { - perf_get_regs_user(regs_user, regs, regs_user_copy); + perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; @@ -7083,8 +7082,7 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) - perf_sample_regs_user(&data->regs_user, regs, - &data->regs_user_copy); + perf_sample_regs_user(&data->regs_user, regs); if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ -- cgit From 21774fd81a51ec1eccd59caf922d387977acd2aa Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 15 Oct 2020 14:57:26 -0400 Subject: kernfs: bring names in comments in line with code Fix two stragglers in the comments of the below rename operation. Fixes: adc5e8b58f48 ("kernfs: drop s_ prefix from kernfs_node members") Acked-by: Tejun Heo Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20201015185726.1386868-1-willemdebruijn.kernel@gmail.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 2 +- include/linux/kernfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 9aec80b9d7c6..ea3c95125bf1 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1599,7 +1599,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, return error; } -/* Relationship between s_mode and the DT_xxx types */ +/* Relationship between mode and the DT_xxx types */ static inline unsigned char dt_type(struct kernfs_node *kn) { return (kn->mode >> 12) & 15; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 89f6a4214a70..9e8ca8743c26 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -116,7 +116,7 @@ struct kernfs_elem_attr { * kernfs node is represented by single kernfs_node. Most fields are * private to kernfs and shouldn't be accessed directly by kernfs users. * - * As long as s_count reference is held, the kernfs_node itself is + * As long as count reference is held, the kernfs_node itself is * accessible. 
Dereferencing elem or any other outer entity requires * active reference. */ -- cgit From 9f38abefd37af8726d59706b9b84530630b6b620 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:33:17 +0200 Subject: uio: fix some kernel-doc markups The definitions for (devm_)uio_register_device should be in the header file, as the macros are there. The ones inside uio.c refer, instead, to __(devm_)uio_register_device. Update them and add new kernel-doc markups for the macros. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/82ab7b68d271aeda7396e369ff8a629491b9d628.1603469755.git.mchehab+huawei@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/uio/uio.c | 4 ++-- include/linux/uio_driver.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 6dca744e39e9..01b349c1b923 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -906,7 +906,7 @@ static void uio_device_release(struct device *dev) } /** - * uio_register_device - register a new userspace IO device + * __uio_register_device - register a new userspace IO device * @owner: module that creates the new device * @parent: parent device * @info: UIO device capabilities @@ -1002,7 +1002,7 @@ static void devm_uio_unregister_device(struct device *dev, void *res) } /** - * devm_uio_register_device - Resource managed uio_register_device() + * __devm_uio_register_device - Resource managed uio_register_device() * @owner: module that creates the new device * @parent: parent device * @info: UIO device capabilities diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 54bf6b118401..47c5962b876b 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -117,6 +117,14 @@ extern int __must_check struct uio_info *info); /* use a define to avoid include chaining to get THIS_MODULE */ + +/** + * uio_register_device - register a new userspace IO device + * @parent: parent device + * @info: UIO device capabilities + * + * returns zero on success or a negative error code. + */ #define uio_register_device(parent, info) \ __uio_register_device(THIS_MODULE, parent, info) @@ -129,6 +137,14 @@ extern int __must_check struct uio_info *info); /* use a define to avoid include chaining to get THIS_MODULE */ + +/** + * devm_uio_register_device - Resource managed uio_register_device() + * @parent: parent device + * @info: UIO device capabilities + * + * returns zero on success or a negative error code. + */ #define devm_uio_register_device(parent, info) \ __devm_uio_register_device(THIS_MODULE, parent, info) -- cgit From 33c0c9bdf7a59051a654cd98b7d2b48ce0080967 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:32:56 +0200 Subject: drivers: base: fix some kernel-doc markups class_create is actually defined in the header. Fix the markup there and add a new one in the right place. While here, also fix some markups for functions that have different names between their prototypes and kernel-doc comments.
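For illustration, the shape of the problem is the same in both patches; a hypothetical sketch (the foo_* names are made up, not from either patch) of the macro-wraps-function pattern that kernel-doc has to track:

/* foo.h: users call the macro, so the user-facing kernel-doc belongs here. */
/**
 * foo_register - register a new foo device
 * @parent: parent device
 *
 * Returns zero on success or a negative error code.
 */
#define foo_register(parent) \
	__foo_register(THIS_MODULE, parent)

/*
 * foo.c: the comment on the implementation must name __foo_register, not
 * foo_register, otherwise the kernel-doc tooling warns about a mismatch.
 */
int __foo_register(struct module *owner, struct device *parent);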
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/2fb6efd6a1f90d69ff73bf579566079cbb051e15.1603469755.git.mchehab+huawei@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/class.c | 2 +- drivers/base/devres.c | 2 +- drivers/base/firmware_loader/fallback.c | 2 +- include/linux/device/class.h | 14 ++++++++++++++ 4 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/class.c b/drivers/base/class.c index c3451481194e..7476f393df97 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -210,7 +210,7 @@ static void class_create_release(struct class *cls) } /** - * class_create - create a struct class structure + * __class_create - create a struct class structure * @owner: pointer to the module that is to "own" this struct class * @name: pointer to a string for the name of this class. * @key: the lock_class_key for this class; used by mutex lock debugging diff --git a/drivers/base/devres.c b/drivers/base/devres.c index 586e9a75c840..fb9d5289a620 100644 --- a/drivers/base/devres.c +++ b/drivers/base/devres.c @@ -149,7 +149,7 @@ void * __devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid EXPORT_SYMBOL_GPL(__devres_alloc_node); #else /** - * devres_alloc - Allocate device resource data + * devres_alloc_node - Allocate device resource data * @release: Release function devres will be associated with * @size: Allocation size * @gfp: Allocation flags diff --git a/drivers/base/firmware_loader/fallback.c b/drivers/base/firmware_loader/fallback.c index 4dec4b79ae06..91899d185e31 100644 --- a/drivers/base/firmware_loader/fallback.c +++ b/drivers/base/firmware_loader/fallback.c @@ -128,7 +128,7 @@ static ssize_t timeout_show(struct class *class, struct class_attribute *attr, } /** - * firmware_timeout_store() - set number of seconds to wait for firmware + * timeout_store() - set number of seconds to wait for firmware * @class: device class pointer * @attr: device attribute pointer * @buf: buffer to scan for timeout value diff --git a/include/linux/device/class.h b/include/linux/device/class.h index e8d470c457d1..e61ec5502019 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -256,6 +256,20 @@ extern void class_destroy(struct class *cls); /* This is a #define to keep the compiler from merging different * instances of the __key variable */ + +/** + * class_create - create a struct class structure + * @owner: pointer to the module that is to "own" this struct class + * @name: pointer to a string for the name of this class. + * + * This is used to create a struct class pointer that can then be used + * in calls to device_create(). + * + * Returns &struct class pointer on success, or ERR_PTR() on error. + * + * Note, the pointer created here is to be destroyed when finished by + * making a call to class_destroy(). + */ #define class_create(owner, name) \ ({ \ static struct lock_class_key __key; \ -- cgit From a18394269fc87276963e8d965c730900178d7e4b Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 7 Nov 2020 21:49:07 +0100 Subject: net: core: add dev_get_tstats64 as a ndo_get_stats64 implementation It's a frequent pattern to use netdev->stats for the less frequently accessed counters and per-cpu counters for the frequently accessed counters (rx/tx bytes/packets). Add a default ndo_get_stats64() implementation for this use case. 
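As a sketch of the intended usage (a hypothetical foo driver, not part of this patch), a driver that keeps the hot rx/tx counters in per-cpu dev->tstats and everything else in dev->stats can plug the helper straight into its netdev ops:

static const struct net_device_ops foo_netdev_ops = {
	.ndo_open		= foo_open,
	.ndo_start_xmit		= foo_start_xmit,
	.ndo_get_stats64	= dev_get_tstats64,
};

static int foo_dev_init(struct net_device *dev)
{
	/* dev->tstats must be per-cpu allocated for dev_get_tstats64() */
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	return dev->tstats ? 0 : -ENOMEM;
}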
Reviewed-by: Florian Fainelli Signed-off-by: Heiner Kallweit Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + net/core/dev.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a53ed2d1ed1d..7ce648a564f7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4527,6 +4527,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats); void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; diff --git a/net/core/dev.c b/net/core/dev.c index bd6100da66f4..60d325bda0d7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10366,6 +10366,21 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); +/** + * dev_get_tstats64 - ndo_get_stats64 implementation + * @dev: device to get statistics from + * @s: place to store stats + * + * Populate @s from dev->stats and dev->tstats. Can be used as + * ndo_get_stats64() callback. + */ +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_sw_netstats(s, dev->tstats); +} +EXPORT_SYMBOL_GPL(dev_get_tstats64); + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); -- cgit From 9a2a9ebc0a758d887ee06e067e9f7f0b36ff7574 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:25:57 +0100 Subject: cpufreq: Introduce governor flags A new cpufreq governor flag will be added subsequently, so replace the bool dynamic_switching field in struct cpufreq_governor with a flags field and introduce CPUFREQ_GOV_DYNAMIC_SWITCHING to be set for the "dynamic switching" governors instead. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 2 +- drivers/cpufreq/cpufreq_governor.h | 2 +- include/linux/cpufreq.h | 9 +++++++-- kernel/sched/cpufreq_schedutil.c | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 336b5e94cbc8..0252903f1b43 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2254,7 +2254,7 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy) return -EINVAL; /* Platform doesn't want dynamic frequency switching ?
*/ - if (policy->governor->dynamic_switching && + if (policy->governor->flags & CPUFREQ_GOV_DYNAMIC_SWITCHING && cpufreq_driver->flags & CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING) { struct cpufreq_governor *gov = cpufreq_fallback_governor(); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index c56773c25757..bab8e6140377 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -156,7 +156,7 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy); #define CPUFREQ_DBS_GOVERNOR_INITIALIZER(_name_) \ { \ .name = _name_, \ - .dynamic_switching = true, \ + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, \ .owner = THIS_MODULE, \ .init = cpufreq_dbs_governor_init, \ .exit = cpufreq_dbs_governor_exit, \ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1eaa04f1bae6..9bdfcf3c4748 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -570,12 +570,17 @@ struct cpufreq_governor { char *buf); int (*store_setspeed) (struct cpufreq_policy *policy, unsigned int freq); - /* For governors which change frequency dynamically by themselves */ - bool dynamic_switching; struct list_head governor_list; struct module *owner; + u8 flags; }; +/* Governor flags */ + +/* For governors which change frequency dynamically by themselves */ +#define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) + + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d73bccde2720..97d318b0cd0c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -881,7 +881,7 @@ static void sugov_limits(struct cpufreq_policy *policy) struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, - .dynamic_switching = true, + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, -- cgit From 218f66870181bec7aaa6e3c72f346039c590c3c2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:26:10 +0100 Subject: cpufreq: Introduce CPUFREQ_GOV_STRICT_TARGET Introduce a new governor flag, CPUFREQ_GOV_STRICT_TARGET, for the governors that want the target frequency to be set exactly to the given value without leaving any room for adjustments on the hardware side and set this flag for the powersave and performance governors. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq_performance.c | 1 + drivers/cpufreq/cpufreq_powersave.c | 1 + include/linux/cpufreq.h | 3 +++ 3 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index 71c1d9aba772..addd93f2a420 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -20,6 +20,7 @@ static void cpufreq_gov_performance_limits(struct cpufreq_policy *policy) static struct cpufreq_governor cpufreq_gov_performance = { .name = "performance", .owner = THIS_MODULE, + .flags = CPUFREQ_GOV_STRICT_TARGET, .limits = cpufreq_gov_performance_limits, }; diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index 7749522355b5..8d830d860e91 100644 --- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -21,6 +21,7 @@ static struct cpufreq_governor cpufreq_gov_powersave = { .name = "powersave", .limits = cpufreq_gov_powersave_limits, .owner = THIS_MODULE, + .flags = CPUFREQ_GOV_STRICT_TARGET, }; MODULE_AUTHOR("Dominik Brodowski "); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9bdfcf3c4748..6eb9a3b8ec7b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -580,6 +580,9 @@ struct cpufreq_governor { /* For governors which change frequency dynamically by themselves */ #define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) +/* For governors wanting the target frequency to be set exactly */ +#define CPUFREQ_GOV_STRICT_TARGET BIT(1) + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); -- cgit From ea9364bbadf11f0c55802cf11387d74f524cee84 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:26:37 +0100 Subject: cpufreq: Add strict_target to struct cpufreq_policy Add a new field to struct cpufreq_policy, to be set when the CPUFREQ_GOV_STRICT_TARGET flag is set for the current governor, so that the drivers needing to check CPUFREQ_GOV_STRICT_TARGET do not have to access the governor object during every frequency transition. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 2 ++ include/linux/cpufreq.h | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0252903f1b43..1e7e3f2ff09f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2280,6 +2280,8 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy) } } + policy->strict_target = !!(policy->governor->flags & CPUFREQ_GOV_STRICT_TARGET); + return 0; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 6eb9a3b8ec7b..acbad3b36322 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -109,6 +109,12 @@ struct cpufreq_policy { bool fast_switch_possible; bool fast_switch_enabled; + /* + * Set if the CPUFREQ_GOV_STRICT_TARGET flag is set for the current + * governor. + */ + bool strict_target; + /* * Preferred average time interval between consecutive invocations of * the driver to set the frequency for this policy. To be set by the -- cgit From a8b62fd0850503cf1e557d7e5a98d3f1f5c25eef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Sep 2020 12:58:17 +0200 Subject: stop_machine: Add function and caller debug info Crashes in stop-machine are hard to connect to the calling code; add a little something to help with that.
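To sketch what this buys (my_stop_fn and my_caller are made-up names, and the printed line is illustrative output only, derived from the format string in the diff below): callers record their return address implicitly, and the stopper reports both the function it is running and who queued it:

	/* call site: work.caller is set to _RET_IP_ behind the scenes */
	ret = stop_one_cpu(cpu, my_stop_fn, &arg);

	/*
	 * sched_show_task() on a wedged stopper thread would then print
	 * something like:
	 *
	 *   Stopper: my_stop_fn+0x0/0x40 <- my_caller+0x1c/0x80
	 */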
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.116513635@infradead.org --- include/linux/stop_machine.h | 5 +++++ kernel/sched/core.c | 1 + kernel/stop_machine.c | 27 ++++++++++++++++++++++++--- lib/dump_stack.c | 2 ++ 4 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 76d8b09384a7..30577c3aecf8 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg); struct cpu_stop_work { struct list_head list; /* cpu_stopper->works */ cpu_stop_fn_t fn; + unsigned long caller; void *arg; struct cpu_stop_done *done; }; @@ -36,6 +37,8 @@ void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); void stop_machine_yield(const struct cpumask *cpumask); +extern void print_stop_info(const char *log_lvl, struct task_struct *task); + #else /* CONFIG_SMP */ #include @@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, return false; } +static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { } + #endif /* CONFIG_SMP */ /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..5e24104faafd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6447,6 +6447,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); + print_stop_info(KERN_INFO, p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 865bb0228ab6..3cf567c2019d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -42,11 +42,27 @@ struct cpu_stopper { struct list_head works; /* list of pending works */ struct cpu_stop_work stop_work; /* for stop_cpus */ + unsigned long caller; + cpu_stop_fn_t fn; }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; +void print_stop_info(const char *log_lvl, struct task_struct *task) +{ + /* + * If @task is a stopper task, it cannot migrate and task_cpu() is + * stable. 
+ */ + struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task)); + + if (task != stopper->thread) + return; + + printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); +} + /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); static bool stop_cpus_in_progress; @@ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) { struct cpu_stop_done done; - struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) @@ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * work1 = work2 = (struct cpu_stop_work){ .fn = multi_cpu_stop, .arg = &msdata, - .done = &done + .done = &done, + .caller = _RET_IP_, }; cpu_stop_init_done(&done, 2); @@ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { - *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; return cpu_stop_queue_work(cpu, work_buf); } @@ -487,6 +504,8 @@ repeat: int ret; /* cpu stop callbacks must not sleep, make in_atomic() == T */ + stopper->caller = work->caller; + stopper->fn = fn; preempt_count_inc(); ret = fn(arg); if (done) { @@ -495,6 +514,8 @@ repeat: cpu_stop_signal_done(done); } preempt_count_dec(); + stopper->fn = NULL; + stopper->caller = 0; WARN_ONCE(preempt_count(), "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; diff --git a/lib/dump_stack.c b/lib/dump_stack.c index a00ee6eedc7c..f5a33b6f773f 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -12,6 +12,7 @@ #include #include #include +#include static char dump_stack_arch_desc_str[128]; @@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl) log_lvl, dump_stack_arch_desc_str); print_worker_info(log_lvl, current); + print_stop_info(log_lvl, current); } /** -- cgit From 1cf12e08bc4d50a76b80c42a3109c53d8794a0c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Sep 2020 09:27:18 +0200 Subject: sched/hotplug: Consolidate task migration on CPU unplug With the new mechanism which kicks tasks off the outgoing CPU at the end of schedule() the situation on an outgoing CPU right before the stopper thread brings it down completely is: - All user tasks and all unbound kernel threads have either been migrated away or are not running and the next wakeup will move them to an online CPU. - All per CPU kernel threads, except the cpu hotplug thread and the stopper thread, have either been unbound or parked by the responsible CPU hotplug callback. That means that at the last step before the stopper thread is invoked the cpu hotplug thread is the last legitimate running task on the outgoing CPU. Add a final wait step right before the stopper thread is kicked which ensures that any still running tasks on the way to park or on the way to kick themselves off the CPU are either sleeping or gone. This allows us to remove the migrate_tasks() crutch in sched_cpu_dying(). If sched_cpu_dying() detects that there is still another running task aside from the stopper thread then it will explode with the appropriate fireworks.
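The wait primitive itself is not visible in this diff; as a minimal sketch, assuming the rcuwait-based helper used elsewhere in this series (the exact wait condition may differ), sched_cpu_wait_empty() boils down to:

static void balance_hotplug_wait(void)
{
	struct rq *rq = this_rq();

	/* Sleep until only the hotplug thread and the stopper remain. */
	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
			   TASK_UNINTERRUPTIBLE);
}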
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.547163969@infradead.org --- include/linux/cpuhotplug.h | 1 + include/linux/sched/hotplug.h | 2 + kernel/cpu.c | 9 ++- kernel/sched/core.c | 154 ++++++++++-------------------------------- 4 files changed, 46 insertions(+), 120 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index bc56287a1ed1..0042ef362511 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -152,6 +152,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_AP_ONLINE_IDLE, + CPUHP_AP_SCHED_WAIT_EMPTY, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 9a62ffdd296f..412cdaba33eb 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu); extern int sched_cpu_deactivate(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_wait_empty(unsigned int cpu); extern int sched_cpu_dying(unsigned int cpu); #else +# define sched_cpu_wait_empty NULL # define sched_cpu_dying NULL #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ff2578ecf17..fa535eaa4826 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { .name = "ap:online", }, /* - * Handled on controll processor until the plugged processor manages + * Handled on control processor until the plugged processor manages * this itself. */ [CPUHP_TEARDOWN_CPU] = { @@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = takedown_cpu, .cant_stop = true, }, + + [CPUHP_AP_SCHED_WAIT_EMPTY] = { + .name = "sched:waitempty", + .startup.single = NULL, + .teardown.single = sched_cpu_wait_empty, + }, + /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e1093c443ff9..6c89806c834b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6741,120 +6741,6 @@ void idle_task_exit(void) /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } -/* - * Since this CPU is going 'away' for a while, fold any nr_active delta - * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. We need to take the teardown thread which - * is calling this into account, so we hand in adjust = 1 to the load - * calculation. - * - * Also see the comment "Global load-average calculations". - */ -static void calc_load_migrate(struct rq *rq) -{ - long delta = calc_load_fold_active(rq, 1); - if (delta) - atomic_long_add(delta, &calc_load_tasks); -} - -static struct task_struct *__pick_migrate_task(struct rq *rq) -{ - const struct sched_class *class; - struct task_struct *next; - - for_each_class(class) { - next = class->pick_next_task(rq); - if (next) { - next->sched_class->put_prev_task(rq, next); - return next; - } - } - - /* The idle class should always have a runnable task */ - BUG(); -} - -/* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). 
- * - * Called with rq->lock held even though we'er in stop_machine() and - * there's no concurrency possible, we hold the required locks anyway - * because of lock validation efforts. - */ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) -{ - struct rq *rq = dead_rq; - struct task_struct *next, *stop = rq->stop; - struct rq_flags orf = *rf; - int dest_cpu; - - /* - * Fudge the rq selection such that the below task selection loop - * doesn't get stuck on the currently eligible stop task. - * - * We're currently inside stop_machine() and the rq is either stuck - * in the stop_machine_cpu_stop() loop, or we're executing this code, - * either way we should never end up calling schedule() until we're - * done here. - */ - rq->stop = NULL; - - /* - * put_prev_task() and pick_next_task() sched - * class method both need to have an up-to-date - * value of rq->clock[_task] - */ - update_rq_clock(rq); - - for (;;) { - /* - * There's this thread running, bail when that's the only - * remaining thread: - */ - if (rq->nr_running == 1) - break; - - next = __pick_migrate_task(rq); - - /* - * Rules for changing task_struct::cpus_mask are holding - * both pi_lock and rq->lock, such that holding either - * stabilizes the mask. - * - * Drop rq->lock is not quite as disastrous as it usually is - * because !cpu_active at this point, which means load-balance - * will not interfere. Also, stop-machine. - */ - rq_unlock(rq, rf); - raw_spin_lock(&next->pi_lock); - rq_relock(rq, rf); - - /* - * Since we're inside stop-machine, _nothing_ should have - * changed the task, WARN if weird stuff happened, because in - * that case the above rq->lock drop is a fail too. - */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { - raw_spin_unlock(&next->pi_lock); - continue; - } - - /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); - rq = __migrate_task(rq, rf, next, dest_cpu); - if (rq != dead_rq) { - rq_unlock(rq, rf); - rq = dead_rq; - *rf = orf; - rq_relock(rq, rf); - } - raw_spin_unlock(&next->pi_lock); - } - - rq->stop = stop; -} - static int __balance_push_cpu_stop(void *arg) { struct task_struct *p = arg; @@ -7123,10 +7009,6 @@ int sched_cpu_deactivate(unsigned int cpu) return ret; } sched_domains_numa_masks_clear(cpu); - - /* Wait for all non per CPU kernel threads to vanish. */ - balance_hotplug_wait(); - return 0; } @@ -7146,6 +7028,41 @@ int sched_cpu_starting(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU + +/* + * Invoked immediately before the stopper thread is invoked to bring the + * CPU down completely. At this point all per CPU kthreads except the + * hotplug thread (current) and the stopper thread (inactive) have been + * either parked or have been unbound from the outgoing CPU. Ensure that + * any of those which might be on the way out are gone. + * + * If after this point a bound task is being woken on this CPU then the + * responsible hotplug callback has failed to do it's job. + * sched_cpu_dying() will catch it with the appropriate fireworks. + */ +int sched_cpu_wait_empty(unsigned int cpu) +{ + balance_hotplug_wait(); + return 0; +} + +/* + * Since this CPU is going 'away' for a while, fold any nr_active delta we + * might have. Called from the CPU stopper task after ensuring that the + * stopper is the last running task on the CPU, so nr_active count is + * stable. We need to take the teardown thread which is calling this into + * account, so we hand in adjust = 1 to the load calculation. 
+ * + * Also see the comment "Global load-average calculations". + */ +static void calc_load_migrate(struct rq *rq) +{ + long delta = calc_load_fold_active(rq, 1); + + if (delta) + atomic_long_add(delta, &calc_load_tasks); +} + int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); @@ -7159,7 +7076,6 @@ int sched_cpu_dying(unsigned int cpu) BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq, &rf); BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); -- cgit From af449901b84c98cbd84a0113223ba3bcfcb12a26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Sep 2020 10:38:30 +0200 Subject: sched: Add migrate_disable() Add the base migrate_disable() support (under protest). While migrate_disable() is (currently) required for PREEMPT_RT, it is also one of the biggest flaws in the system. Notably this is just the base implementation, it is broken vs sched_setaffinity() and hotplug, both solved in additional patches for ease of review. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.818170844@infradead.org --- include/linux/preempt.h | 65 ++++++++++++++++++++++++++++ include/linux/sched.h | 3 ++ kernel/sched/core.c | 112 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 6 ++- lib/smp_processor_id.c | 5 +++ 5 files changed, 183 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 7d9c1c0e149c..97ba7c920653 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -322,6 +322,69 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #endif +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + +/* + * Migrate-Disable and why it is (strongly) undesired. + * + * The premise of the Real-Time schedulers we have on Linux + * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks + * concurrently, provided there are sufficient runnable tasks, also known as + * work-conserving. For instance SCHED_DEADLINE tries to schedule the M + * earliest deadline threads, and SCHED_FIFO the M highest priority threads. + * + * The correctness of various scheduling models depends on this, but is it + * broken by migrate_disable() that doesn't imply preempt_disable(). Where + * preempt_disable() implies an immediate priority ceiling, preemptible + * migrate_disable() allows nesting. + * + * The worst case is that all tasks preempt one another in a migrate_disable() + * region and stack on a single CPU. This then reduces the available bandwidth + * to a single CPU. And since Real-Time schedulability theory considers the + * Worst-Case only, all Real-Time analysis shall revert to single-CPU + * (instantly solving the SMP analysis problem). + * + * + * The reason we have it anyway. + * + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a + * number of primitives into becoming preemptible, they would also allow + * migration. This turns out to break a bunch of per-cpu usage. To this end, + * all these primitives employ migirate_disable() to restore this implicit + * assumption. + * + * This is a 'temporary' work-around at best. The correct solution is getting + * rid of the above assumptions and reworking the code to employ explicit + * per-cpu locking or short preempt-disable regions. 
+ * + * The end goal must be to get rid of migrate_disable(), alternatively we need + * a schedulability theory that does not depend on abritrary migration. + * + * + * Notes on the implementation. + * + * The implementation is particularly tricky since existing code patterns + * dictate neither migrate_disable() nor migrate_enable() is allowed to block. + * This means that it cannot use cpus_read_lock() to serialize against hotplug, + * nor can it easily migrate itself into a pending affinity mask change on + * migrate_enable(). + * + * + * Note: even non-work-conserving schedulers like semi-partitioned depends on + * migration, so migrate_disable() is not only a problem for + * work-conserving schedulers. + * + */ +extern void migrate_disable(void); +extern void migrate_enable(void); + +#elif defined(CONFIG_PREEMPT_RT) + +static inline void migrate_disable(void) { } +static inline void migrate_enable(void) { } + +#else /* !CONFIG_PREEMPT_RT */ + /** * migrate_disable - Prevent migration of the current task * @@ -352,4 +415,6 @@ static __always_inline void migrate_enable(void) preempt_enable(); } +#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */ + #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..0732356c0eca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -714,6 +714,9 @@ struct task_struct { int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t cpus_mask; +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + int migration_disabled; +#endif #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 396accb1d69c..6a3f1c2e185b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1696,6 +1696,61 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT_RT + +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags); + +static void migrate_disable_switch(struct rq *rq, struct task_struct *p) +{ + if (likely(!p->migration_disabled)) + return; + + if (p->cpus_ptr != &p->cpus_mask) + return; + + /* + * Violates locking rules! see comment in __do_set_cpus_allowed(). + */ + __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); +} + +void migrate_disable(void) +{ + if (current->migration_disabled++) + return; + + barrier(); +} +EXPORT_SYMBOL_GPL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + + if (--p->migration_disabled) + return; + + barrier(); + + if (p->cpus_ptr == &p->cpus_mask) + return; + + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); +} +EXPORT_SYMBOL_GPL(migrate_enable); + +static inline bool is_migration_disabled(struct task_struct *p) +{ + return p->migration_disabled; +} + +#endif + /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). 
@@ -1705,7 +1760,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; - if (is_per_cpu_kthread(p)) + if (is_per_cpu_kthread(p) || is_migration_disabled(p)) return cpu_online(cpu); return cpu_active(cpu); @@ -1826,6 +1881,11 @@ static int migration_cpu_stop(void *data) */ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { + if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { + p->cpus_ptr = new_mask; + return; + } + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } @@ -1836,7 +1896,22 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 struct rq *rq = task_rq(p); bool queued, running; - lockdep_assert_held(&p->pi_lock); + /* + * This here violates the locking rules for affinity, since we're only + * supposed to change these variables while holding both rq->lock and + * p->pi_lock. + * + * HOWEVER, it magically works, because ttwu() is the only code that + * accesses these variables under p->pi_lock and only does so after + * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() + * before finish_task(). + * + * XXX do further audits, this smells like something putrid. + */ + if (flags & SCA_MIGRATE_DISABLE) + SCHED_WARN_ON(!p->on_cpu); + else + lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -1887,9 +1962,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD) { + if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { /* - * Kernel threads are allowed on online && !active CPUs + * Kernel threads are allowed on online && !active CPUs. + * + * Specifically, migration_disabled() tasks must not fail the + * cpumask_any_and_distribute() pick below, esp. so on + * SCA_MIGRATE_ENABLE, otherwise we'll not call + * set_cpus_allowed_common() and actually reset p->cpus_ptr. */ cpu_valid_mask = cpu_online_mask; } @@ -1903,7 +1983,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(&p->cpus_mask, new_mask)) + if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask)) goto out; /* @@ -1995,6 +2075,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * Clearly, migrating tasks to offline CPUs is a fairly daft thing. */ WARN_ON_ONCE(!cpu_online(new_cpu)); + + WARN_ON_ONCE(is_migration_disabled(p)); #endif trace_sched_migrate_task(p, new_cpu); @@ -2325,6 +2407,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: + /* + * XXX When called from select_task_rq() we only + * hold p->pi_lock and again violate locking order. + * + * More yuck to audit. 
+ */ do_set_cpus_allowed(p, cpu_possible_mask); state = fail; break; @@ -2359,7 +2447,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else cpu = cpumask_any(p->cpus_ptr); @@ -2421,6 +2509,17 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, #endif /* CONFIG_SMP */ +#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT) + +static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } + +static inline bool is_migration_disabled(struct task_struct *p) +{ + return false; +} + +#endif + static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -4570,6 +4669,7 @@ static void __sched notrace __schedule(bool preempt) */ ++*switch_count; + migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); trace_sched_switch(preempt, prev, next); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0420d80fb250..72d8e47cf0bb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1902,14 +1902,16 @@ static inline bool sched_fair_runnable(struct rq *rq) extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); extern struct task_struct *pick_next_task_idle(struct rq *rq); +#define SCA_CHECK 0x01 +#define SCA_MIGRATE_DISABLE 0x02 +#define SCA_MIGRATE_ENABLE 0x04 + #ifdef CONFIG_SMP extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -#define SCA_CHECK 0x01 - extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); #endif diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 525222e4f409..faaa927ac2c8 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) if (current->nr_cpus_allowed == 1) goto out; +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + if (current->migration_disabled) + goto out; +#endif + /* * It is valid to assume CPU-locality during early bootup: */ -- cgit From 6d337eab041d56bb8f0e7794f39906c21054c512 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Sep 2020 17:24:31 +0200 Subject: sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Concurrent migrate_disable() and set_cpus_allowed_ptr() have interesting features. We rely on set_cpus_allowed_ptr() to not return until the task runs inside the provided mask. This expectation is exported to userspace. This means that any set_cpus_allowed_ptr() caller must wait until migrate_enable() allows migrations. At the same time, we don't want migrate_enable() to schedule, due to patterns like: preempt_disable(); migrate_disable(); ... migrate_enable(); preempt_enable(); And: raw_spin_lock(&B); spin_unlock(&A); this means that when migrate_enable() must restore the affinity mask, it cannot wait for completion thereof. Luck will have it that that is exactly the case where there is a pending set_cpus_allowed_ptr(), so let that provide storage for the async stop machine. Much thanks to Valentin who used TLA+ most effectively and found lots of 'interesting' cases.
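The resulting dance, as a simplified timeline (comments only, condensed from the description above; not code from the patch):

/*
 *   CPU0 (task p)                      CPU1 (affinity setter)
 *
 *   migrate_disable();
 *                                      set_cpus_allowed_ptr(p, mask);
 *                                        installs p->migration_pending,
 *                                        wait_for_completion(&pending->done);
 *   migrate_enable();
 *     must not sleep, so it reuses
 *     pending->{arg,stop_work} to queue
 *     migration_cpu_stop() asynchronously
 *   migration_cpu_stop();
 *     migrates p into its cpus_mask,
 *     complete_all(&pending->done);
 *                                      set_cpus_allowed_ptr() returns with
 *                                      p running inside @mask
 */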
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.921768277@infradead.org --- include/linux/sched.h | 1 + kernel/sched/core.c | 236 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 207 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0732356c0eca..90a0c92741d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -714,6 +714,7 @@ struct task_struct { int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t cpus_mask; + void *migration_pending; #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) int migration_disabled; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6a3f1c2e185b..0efc1e41bb60 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1732,15 +1732,26 @@ void migrate_enable(void) { struct task_struct *p = current; - if (--p->migration_disabled) + if (p->migration_disabled > 1) { + p->migration_disabled--; return; + } + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + preempt_disable(); + if (p->cpus_ptr != &p->cpus_mask) + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ barrier(); - - if (p->cpus_ptr == &p->cpus_mask) - return; - - __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + p->migration_disabled = 0; + preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -1805,8 +1816,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, } struct migration_arg { - struct task_struct *task; - int dest_cpu; + struct task_struct *task; + int dest_cpu; + struct set_affinity_pending *pending; +}; + +struct set_affinity_pending { + refcount_t refs; + struct completion done; + struct cpu_stop_work stop_work; + struct migration_arg arg; }; /* @@ -1838,16 +1857,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, */ static int migration_cpu_stop(void *data) { + struct set_affinity_pending *pending; struct migration_arg *arg = data; struct task_struct *p = arg->task; + int dest_cpu = arg->dest_cpu; struct rq *rq = this_rq(); + bool complete = false; struct rq_flags rf; /* * The original target CPU might have gone down and we might * be on another CPU but it doesn't matter. */ - local_irq_disable(); + local_irq_save(rf.flags); /* * We need to explicitly wake pending tasks before running * __migrate_task() such that we will not miss enforcing cpus_ptr @@ -1857,21 +1879,83 @@ static int migration_cpu_stop(void *data) raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); + + pending = p->migration_pending; /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ if (task_rq(p) == rq) { + if (is_migration_disabled(p)) + goto out; + + if (pending) { + p->migration_pending = NULL; + complete = true; + } + + /* migrate_enable() -- we must not race against SCA */ + if (dest_cpu < 0) { + /* + * When this was migrate_enable() but we no longer + * have a @pending, a concurrent SCA 'fixed' things + * and we should be valid again. Nothing to do. 
+ */ + if (!pending) { + WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); + goto out; + } + + dest_cpu = cpumask_any_distribute(&p->cpus_mask); + } + if (task_on_rq_queued(p)) - rq = __migrate_task(rq, &rf, p, arg->dest_cpu); + rq = __migrate_task(rq, &rf, p, dest_cpu); else - p->wake_cpu = arg->dest_cpu; + p->wake_cpu = dest_cpu; + + } else if (dest_cpu < 0) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that + * point we're a regular task again and not current anymore. + * + * A !PREEMPT kernel has a giant hole here, which makes it far + * more likely. + */ + + /* + * When this was migrate_enable() but we no longer have an + * @pending, a concurrent SCA 'fixed' things and we should be + * valid again. Nothing to do. + */ + if (!pending) { + WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); + goto out; + } + + /* + * When migrate_enable() hits a rq mis-match we can't reliably + * determine is_migration_disabled() and so have to chase after + * it. + */ + task_rq_unlock(rq, p, &rf); + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, + &pending->arg, &pending->stop_work); + return 0; } - rq_unlock(rq, &rf); - raw_spin_unlock(&p->pi_lock); +out: + task_rq_unlock(rq, p, &rf); + + if (complete) + complete_all(&pending->done); + + /* For pending->{arg,stop_work} */ + pending = arg->pending; + if (pending && refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); - local_irq_enable(); return 0; } @@ -1940,6 +2024,112 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) __do_set_cpus_allowed(p, new_mask, 0); } +/* + * This function is wildly self concurrent, consider at least 3 times. + */ +static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, + int dest_cpu, unsigned int flags) +{ + struct set_affinity_pending my_pending = { }, *pending = NULL; + struct migration_arg arg = { + .task = p, + .dest_cpu = dest_cpu, + }; + bool complete = false; + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + pending = p->migration_pending; + if (pending) { + refcount_inc(&pending->refs); + p->migration_pending = NULL; + complete = true; + } + task_rq_unlock(rq, p, rf); + + if (complete) + goto do_complete; + + return 0; + } + + if (!(flags & SCA_MIGRATE_ENABLE)) { + /* serialized by p->pi_lock */ + if (!p->migration_pending) { + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); + p->migration_pending = &my_pending; + } else { + pending = p->migration_pending; + refcount_inc(&pending->refs); + } + } + pending = p->migration_pending; + /* + * - !MIGRATE_ENABLE: + * we'll have installed a pending if there wasn't one already. + * + * - MIGRATE_ENABLE: + * we're here because the current CPU isn't matching anymore, + * the only way that can happen is because of a concurrent + * set_cpus_allowed_ptr() call, which should then still be + * pending completion. + * + * Either way, we really should have a @pending here. 
*/ + if (WARN_ON_ONCE(!pending)) { + task_rq_unlock(rq, p, rf); + return -EINVAL; + } + + if (flags & SCA_MIGRATE_ENABLE) { + + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + task_rq_unlock(rq, p, rf); + + pending->arg = (struct migration_arg) { + .task = p, + .dest_cpu = -1, + .pending = pending, + }; + + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + + return 0; + } + + if (task_running(rq, p) || p->state == TASK_WAKING) { + + task_rq_unlock(rq, p, rf); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + + } else { + + if (!is_migration_disabled(p)) { + if (task_on_rq_queued(p)) + rq = move_queued_task(rq, rf, p, dest_cpu); + + p->migration_pending = NULL; + complete = true; + } + task_rq_unlock(rq, p, rf); + +do_complete: + if (complete) + complete_all(&pending->done); + } + + wait_for_completion(&pending->done); + + if (refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); + + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + + return 0; +} + /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -2009,23 +2199,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, p->nr_cpus_allowed != 1); } - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; + return affine_move_task(rq, p, &rf, dest_cpu, flags); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &rf); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - return 0; - } else if (task_on_rq_queued(p)) { - /* - * OK, since we're going to drop the lock immediately - * afterwards anyway. - */ - rq = move_queued_task(rq, &rf, p, dest_cpu); - } out: task_rq_unlock(rq, p, &rf); @@ -3205,6 +3380,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; #endif } -- cgit From 14e292f8d45380c519a83d9b0f37089a17eedcdf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Oct 2020 15:54:14 +0200 Subject: sched,rt: Use cpumask_any*_distribute() Replace a bunch of cpumask_any*() instances with cpumask_any*_distribute(). By injecting this little bit of randomness into CPU selection, we reduce the chance that two competing balance operations working off the same lowest_mask pick the same CPU.
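To sketch the behavioural difference (the CPU numbers are hypothetical): cpumask_any() always resolves to the first set bit, while the _distribute() variants keep a per-CPU rotor so that successive calls on the same CPU walk the mask:

	/* lowest_mask has CPUs {2, 5, 7} set; rotor previously left at 2 */
	cpu = cpumask_any_distribute(lowest_mask);	/* -> 5, rotor := 5 */
	cpu = cpumask_any_distribute(lowest_mask);	/* -> 7, rotor := 7 */
	cpu = cpumask_any_distribute(lowest_mask);	/* -> 2, wraps around */

Two runqueues racing to pick from the same lowest_mask therefore tend to choose different target CPUs instead of piling onto the first eligible one.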
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102347.190759694@infradead.org --- include/linux/cpumask.h | 6 ++++++ kernel/sched/deadline.c | 6 +++--- kernel/sched/rt.c | 6 +++--- lib/cpumask.c | 18 ++++++++++++++++++ 4 files changed, 30 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index f0d895d6ac39..383684e30f12 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p, return cpumask_next_and(-1, src1p, src2p); } +static inline int cpumask_any_distribute(const struct cpumask *srcp) +{ + return cpumask_first(srcp); +} + #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ @@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); unsigned int cpumask_local_spread(unsigned int i, int node); int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); +int cpumask_any_distribute(const struct cpumask *srcp); /** * for_each_cpu - iterate over every cpu in a mask diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e97c7c2708bc..206a0703fcbc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2002,8 +2002,8 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(later_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(later_mask, + sched_domain_span(sd)); /* * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our @@ -2025,7 +2025,7 @@ static int find_later_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(later_mask); + cpu = cpumask_any_distribute(later_mask); if (cpu < nr_cpu_ids) return cpu; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 40a46639f78a..2525a1beed26 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1752,8 +1752,8 @@ static int find_lowest_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(lowest_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(lowest_mask, + sched_domain_span(sd)); if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; @@ -1770,7 +1770,7 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(lowest_mask); + cpu = cpumask_any_distribute(lowest_mask); if (cpu < nr_cpu_ids) return cpu; diff --git a/lib/cpumask.c b/lib/cpumask.c index 85da6ab4fbb5..35924025097b 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p, return next; } EXPORT_SYMBOL(cpumask_any_and_distribute); + +int cpumask_any_distribute(const struct cpumask *srcp) +{ + int next, prev; + + /* NOTE: our first selection will skip 0. 
*/ + prev = __this_cpu_read(distribute_cpu_mask_prev); + + next = cpumask_next(prev, srcp); + if (next >= nr_cpu_ids) + next = cpumask_first(srcp); + + if (next < nr_cpu_ids) + __this_cpu_write(distribute_cpu_mask_prev, next); + + return next; +} +EXPORT_SYMBOL(cpumask_any_distribute); -- cgit From a7c81556ec4d341dfdbf2cc478ead89d73e474a7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2020 17:06:07 +0200 Subject: sched: Fix migrate_disable() vs rt/dl balancing In order to minimize the interference of migrate_disable() on lower priority tasks, which can be deprived of runtime due to being stuck below a higher priority task, teach the RT/DL balancers to push away these higher priority tasks when a lower priority task gets selected to run on a freshly demoted CPU (pull). This adds migration interference to the higher priority task, but restores bandwidth to the system that would otherwise be irrevocably lost. Without this it would be possible to have all tasks on the system stuck on a single CPU, each task preempted in a migrate_disable() section with a single high priority task running. This way we can still approximate running the M highest priority tasks on the system. Migrating the top task away is (of course) still subject to migrate_disable() too, which means the lower task is subject to an interference equivalent to the worst case migrate_disable() section. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102347.499155098@infradead.org --- include/linux/preempt.h | 40 ++++++++++++++++------------- include/linux/sched.h | 3 ++- kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++++++++-------- kernel/sched/deadline.c | 29 +++++++++++++++------ kernel/sched/rt.c | 63 +++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 32 +++++++++++++++++++++++ 6 files changed, 186 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 97ba7c920653..8b43922e65df 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -325,24 +325,28 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) /* - * Migrate-Disable and why it is (strongly) undesired. - * - * The premise of the Real-Time schedulers we have on Linux - * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks - * concurrently, provided there are sufficient runnable tasks, also known as - * work-conserving. For instance SCHED_DEADLINE tries to schedule the M - * earliest deadline threads, and SCHED_FIFO the M highest priority threads. - * - * The correctness of various scheduling models depends on this, but is it - * broken by migrate_disable() that doesn't imply preempt_disable(). Where - * preempt_disable() implies an immediate priority ceiling, preemptible - * migrate_disable() allows nesting. - * - * The worst case is that all tasks preempt one another in a migrate_disable() - * region and stack on a single CPU. This then reduces the available bandwidth - * to a single CPU. And since Real-Time schedulability theory considers the - * Worst-Case only, all Real-Time analysis shall revert to single-CPU - * (instantly solving the SMP analysis problem). + * Migrate-Disable and why it is undesired.
+ * + * When a preempted task becomes elegible to run under the ideal model (IOW it + * becomes one of the M highest priority tasks), it might still have to wait + * for the preemptee's migrate_disable() section to complete. Thereby suffering + * a reduction in bandwidth in the exact duration of the migrate_disable() + * section. + * + * Per this argument, the change from preempt_disable() to migrate_disable() + * gets us: + * + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() + * it would have had to wait for the lower priority task. + * + * - a lower priority tasks; which under preempt_disable() could've instantly + * migrated away when another CPU becomes available, is now constrained + * by the ability to push the higher priority task away, which might itself be + * in a migrate_disable() section, reducing it's available bandwidth. + * + * IOW it trades latency / moves the interference term, but it stays in the + * system, and as long as it remains unbounded, the system is not fully + * deterministic. * * * The reason we have it anyway. diff --git a/include/linux/sched.h b/include/linux/sched.h index 90a0c92741d7..3af9d52fe093 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -716,8 +716,9 @@ struct task_struct { cpumask_t cpus_mask; void *migration_pending; #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) - int migration_disabled; + unsigned short migration_disabled; #endif + unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9ce2fc7d3d51..e92d7853057c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1763,11 +1763,6 @@ void migrate_enable(void) } EXPORT_SYMBOL_GPL(migrate_enable); -static inline bool is_migration_disabled(struct task_struct *p) -{ - return p->migration_disabled; -} - static inline bool rq_has_pinned_tasks(struct rq *rq) { return rq->nr_pinned; @@ -1972,6 +1967,49 @@ out: return 0; } +int push_cpu_stop(void *arg) +{ + struct rq *lowest_rq = NULL, *rq = this_rq(); + struct task_struct *p = arg; + + raw_spin_lock_irq(&p->pi_lock); + raw_spin_lock(&rq->lock); + + if (task_rq(p) != rq) + goto out_unlock; + + if (is_migration_disabled(p)) { + p->migration_flags |= MDF_PUSH; + goto out_unlock; + } + + p->migration_flags &= ~MDF_PUSH; + + if (p->sched_class->find_lock_rq) + lowest_rq = p->sched_class->find_lock_rq(p, rq); + + if (!lowest_rq) + goto out_unlock; + + // XXX validate p is still the highest prio task + if (task_rq(p) == rq) { + deactivate_task(rq, p, 0); + set_task_cpu(p, lowest_rq->cpu); + activate_task(lowest_rq, p, 0); + resched_curr(lowest_rq); + } + + double_unlock_balance(rq, lowest_rq); + +out_unlock: + rq->push_busy = false; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + return 0; +} + /* * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. @@ -2052,6 +2090,14 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag /* Can the task run on the task's current CPU? 
If so, we're done */ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + struct task_struct *push_task = NULL; + + if ((flags & SCA_MIGRATE_ENABLE) && + (p->migration_flags & MDF_PUSH) && !rq->push_busy) { + rq->push_busy = true; + push_task = get_task_struct(p); + } + pending = p->migration_pending; if (pending) { refcount_inc(&pending->refs); @@ -2060,6 +2106,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } task_rq_unlock(rq, p, rf); + if (push_task) { + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + p, &rq->push_work); + } + if (complete) goto do_complete; @@ -2098,6 +2149,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (flags & SCA_MIGRATE_ENABLE) { refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + p->migration_flags &= ~MDF_PUSH; task_rq_unlock(rq, p, rf); pending->arg = (struct migration_arg) { @@ -2716,11 +2768,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } -static inline bool is_migration_disabled(struct task_struct *p) -{ - return false; -} - static inline bool rq_has_pinned_tasks(struct rq *rq) { return false; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3d3fd8370447..eed2e449b313 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2129,6 +2129,9 @@ static int push_dl_task(struct rq *rq) return 0; retry: + if (is_migration_disabled(next_task)) + return 0; + if (WARN_ON(next_task == rq->curr)) return 0; @@ -2206,7 +2209,7 @@ static void push_dl_tasks(struct rq *rq) static void pull_dl_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; - struct task_struct *p; + struct task_struct *p, *push_task; bool resched = false; struct rq *src_rq; u64 dmin = LONG_MAX; @@ -2236,6 +2239,7 @@ static void pull_dl_task(struct rq *this_rq) continue; /* Might drop this_rq->lock */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2267,17 +2271,27 @@ static void pull_dl_task(struct rq *this_rq) src_rq->curr->dl.deadline)) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - dmin = p->dl.deadline; + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + resched = true; + } /* Is there any other task even earlier? */ } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + raw_spin_unlock(&this_rq->lock); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + raw_spin_lock(&this_rq->lock); + } } if (resched) @@ -2524,6 +2538,7 @@ const struct sched_class dl_sched_class .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, + .find_lock_rq = find_lock_later_rq, #endif .task_tick = task_tick_dl, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index cf63346a07e4..c592e47cafed 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) * running task can migrate over to a CPU that is running a task * of lesser priority. 
*/ -static int push_rt_task(struct rq *rq) +static int push_rt_task(struct rq *rq, bool pull) { struct task_struct *next_task; struct rq *lowest_rq; @@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq) return 0; retry: + if (is_migration_disabled(next_task)) { + struct task_struct *push_task = NULL; + int cpu; + + if (!pull || rq->push_busy) + return 0; + + cpu = find_lowest_rq(rq->curr); + if (cpu == -1 || cpu == rq->cpu) + return 0; + + /* + * Given we found a CPU with lower priority than @next_task, + * therefore it should be running. However we cannot migrate it + * to this other CPU, instead attempt to push the current + * running task on this CPU away. + */ + push_task = get_push_task(rq); + if (push_task) { + raw_spin_unlock(&rq->lock); + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + push_task, &rq->push_work); + raw_spin_lock(&rq->lock); + } + + return 0; + } + if (WARN_ON(next_task == rq->curr)) return 0; @@ -1927,12 +1955,10 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); - ret = 1; - resched_curr(lowest_rq); + ret = 1; double_unlock_balance(rq, lowest_rq); - out: put_task_struct(next_task); @@ -1942,7 +1968,7 @@ out: static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ - while (push_rt_task(rq)) + while (push_rt_task(rq, false)) ; } @@ -2095,7 +2121,8 @@ void rto_push_irq_work_func(struct irq_work *work) */ if (has_pushable_tasks(rq)) { raw_spin_lock(&rq->lock); - push_rt_tasks(rq); + while (push_rt_task(rq, true)) + ; raw_spin_unlock(&rq->lock); } @@ -2120,7 +2147,7 @@ static void pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; bool resched = false; - struct task_struct *p; + struct task_struct *p, *push_task; struct rq *src_rq; int rt_overload_count = rt_overloaded(this_rq); @@ -2167,6 +2194,7 @@ static void pull_rt_task(struct rq *this_rq) * double_lock_balance, and another CPU could * alter this_rq */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2194,11 +2222,14 @@ static void pull_rt_task(struct rq *this_rq) if (p->prio < src_rq->curr->prio) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + resched = true; + } /* * We continue with the search, just in * case there's an even higher prio task @@ -2208,6 +2239,13 @@ static void pull_rt_task(struct rq *this_rq) } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + raw_spin_unlock(&this_rq->lock); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + raw_spin_lock(&this_rq->lock); + } } if (resched) @@ -2449,6 +2487,7 @@ const struct sched_class rt_sched_class .rq_offline = rq_offline_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, + .find_lock_rq = find_lock_lowest_rq, #endif .task_tick = task_tick_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 42de1406c0dc..56992aaca48e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1057,6 +1057,8 @@ struct rq { #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) unsigned int nr_pinned; #endif + unsigned int push_busy; + struct cpu_stop_work push_work; }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1084,6 +1086,16 @@ static inline int cpu_of(struct rq *rq) #endif } +#define 
MDF_PUSH 0x01 + +static inline bool is_migration_disabled(struct task_struct *p) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + return p->migration_disabled; +#else + return false; +#endif +} #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); @@ -1823,6 +1835,8 @@ struct sched_class { void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + + struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); @@ -1918,6 +1932,24 @@ extern void trigger_load_balance(struct rq *rq); extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); +static inline struct task_struct *get_push_task(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + lockdep_assert_held(&rq->lock); + + if (rq->push_busy) + return NULL; + + if (p->nr_cpus_allowed == 1) + return NULL; + + rq->push_busy = true; + return get_task_struct(p); +} + +extern int push_cpu_stop(void *arg); + #endif #ifdef CONFIG_CPU_IDLE -- cgit From c250d50fe2ce627ca9805d9c8ac11cbbf922a4a6 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 5 Nov 2020 12:50:01 +0000 Subject: PM: EM: Add a flag indicating units of power values in Energy Model There are different platforms and devices which might use different scale for the power values. Kernel sub-systems might need to check if all Energy Model (EM) devices are using the same scale. Address that issue and store the information inside EM for each device. Thanks to that they can be easily compared and proper action triggered. Suggested-by: Daniel Lezcano Reviewed-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/scmi-cpufreq.c | 3 ++- drivers/opp/of.c | 2 +- include/linux/energy_model.h | 9 +++++++-- kernel/power/energy_model.c | 24 +++++++++++++++++++++++- 4 files changed, 33 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index e855e8612a67..3714a4cd07fa 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -188,7 +188,8 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) policy->fast_switch_possible = handle->perf_ops->fast_switch_possible(handle, cpu_dev); - em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus); + em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus, + false); return 0; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 9faeb83e4b32..16f39e2127a5 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1335,7 +1335,7 @@ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus) goto failed; } - ret = em_dev_register_perf_domain(dev, nr_opp, &em_cb, cpus); + ret = em_dev_register_perf_domain(dev, nr_opp, &em_cb, cpus, true); if (ret) goto failed; diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index b67a51c574b9..3a33c738d876 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -29,6 +29,8 @@ struct em_perf_state { * em_perf_domain - Performance domain * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states + * @milliwatts: Flag indicating the power values are in milli-Watts + * or some other scale. * @cpus: Cpumask covering the CPUs of the domain. 
It's here * for performance reasons to avoid potential cache * misses during energy calculations in the scheduler @@ -43,6 +45,7 @@ struct em_perf_state { struct em_perf_domain { struct em_perf_state *table; int nr_perf_states; + int milliwatts; unsigned long cpus[]; }; @@ -79,7 +82,8 @@ struct em_data_callback { struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span); + struct em_data_callback *cb, cpumask_t *span, + bool milliwatts); void em_dev_unregister_perf_domain(struct device *dev); /** @@ -186,7 +190,8 @@ struct em_data_callback {}; static inline int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span) + struct em_data_callback *cb, cpumask_t *span, + bool milliwatts) { return -EINVAL; } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index c1ff7fa030ab..efe2a595988e 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -52,6 +52,17 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); +static int em_debug_units_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + char *units = pd->milliwatts ? "milliWatts" : "bogoWatts"; + + seq_printf(s, "%s\n", units); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_units); + static void em_debug_create_pd(struct device *dev) { struct dentry *d; @@ -64,6 +75,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, &em_debug_cpus_fops); + debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); + /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) em_debug_create_ps(&dev->em_pd->table[i], d); @@ -250,17 +263,24 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * @cpus : Pointer to cpumask_t, which in case of a CPU device is * obligatory. It can be taken from i.e. 'policy->cpus'. For other * type of devices this should be set to NULL. + * @milliwatts : Flag indicating that the power values are in milliWatts or + * in some other scale. It must be set properly. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. * + * It is important to set @milliwatts to the correct value. Some kernel + * sub-systems might rely on this flag and check if all devices in the EM are + * using the same scale. + * * If multiple clients register the same performance domain, all but the first * registration will be ignored. * * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus) + struct em_data_callback *cb, cpumask_t *cpus, + bool milliwatts) { unsigned long cap, prev_cap = 0; int cpu, ret; @@ -313,6 +333,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, if (ret) goto unlock; + dev->em_pd->milliwatts = milliwatts; + em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); -- cgit From f2c90b12e700fff6a0b5a1c32f446f05f9d0890c Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 3 Nov 2020 09:05:59 +0000 Subject: PM: EM: update the comments related to power scale The Energy Model supports power values expressed in milli-Watts or in an 'abstract scale'. Update the related comments in the code to reflect that state.
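A minimal sketch (editorial addition, not part of either patch) of how a kernel sub-system might consume the new flag to check that all CPU Energy Models use one power scale before comparing their values. em_cpu_get() and the milliwatts member come from the hunks above; the helper name and iteration policy are illustrative assumptions:

#include <linux/cpumask.h>
#include <linux/energy_model.h>

/*
 * Hypothetical helper: returns true when every registered CPU EM
 * reports power on the same scale (all milli-Watts or all abstract).
 */
static bool em_cpus_share_power_scale(void)
{
	struct em_perf_domain *pd;
	int cpu, scale = -1;

	for_each_possible_cpu(cpu) {
		pd = em_cpu_get(cpu);
		if (!pd)
			continue;	/* no EM registered for this CPU */
		if (scale < 0)
			scale = pd->milliwatts;	/* first domain sets the scale */
		else if (pd->milliwatts != scale)
			return false;	/* mixed mW and abstract values */
	}

	return true;
}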
Reviewed-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 11 +++++------ kernel/power/energy_model.c | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 3a33c738d876..9618c0a46ef4 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -13,9 +13,8 @@ /** * em_perf_state - Performance state of a performance domain * @frequency: The frequency in KHz, for consistency with CPUFreq - * @power: The power consumed at this level, in milli-watts (by 1 CPU or - by a registered device). It can be a total power: static and - dynamic. + * @power: The power consumed at this level (by 1 CPU or by a registered + * device). It can be a total power: static and dynamic. * @cost: The cost coefficient associated with this level, used during * energy calculation. Equal to: power * max_frequency / frequency */ @@ -58,7 +57,7 @@ struct em_data_callback { /** * active_power() - Provide power at the next performance state of * a device - * @power : Active power at the performance state in mW + * @power : Active power at the performance state * (modified) * @freq : Frequency at the performance state in kHz * (modified) @@ -69,8 +68,8 @@ struct em_data_callback { * and frequency. * * In case of CPUs, the power is the one of a single CPU in the domain, - * expressed in milli-watts. It is expected to fit in the - * [0, EM_MAX_POWER] range. + * expressed in milli-Watts or an abstract scale. It is expected to + * fit in the [0, EM_MAX_POWER] range. * * Return 0 on success. */ diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index efe2a595988e..1358fa4abfa8 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -143,7 +143,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* * The power returned by active_state() is expected to be - * positive, in milli-watts and to fit into 16 bits. + * positive and to fit into 16 bits. */ if (!power || power > EM_MAX_POWER) { dev_err(dev, "EM: invalid power: %lu\n", -- cgit From fc51989062138744b56e47190915ce68484e73f3 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 3 Nov 2020 16:06:25 +0100 Subject: PM: domains: Rename pm_genpd_syscore_poweroff|poweron() To better describe what the pm_genpd_syscore_poweroff|poweron() functions actually do, let's rename them to dev_pm_genpd_suspend|resume() and update the rather few callers of them accordingly (a couple of clocksource drivers). Moreover, let's take the opportunity to add some documentation of these exported functions, as that is currently missing. Signed-off-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 35 +++++++++++++++++++++-------------- drivers/clocksource/sh_cmt.c | 8 ++++---- drivers/clocksource/sh_mtu2.c | 4 ++-- drivers/clocksource/sh_tmu.c | 8 ++++---- include/linux/pm_domain.h | 8 ++++---- 5 files changed, 35 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 743268996336..9b4881b67683 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1363,14 +1363,7 @@ static void genpd_complete(struct device *dev) genpd_unlock(genpd); } -/** - * genpd_syscore_switch - Switch power during system core suspend or resume. - * @dev: Device that normally is marked as "always on" to switch power for. 
- * - * This routine may only be called during the system core (syscore) suspend or - * resume phase for devices whose "always on" flags are set. - */ -static void genpd_syscore_switch(struct device *dev, bool suspend) +static void genpd_switch_state(struct device *dev, bool suspend) { struct generic_pm_domain *genpd; @@ -1387,17 +1380,31 @@ static void genpd_syscore_switch(struct device *dev, bool suspend) } } -void pm_genpd_syscore_poweroff(struct device *dev) +/** + * dev_pm_genpd_suspend - Synchronously try to suspend the genpd for @dev + * @dev: The device that is attached to the genpd, that can be suspended. + * + * This routine should typically be called for a device that needs to be + * suspended during the syscore suspend phase. + */ +void dev_pm_genpd_suspend(struct device *dev) { - genpd_syscore_switch(dev, true); + genpd_switch_state(dev, true); } -EXPORT_SYMBOL_GPL(pm_genpd_syscore_poweroff); +EXPORT_SYMBOL_GPL(dev_pm_genpd_suspend); -void pm_genpd_syscore_poweron(struct device *dev) +/** + * dev_pm_genpd_resume - Synchronously try to resume the genpd for @dev + * @dev: The device that is attached to the genpd, which needs to be resumed. + * + * This routine should typically be called for a device that needs to be resumed + * during the syscore resume phase. + */ +void dev_pm_genpd_resume(struct device *dev) { - genpd_syscore_switch(dev, false); + genpd_switch_state(dev, false); } -EXPORT_SYMBOL_GPL(pm_genpd_syscore_poweron); +EXPORT_SYMBOL_GPL(dev_pm_genpd_resume); #else /* !CONFIG_PM_SLEEP */ diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 760777458a90..7275d95de435 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -658,7 +658,7 @@ static void sh_cmt_clocksource_suspend(struct clocksource *cs) return; sh_cmt_stop(ch, FLAG_CLOCKSOURCE); - pm_genpd_syscore_poweroff(&ch->cmt->pdev->dev); + dev_pm_genpd_suspend(&ch->cmt->pdev->dev); } static void sh_cmt_clocksource_resume(struct clocksource *cs) @@ -668,7 +668,7 @@ static void sh_cmt_clocksource_resume(struct clocksource *cs) if (!ch->cs_enabled) return; - pm_genpd_syscore_poweron(&ch->cmt->pdev->dev); + dev_pm_genpd_resume(&ch->cmt->pdev->dev); sh_cmt_start(ch, FLAG_CLOCKSOURCE); } @@ -760,7 +760,7 @@ static void sh_cmt_clock_event_suspend(struct clock_event_device *ced) { struct sh_cmt_channel *ch = ced_to_sh_cmt(ced); - pm_genpd_syscore_poweroff(&ch->cmt->pdev->dev); + dev_pm_genpd_suspend(&ch->cmt->pdev->dev); clk_unprepare(ch->cmt->clk); } @@ -769,7 +769,7 @@ static void sh_cmt_clock_event_resume(struct clock_event_device *ced) struct sh_cmt_channel *ch = ced_to_sh_cmt(ced); clk_prepare(ch->cmt->clk); - pm_genpd_syscore_poweron(&ch->cmt->pdev->dev); + dev_pm_genpd_resume(&ch->cmt->pdev->dev); } static int sh_cmt_register_clockevent(struct sh_cmt_channel *ch, diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c index bfccb31e94ad..169a1fccc497 100644 --- a/drivers/clocksource/sh_mtu2.c +++ b/drivers/clocksource/sh_mtu2.c @@ -297,12 +297,12 @@ static int sh_mtu2_clock_event_set_periodic(struct clock_event_device *ced) static void sh_mtu2_clock_event_suspend(struct clock_event_device *ced) { - pm_genpd_syscore_poweroff(&ced_to_sh_mtu2(ced)->mtu->pdev->dev); + dev_pm_genpd_suspend(&ced_to_sh_mtu2(ced)->mtu->pdev->dev); } static void sh_mtu2_clock_event_resume(struct clock_event_device *ced) { - pm_genpd_syscore_poweron(&ced_to_sh_mtu2(ced)->mtu->pdev->dev); + dev_pm_genpd_resume(&ced_to_sh_mtu2(ced)->mtu->pdev->dev); } static void 
sh_mtu2_register_clockevent(struct sh_mtu2_channel *ch, diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c index d41df9ba3725..b00dec0655cb 100644 --- a/drivers/clocksource/sh_tmu.c +++ b/drivers/clocksource/sh_tmu.c @@ -292,7 +292,7 @@ static void sh_tmu_clocksource_suspend(struct clocksource *cs) if (--ch->enable_count == 0) { __sh_tmu_disable(ch); - pm_genpd_syscore_poweroff(&ch->tmu->pdev->dev); + dev_pm_genpd_suspend(&ch->tmu->pdev->dev); } } @@ -304,7 +304,7 @@ static void sh_tmu_clocksource_resume(struct clocksource *cs) return; if (ch->enable_count++ == 0) { - pm_genpd_syscore_poweron(&ch->tmu->pdev->dev); + dev_pm_genpd_resume(&ch->tmu->pdev->dev); __sh_tmu_enable(ch); } } @@ -394,12 +394,12 @@ static int sh_tmu_clock_event_next(unsigned long delta, static void sh_tmu_clock_event_suspend(struct clock_event_device *ced) { - pm_genpd_syscore_poweroff(&ced_to_sh_tmu(ced)->tmu->pdev->dev); + dev_pm_genpd_suspend(&ced_to_sh_tmu(ced)->tmu->pdev->dev); } static void sh_tmu_clock_event_resume(struct clock_event_device *ced) { - pm_genpd_syscore_poweron(&ced_to_sh_tmu(ced)->tmu->pdev->dev); + dev_pm_genpd_resume(&ced_to_sh_tmu(ced)->tmu->pdev->dev); } static void sh_tmu_register_clockevent(struct sh_tmu_channel *ch, diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 1ad0ec481416..a8f93328daec 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -280,11 +280,11 @@ static inline int dev_pm_genpd_remove_notifier(struct device *dev) #endif #ifdef CONFIG_PM_GENERIC_DOMAINS_SLEEP -void pm_genpd_syscore_poweroff(struct device *dev); -void pm_genpd_syscore_poweron(struct device *dev); +void dev_pm_genpd_suspend(struct device *dev); +void dev_pm_genpd_resume(struct device *dev); #else -static inline void pm_genpd_syscore_poweroff(struct device *dev) {} -static inline void pm_genpd_syscore_poweron(struct device *dev) {} +static inline void dev_pm_genpd_suspend(struct device *dev) {} +static inline void dev_pm_genpd_resume(struct device *dev) {} #endif /* OF PM domain providers */ -- cgit From 36e68442d1afd4f720704ee1ea8486331507e834 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:31 -0800 Subject: bpf: Load and verify kernel module BTFs Add a kernel module listener that will load/validate and unload module BTF. Module BTFs get an ID generated for them, which makes it possible to iterate them with existing BTF iteration API. They are given their respective module's names, which will get reported through GET_OBJ_INFO API. They are also marked as in-kernel BTFs for tooling to distinguish them from user-provided BTFs. Also, similarly to vmlinux BTF, kernel module BTFs are exposed through sysfs as /sys/kernel/btf/. This is convenient for user-space tools to inspect module BTF contents and dump their types with existing tools: [vmuser@archvm bpf]$ ls -la /sys/kernel/btf total 0 drwxr-xr-x 2 root root 0 Nov 4 19:46 . drwxr-xr-x 13 root root 0 Nov 4 19:46 .. ...
-r--r--r-- 1 root root 888 Nov 4 19:46 irqbypass -r--r--r-- 1 root root 100225 Nov 4 19:46 kvm -r--r--r-- 1 root root 35401 Nov 4 19:46 kvm_intel -r--r--r-- 1 root root 120 Nov 4 19:46 pcspkr -r--r--r-- 1 root root 399 Nov 4 19:46 serio_raw -r--r--r-- 1 root root 4094095 Nov 4 19:46 vmlinux Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20201110011932.3201430-5-andrii@kernel.org --- Documentation/ABI/testing/sysfs-kernel-btf | 8 ++ include/linux/bpf.h | 2 + include/linux/module.h | 4 + kernel/bpf/btf.c | 194 +++++++++++++++++++++++++++++ kernel/bpf/sysfs_btf.c | 2 +- kernel/module.c | 32 +++++ 6 files changed, 241 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-kernel-btf b/Documentation/ABI/testing/sysfs-kernel-btf index 2c9744b2cd59..fe96efdc9b6c 100644 --- a/Documentation/ABI/testing/sysfs-kernel-btf +++ b/Documentation/ABI/testing/sysfs-kernel-btf @@ -15,3 +15,11 @@ Description: information with description of all internal kernel types. See Documentation/bpf/btf.rst for detailed description of format itself. + +What: /sys/kernel/btf/ +Date: Nov 2020 +KernelVersion: 5.11 +Contact: bpf@vger.kernel.org +Description: + Read-only binary attribute exposing kernel module's BTF type + information as an add-on to the kernel's BTF (/sys/kernel/btf/vmlinux). diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 73d5381a5d5c..581b2a2e78eb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -36,9 +36,11 @@ struct seq_operations; struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; +struct kobject; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; +extern struct kobject *btf_kobj; typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); diff --git a/include/linux/module.h b/include/linux/module.h index a29187f7c360..20fce258ffba 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -475,6 +475,10 @@ struct module { unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + unsigned int btf_data_size; + void *btf_data; +#endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 856585db7aa7..0f1fd2669d69 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include /* BTF (BPF Type Format) is the meta data format which describes @@ -4476,6 +4478,75 @@ errout: return ERR_PTR(err); } +static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL, *base_btf; + int err; + + base_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(base_btf)) + return base_btf; + if (!base_btf) + return ERR_PTR(-EINVAL); + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + log->level = BPF_LOG_KERNEL; + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + env->btf = btf; + + btf->base_btf = base_btf; + btf->start_id = base_btf->nr_types; + btf->start_str_off = base_btf->hdr.str_len; + btf->kernel_btf = true; + snprintf(btf->name, sizeof(btf->name), "%s", module_name); + + btf->data = kvmalloc(data_size, GFP_KERNEL | 
__GFP_NOWARN); + if (!btf->data) { + err = -ENOMEM; + goto errout; + } + memcpy(btf->data, data, data_size); + btf->data_size = data_size; + + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_check_all_metas(env); + if (err) + goto errout; + + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; + +errout: + btf_verifier_env_free(env); + if (btf) { + kvfree(btf->data); + kvfree(btf->types); + kfree(btf); + } + return ERR_PTR(err); +} + struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; @@ -5651,3 +5722,126 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +struct btf_module { + struct list_head list; + struct module *module; + struct btf *btf; + struct bin_attribute *sysfs_attr; +}; + +static LIST_HEAD(btf_modules); +static DEFINE_MUTEX(btf_module_mutex); + +static ssize_t +btf_module_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + const struct btf *btf = bin_attr->private; + + memcpy(buf, btf->data + off, len); + return len; +} + +static int btf_module_notify(struct notifier_block *nb, unsigned long op, + void *module) +{ + struct btf_module *btf_mod, *tmp; + struct module *mod = module; + struct btf *btf; + int err = 0; + + if (mod->btf_data_size == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + goto out; + + switch (op) { + case MODULE_STATE_COMING: + btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL); + if (!btf_mod) { + err = -ENOMEM; + goto out; + } + btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size); + if (IS_ERR(btf)) { + pr_warn("failed to validate module [%s] BTF: %ld\n", + mod->name, PTR_ERR(btf)); + kfree(btf_mod); + err = PTR_ERR(btf); + goto out; + } + err = btf_alloc_id(btf); + if (err) { + btf_free(btf); + kfree(btf_mod); + goto out; + } + + mutex_lock(&btf_module_mutex); + btf_mod->module = module; + btf_mod->btf = btf; + list_add(&btf_mod->list, &btf_modules); + mutex_unlock(&btf_module_mutex); + + if (IS_ENABLED(CONFIG_SYSFS)) { + struct bin_attribute *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + goto out; + + sysfs_bin_attr_init(attr); + attr->attr.name = btf->name; + attr->attr.mode = 0444; + attr->size = btf->data_size; + attr->private = btf; + attr->read = btf_module_read; + + err = sysfs_create_bin_file(btf_kobj, attr); + if (err) { + pr_warn("failed to register module [%s] BTF in sysfs: %d\n", + mod->name, err); + kfree(attr); + err = 0; + goto out; + } + + btf_mod->sysfs_attr = attr; + } + + break; + case MODULE_STATE_GOING: + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + list_del(&btf_mod->list); + if (btf_mod->sysfs_attr) + sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); + btf_put(btf_mod->btf); + kfree(btf_mod->sysfs_attr); + kfree(btf_mod); + break; + } + mutex_unlock(&btf_module_mutex); + break; + } +out: + return notifier_from_errno(err); +} + +static struct notifier_block btf_module_nb = { + .notifier_call = btf_module_notify, +}; + +static int __init btf_module_init(void) +{ + register_module_notifier(&btf_module_nb); + return 0; +} + +fs_initcall(btf_module_init); +#endif /* 
CONFIG_DEBUG_INFO_BTF_MODULES */ diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 11b3380887fa..ef6911aee3bb 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -26,7 +26,7 @@ static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .read = btf_vmlinux_read, }; -static struct kobject *btf_kobj; +struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { diff --git a/kernel/module.c b/kernel/module.c index a4fa44a652a7..f2996b02ab2e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -380,6 +380,35 @@ static void *section_objs(const struct load_info *info, return (void *)info->sechdrs[sec].sh_addr; } +/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */ +static unsigned int find_any_sec(const struct load_info *info, const char *name) +{ + unsigned int i; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (strcmp(info->secstrings + shdr->sh_name, name) == 0) + return i; + } + return 0; +} + +/* + * Find a module section, or NULL. Fill in number of "objects" in section. + * Ignores SHF_ALLOC flag. + */ +static __maybe_unused void *any_section_objs(const struct load_info *info, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_any_sec(info, name); + + /* Section 0 has sh_addr 0 and sh_size 0. */ + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; +} + /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; @@ -3250,6 +3279,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->bpf_raw_events), &mod->num_bpf_raw_events); #endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size); +#endif #ifdef CONFIG_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), -- cgit From 8a3c84b649b033024d2349f96234b26cbd6083a6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 10 Nov 2020 16:50:21 -0800 Subject: vfs: separate __sb_start_write into blocking and non-blocking helpers Break this function into two helpers so that it's obvious that the trylock versions return a value that must be checked, and the blocking versions don't require that. While we're at it, clean up the return type mismatch. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig --- fs/aio.c | 2 +- fs/io_uring.c | 3 +-- fs/super.c | 18 ++++++++++++------ include/linux/fs.h | 21 +++++++++++---------- 4 files changed, 25 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/aio.c b/fs/aio.c index c45c20d87538..6a21d8919409 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1572,7 +1572,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, * we return to userspace. */ if (S_ISREG(file_inode(file)->i_mode)) { - __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); + sb_start_write(file_inode(file)->i_sb); __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); } req->ki_flags |= IOCB_WRITE; diff --git a/fs/io_uring.c b/fs/io_uring.c index b42dfa0243bf..4cbaddfe3d80 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3532,8 +3532,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, * we return to userspace. 
*/ if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE, true); + sb_start_write(file_inode(req->file)->i_sb); __sb_writers_release(file_inode(req->file)->i_sb, SB_FREEZE_WRITE); } diff --git a/fs/super.c b/fs/super.c index e1fd667454d4..59aa59279133 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1645,16 +1645,22 @@ EXPORT_SYMBOL(__sb_end_write); * This is an internal function, please use sb_start_{write,pagefault,intwrite} * instead. */ -int __sb_start_write(struct super_block *sb, int level, bool wait) +void __sb_start_write(struct super_block *sb, int level) { - if (!wait) - return percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); - - percpu_down_read(sb->s_writers.rw_sem + level-1); - return 1; + percpu_down_read(sb->s_writers.rw_sem + level - 1); } EXPORT_SYMBOL(__sb_start_write); +/* + * This is an internal function, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +bool __sb_start_write_trylock(struct super_block *sb, int level) +{ + return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); +} +EXPORT_SYMBOL_GPL(__sb_start_write_trylock); + /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait diff --git a/include/linux/fs.h b/include/linux/fs.h index 0bd126418bb6..305989afd49c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1581,7 +1581,8 @@ extern struct timespec64 current_time(struct inode *inode); */ void __sb_end_write(struct super_block *sb, int level); -int __sb_start_write(struct super_block *sb, int level, bool wait); +void __sb_start_write(struct super_block *sb, int level); +bool __sb_start_write_trylock(struct super_block *sb, int level); #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) @@ -1645,12 +1646,12 @@ static inline void sb_end_intwrite(struct super_block *sb) */ static inline void sb_start_write(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_WRITE, true); + __sb_start_write(sb, SB_FREEZE_WRITE); } -static inline int sb_start_write_trylock(struct super_block *sb) +static inline bool sb_start_write_trylock(struct super_block *sb) { - return __sb_start_write(sb, SB_FREEZE_WRITE, false); + return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** @@ -1674,7 +1675,7 @@ static inline int sb_start_write_trylock(struct super_block *sb) */ static inline void sb_start_pagefault(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); + __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /* @@ -1692,12 +1693,12 @@ static inline void sb_start_pagefault(struct super_block *sb) */ static inline void sb_start_intwrite(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_FS, true); + __sb_start_write(sb, SB_FREEZE_FS); } -static inline int sb_start_intwrite_trylock(struct super_block *sb) +static inline bool sb_start_intwrite_trylock(struct super_block *sb) { - return __sb_start_write(sb, SB_FREEZE_FS, false); + return __sb_start_write_trylock(sb, SB_FREEZE_FS); } @@ -2756,14 +2757,14 @@ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; - __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); + sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; - return __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, false); + return sb_start_write_trylock(file_inode(file)->i_sb); 
} static inline void file_end_write(struct file *file) -- cgit From 9b8523423b23ee3dfd88e32f5b7207be56a4e782 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 10 Nov 2020 16:50:21 -0800 Subject: vfs: move __sb_{start,end}_write* to fs.h Now that we've straightened out the callers, move these three functions to fs.h since they're fairly trivial. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara --- fs/super.c | 30 ------------------------------ include/linux/fs.h | 21 ++++++++++++++++++--- 2 files changed, 18 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 59aa59279133..98bb0629ee10 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1631,36 +1631,6 @@ int super_setup_bdi(struct super_block *sb) } EXPORT_SYMBOL(super_setup_bdi); -/* - * This is an internal function, please use sb_end_{write,pagefault,intwrite} - * instead. - */ -void __sb_end_write(struct super_block *sb, int level) -{ - percpu_up_read(sb->s_writers.rw_sem + level-1); -} -EXPORT_SYMBOL(__sb_end_write); - -/* - * This is an internal function, please use sb_start_{write,pagefault,intwrite} - * instead. - */ -void __sb_start_write(struct super_block *sb, int level) -{ - percpu_down_read(sb->s_writers.rw_sem + level - 1); -} -EXPORT_SYMBOL(__sb_start_write); - -/* - * This is an internal function, please use sb_start_{write,pagefault,intwrite} - * instead. - */ -bool __sb_start_write_trylock(struct super_block *sb, int level) -{ - return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); -} -EXPORT_SYMBOL_GPL(__sb_start_write_trylock); - /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait diff --git a/include/linux/fs.h b/include/linux/fs.h index 305989afd49c..6dabd019cab0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1580,9 +1580,24 @@ extern struct timespec64 current_time(struct inode *inode); * Snapshotting support. */ -void __sb_end_write(struct super_block *sb, int level); -void __sb_start_write(struct super_block *sb, int level); -bool __sb_start_write_trylock(struct super_block *sb, int level); +/* + * These are internal functions, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +static inline void __sb_end_write(struct super_block *sb, int level) +{ + percpu_up_read(sb->s_writers.rw_sem + level-1); +} + +static inline void __sb_start_write(struct super_block *sb, int level) +{ + percpu_down_read(sb->s_writers.rw_sem + level - 1); +} + +static inline bool __sb_start_write_trylock(struct super_block *sb, int level) +{ + return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); +} #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) -- cgit From 60602cb549f1965a7edbc96026760dfb93911fab Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 08:19:24 -0400 Subject: fgraph: Make overruns 4 bytes in graph stack structure Inspecting the data structures of the function graph tracer, I found that the overrun value is an unsigned long, which is 8 bytes on a 64 bit machine, and not only that, the depth is an int (4 bytes). The overrun can simply be an unsigned int (4 bytes), which packs the ftrace_graph_ret structure better. The depth is moved up next to func, as it is used more often with func, and this improves cache locality.
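An editorial illustration, not part of the patch: assuming a 64-bit machine and the __packed attribute the structure actually carries, the reorder plus the narrower overrun shrink the record from 36 to 32 bytes. The _old/_new names below are illustrative only:

/* Old layout: 8 + 8 + 8 + 8 + 4 = 36 bytes when packed. */
struct ftrace_graph_ret_old {
	unsigned long func;
	unsigned long overrun;
	unsigned long long calltime;
	unsigned long long rettime;
	int depth;
} __attribute__((__packed__));

/*
 * New layout: 8 + 4 + 4 + 8 + 8 = 32 bytes; depth now sits right
 * next to func, with which it is most often read.
 */
struct ftrace_graph_ret_new {
	unsigned long func;
	int depth;
	unsigned int overrun;
	unsigned long long calltime;
	unsigned long long rettime;
} __attribute__((__packed__));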
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 ++-- kernel/trace/trace_entries.h | 4 ++-- kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 806196345c3f..8dde9c17aaa5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -864,11 +864,11 @@ struct ftrace_graph_ent { */ struct ftrace_graph_ret { unsigned long func; /* Current function */ + int depth; /* Number of functions that overran the depth limit for current task */ - unsigned long overrun; + unsigned int overrun; unsigned long long calltime; unsigned long long rettime; - int depth; } __packed; /* Type of the callback handlers for tracing function graph*/ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 18c4a58aff79..ceafe2dc97e1 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -93,10 +93,10 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) - __field_packed( unsigned long, ret, overrun ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) __field_packed( unsigned long long, ret, calltime) __field_packed( unsigned long long, ret, rettime ) - __field_packed( int, ret, depth ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 60d66278aa0d..d874dec87131 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -957,7 +957,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) - trace_seq_printf(s, " (Overruns: %lu)\n", + trace_seq_printf(s, " (Overruns: %u)\n", trace->overrun); print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, -- cgit From 7b68621f8d16689cbb4203aceaca86ffb165f1d0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 30 Oct 2020 17:21:00 -0400 Subject: ftrace: Clean up the recursion code a bit In trace_test_and_set_recursion(), current->trace_recursion is placed into a variable, and that variable should be used for the processing, as there's no reason to dereference current multiple times. On trace_clear_recursion(), current->trace_recursion is modified and there's no reason to copy it over to a variable. Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index 228cc56ed66e..a9f9c5714e65 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -162,7 +162,7 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, int start, int max) { - unsigned int val = current->trace_recursion; + unsigned int val = READ_ONCE(current->trace_recursion); int bit; /* A previous recursion check was made */ @@ -176,18 +176,15 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign * a switch between contexts. Allow for a single recursion. 
*/ bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) { + if (val & (1 << bit)) { do_ftrace_record_recursion(ip, pip); return -1; } - trace_recursion_set(bit); - barrier(); - return bit + 1; + } else { + /* Normal check passed, clear the transition to allow it again */ + val &= ~(1 << TRACE_TRANSITION_BIT); } - /* Normal check passed, clear the transition to allow it again */ - trace_recursion_clear(TRACE_TRANSITION_BIT); - val |= 1 << bit; current->trace_recursion = val; barrier(); @@ -197,17 +194,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign static __always_inline void trace_clear_recursion(int bit) { - unsigned int val = current->trace_recursion; - if (!bit) return; - bit--; - bit = 1 << bit; - val &= ~bit; - barrier(); - current->trace_recursion = val; + bit--; + trace_recursion_clear(bit); } /** -- cgit From 4210d50f0b3e423e10a7a254b2a67f5c5318868e Mon Sep 17 00:00:00 2001 From: Isaac Hazan Date: Thu, 24 Sep 2020 11:43:58 +0300 Subject: thunderbolt: Add link_speed and link_width to XDomain Link speed and link width are needed for checking expected values in case of using a loopback service. Signed-off-by: Isaac Hazan Signed-off-by: Mika Westerberg Acked-by: Yehezkel Bernat Reviewed-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-bus-thunderbolt | 28 +++++++++++ drivers/thunderbolt/switch.c | 9 +++- drivers/thunderbolt/tb.h | 1 + drivers/thunderbolt/xdomain.c | 65 +++++++++++++++++++++++++ include/linux/thunderbolt.h | 4 ++ 5 files changed, 106 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-thunderbolt b/Documentation/ABI/testing/sysfs-bus-thunderbolt index 0b4ab9e4b8f4..a91b4b24496e 100644 --- a/Documentation/ABI/testing/sysfs-bus-thunderbolt +++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt @@ -1,3 +1,31 @@ +What: /sys/bus/thunderbolt/devices//rx_speed +Date: Feb 2021 +KernelVersion: 5.11 +Contact: Isaac Hazan +Description: This attribute reports the XDomain RX speed per lane. + All RX lanes run at the same speed. + +What: /sys/bus/thunderbolt/devices//rx_lanes +Date: Feb 2021 +KernelVersion: 5.11 +Contact: Isaac Hazan +Description: This attribute reports the number of RX lanes the XDomain + is using simultaneously through its upstream port. + +What: /sys/bus/thunderbolt/devices//tx_speed +Date: Feb 2021 +KernelVersion: 5.11 +Contact: Isaac Hazan +Description: This attribute reports the XDomain TX speed per lane. + All TX lanes run at the same speed. + +What: /sys/bus/thunderbolt/devices//tx_lanes +Date: Feb 2021 +KernelVersion: 5.11 +Contact: Isaac Hazan +Description: This attribute reports number of TX lanes the XDomain + is using simultaneously through its upstream port. + What: /sys/bus/thunderbolt/devices/.../domainX/boot_acl Date: Jun 2018 KernelVersion: 4.17 diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index c73bbfe69ba1..05a360901790 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -932,7 +932,14 @@ int tb_port_get_link_speed(struct tb_port *port) return speed == LANE_ADP_CS_1_CURRENT_SPEED_GEN3 ? 20 : 10; } -static int tb_port_get_link_width(struct tb_port *port) +/** + * tb_port_get_link_width() - Get current link width + * @port: Port to check (USB4 or CIO) + * + * Returns link width. Return values can be 1 (Single-Lane), 2 (Dual-Lane) + * or negative errno in case of failure. 
+ */ +int tb_port_get_link_width(struct tb_port *port) { u32 val; int ret; diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index a9995e21b916..3a826315049e 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -862,6 +862,7 @@ struct tb_port *tb_next_port_on_path(struct tb_port *start, struct tb_port *end, (p) = tb_next_port_on_path((src), (dst), (p))) int tb_port_get_link_speed(struct tb_port *port); +int tb_port_get_link_width(struct tb_port *port); int tb_switch_find_vse_cap(struct tb_switch *sw, enum tb_switch_vse_cap vsec); int tb_switch_find_cap(struct tb_switch *sw, enum tb_switch_cap cap); diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c index da229ac4e471..83a315f96934 100644 --- a/drivers/thunderbolt/xdomain.c +++ b/drivers/thunderbolt/xdomain.c @@ -942,6 +942,43 @@ static void tb_xdomain_restore_paths(struct tb_xdomain *xd) } } +static inline struct tb_switch *tb_xdomain_parent(struct tb_xdomain *xd) +{ + return tb_to_switch(xd->dev.parent); +} + +static int tb_xdomain_update_link_attributes(struct tb_xdomain *xd) +{ + bool change = false; + struct tb_port *port; + int ret; + + port = tb_port_at(xd->route, tb_xdomain_parent(xd)); + + ret = tb_port_get_link_speed(port); + if (ret < 0) + return ret; + + if (xd->link_speed != ret) + change = true; + + xd->link_speed = ret; + + ret = tb_port_get_link_width(port); + if (ret < 0) + return ret; + + if (xd->link_width != ret) + change = true; + + xd->link_width = ret; + + if (change) + kobject_uevent(&xd->dev.kobj, KOBJ_CHANGE); + + return 0; +} + static void tb_xdomain_get_uuid(struct work_struct *work) { struct tb_xdomain *xd = container_of(work, typeof(*xd), @@ -1053,6 +1090,8 @@ static void tb_xdomain_get_properties(struct work_struct *work) xd->properties = dir; xd->property_block_gen = gen; + tb_xdomain_update_link_attributes(xd); + tb_xdomain_restore_paths(xd); mutex_unlock(&xd->lock); @@ -1159,9 +1198,35 @@ static ssize_t unique_id_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(unique_id); +static ssize_t speed_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + + return sprintf(buf, "%u.0 Gb/s\n", xd->link_speed); +} + +static DEVICE_ATTR(rx_speed, 0444, speed_show, NULL); +static DEVICE_ATTR(tx_speed, 0444, speed_show, NULL); + +static ssize_t lanes_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + + return sprintf(buf, "%u\n", xd->link_width); +} + +static DEVICE_ATTR(rx_lanes, 0444, lanes_show, NULL); +static DEVICE_ATTR(tx_lanes, 0444, lanes_show, NULL); + static struct attribute *xdomain_attrs[] = { &dev_attr_device.attr, &dev_attr_device_name.attr, + &dev_attr_rx_lanes.attr, + &dev_attr_rx_speed.attr, + &dev_attr_tx_lanes.attr, + &dev_attr_tx_speed.attr, &dev_attr_unique_id.attr, &dev_attr_vendor.attr, &dev_attr_vendor_name.attr, diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 5db2b11ab085..e441af88ed77 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -179,6 +179,8 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir); * @lock: Lock to serialize access to the following fields of this structure * @vendor_name: Name of the vendor (or %NULL if not known) * @device_name: Name of the device (or %NULL if not known) + * @link_speed: Speed of the link in Gb/s + * 
@link_width: Width of the link (1 or 2) * @is_unplugged: The XDomain is unplugged * @resume: The XDomain is being resumed * @needs_uuid: If the XDomain does not have @remote_uuid it will be @@ -223,6 +225,8 @@ struct tb_xdomain { struct mutex lock; const char *vendor_name; const char *device_name; + unsigned int link_speed; + unsigned int link_width; bool is_unplugged; bool resume; bool needs_uuid; -- cgit From 5cc0df9ce10a860aaeac53f8df1cc8754c5c7b03 Mon Sep 17 00:00:00 2001 From: Isaac Hazan Date: Thu, 24 Sep 2020 11:44:01 +0300 Subject: thunderbolt: Add functions for enabling and disabling lane bonding on XDomain These can be used by service drivers to enable and disable lane bonding as needed. Signed-off-by: Isaac Hazan Signed-off-by: Mika Westerberg Acked-by: Yehezkel Bernat Reviewed-by: Greg Kroah-Hartman --- drivers/thunderbolt/switch.c | 24 ++++++++++++++-- drivers/thunderbolt/tb.h | 3 ++ drivers/thunderbolt/xdomain.c | 66 +++++++++++++++++++++++++++++++++++++++++++ include/linux/thunderbolt.h | 2 ++ 4 files changed, 92 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 05a360901790..cdfd8cccfe19 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -503,12 +503,13 @@ static void tb_dump_port(struct tb *tb, struct tb_regs_port_header *port) /** * tb_port_state() - get connectedness state of a port + * @port: the port to check * * The port must have a TB_CAP_PHY (i.e. it should be a real port). * * Return: Returns an enum tb_port_state on success or an error code on failure. */ -static int tb_port_state(struct tb_port *port) +int tb_port_state(struct tb_port *port) { struct tb_cap_phy phy; int res; @@ -1008,7 +1009,16 @@ static int tb_port_set_link_width(struct tb_port *port, unsigned int width) port->cap_phy + LANE_ADP_CS_1, 1); } -static int tb_port_lane_bonding_enable(struct tb_port *port) +/** + * tb_port_lane_bonding_enable() - Enable bonding on port + * @port: port to enable + * + * Enable bonding by setting the link width of the port and the + * other port in case of dual link port. + * + * Return: %0 in case of success and negative errno in case of error + */ +int tb_port_lane_bonding_enable(struct tb_port *port) { int ret; @@ -1038,7 +1048,15 @@ static int tb_port_lane_bonding_enable(struct tb_port *port) return 0; } -static void tb_port_lane_bonding_disable(struct tb_port *port) +/** + * tb_port_lane_bonding_disable() - Disable bonding on port + * @port: port to disable + * + * Disable bonding by setting the link width of the port and the + * other port in case of dual link port. 
+ * + */ +void tb_port_lane_bonding_disable(struct tb_port *port) { port->dual_link_port->bonded = false; port->bonded = false; diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index 3a826315049e..e98d3561648d 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -863,6 +863,9 @@ struct tb_port *tb_next_port_on_path(struct tb_port *start, struct tb_port *end, int tb_port_get_link_speed(struct tb_port *port); int tb_port_get_link_width(struct tb_port *port); +int tb_port_state(struct tb_port *port); +int tb_port_lane_bonding_enable(struct tb_port *port); +void tb_port_lane_bonding_disable(struct tb_port *port); int tb_switch_find_vse_cap(struct tb_switch *sw, enum tb_switch_vse_cap vsec); int tb_switch_find_cap(struct tb_switch *sw, enum tb_switch_cap cap); diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c index 83a315f96934..65108216bfe3 100644 --- a/drivers/thunderbolt/xdomain.c +++ b/drivers/thunderbolt/xdomain.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -21,6 +22,7 @@ #define XDOMAIN_UUID_RETRIES 10 #define XDOMAIN_PROPERTIES_RETRIES 60 #define XDOMAIN_PROPERTIES_CHANGED_RETRIES 10 +#define XDOMAIN_BONDING_WAIT 100 /* ms */ struct xdomain_request_work { struct work_struct work; @@ -1443,6 +1445,70 @@ void tb_xdomain_remove(struct tb_xdomain *xd) device_unregister(&xd->dev); } +/** + * tb_xdomain_lane_bonding_enable() - Enable lane bonding on XDomain + * @xd: XDomain connection + * + * Lane bonding is disabled by default for XDomains. This function tries + * to enable bonding by first enabling the port and waiting for the CL0 + * state. + * + * Return: %0 in case of success and negative errno in case of error. + */ +int tb_xdomain_lane_bonding_enable(struct tb_xdomain *xd) +{ + struct tb_port *port; + int ret; + + port = tb_port_at(xd->route, tb_xdomain_parent(xd)); + if (!port->dual_link_port) + return -ENODEV; + + ret = tb_port_enable(port->dual_link_port); + if (ret) + return ret; + + ret = tb_wait_for_port(port->dual_link_port, true); + if (ret < 0) + return ret; + if (!ret) + return -ENOTCONN; + + ret = tb_port_lane_bonding_enable(port); + if (ret) { + tb_port_warn(port, "failed to enable lane bonding\n"); + return ret; + } + + tb_xdomain_update_link_attributes(xd); + + dev_dbg(&xd->dev, "lane bonding enabled\n"); + return 0; +} +EXPORT_SYMBOL_GPL(tb_xdomain_lane_bonding_enable); + +/** + * tb_xdomain_lane_bonding_disable() - Disable lane bonding + * @xd: XDomain connection + * + * Lane bonding is disabled by default for XDomains. If bonding has been + * enabled, this function can be used to disable it. 
+ */ +void tb_xdomain_lane_bonding_disable(struct tb_xdomain *xd) +{ + struct tb_port *port; + + port = tb_port_at(xd->route, tb_xdomain_parent(xd)); + if (port->dual_link_port) { + tb_port_lane_bonding_disable(port); + tb_port_disable(port->dual_link_port); + tb_xdomain_update_link_attributes(xd); + + dev_dbg(&xd->dev, "lane bonding disabled\n"); + } +} +EXPORT_SYMBOL_GPL(tb_xdomain_lane_bonding_disable); + /** * tb_xdomain_enable_paths() - Enable DMA paths for XDomain connection * @xd: XDomain connection diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index e441af88ed77..0a747f92847e 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -247,6 +247,8 @@ struct tb_xdomain { u8 depth; }; +int tb_xdomain_lane_bonding_enable(struct tb_xdomain *xd); +void tb_xdomain_lane_bonding_disable(struct tb_xdomain *xd); int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path, u16 transmit_ring, u16 receive_path, u16 receive_ring); -- cgit From 407ac931aefda91ac90498c6b6e6893982173613 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 7 Oct 2020 17:53:44 +0300 Subject: thunderbolt: Create debugfs directory automatically for services This allows service drivers to use it as parent directory if they need to add their own debugfs entries. Signed-off-by: Mika Westerberg Acked-by: Yehezkel Bernat Reviewed-by: Greg Kroah-Hartman --- drivers/thunderbolt/debugfs.c | 24 ++++++++++++++++++++++++ drivers/thunderbolt/tb.h | 4 ++++ drivers/thunderbolt/xdomain.c | 3 +++ include/linux/thunderbolt.h | 4 ++++ 4 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/drivers/thunderbolt/debugfs.c b/drivers/thunderbolt/debugfs.c index 3680b2784ea1..e53ca8270acd 100644 --- a/drivers/thunderbolt/debugfs.c +++ b/drivers/thunderbolt/debugfs.c @@ -690,6 +690,30 @@ void tb_switch_debugfs_remove(struct tb_switch *sw) debugfs_remove_recursive(sw->debugfs_dir); } +/** + * tb_service_debugfs_init() - Add debugfs directory for service + * @svc: Thunderbolt service pointer + * + * Adds debugfs directory for service. + */ +void tb_service_debugfs_init(struct tb_service *svc) +{ + svc->debugfs_dir = debugfs_create_dir(dev_name(&svc->dev), + tb_debugfs_root); +} + +/** + * tb_service_debugfs_remove() - Remove service debugfs directory + * @svc: Thunderbolt service pointer + * + * Removes the previously created debugfs directory for @svc. 
+ */
+void tb_service_debugfs_remove(struct tb_service *svc)
+{
+	debugfs_remove_recursive(svc->debugfs_dir);
+	svc->debugfs_dir = NULL;
+}
+
 void tb_debugfs_init(void)
 {
 	tb_debugfs_root = debugfs_create_dir("thunderbolt", NULL);
diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h
index e98d3561648d..a21000649009 100644
--- a/drivers/thunderbolt/tb.h
+++ b/drivers/thunderbolt/tb.h
@@ -1027,11 +1027,15 @@ void tb_debugfs_init(void);
 void tb_debugfs_exit(void);
 void tb_switch_debugfs_init(struct tb_switch *sw);
 void tb_switch_debugfs_remove(struct tb_switch *sw);
+void tb_service_debugfs_init(struct tb_service *svc);
+void tb_service_debugfs_remove(struct tb_service *svc);
 #else
 static inline void tb_debugfs_init(void) { }
 static inline void tb_debugfs_exit(void) { }
 static inline void tb_switch_debugfs_init(struct tb_switch *sw) { }
 static inline void tb_switch_debugfs_remove(struct tb_switch *sw) { }
+static inline void tb_service_debugfs_init(struct tb_service *svc) { }
+static inline void tb_service_debugfs_remove(struct tb_service *svc) { }
 #endif

 #ifdef CONFIG_USB4_KUNIT_TEST
diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c
index 65108216bfe3..1a0491b461fd 100644
--- a/drivers/thunderbolt/xdomain.c
+++ b/drivers/thunderbolt/xdomain.c
@@ -779,6 +779,7 @@ static void tb_service_release(struct device *dev)
 	struct tb_service *svc = container_of(dev, struct tb_service, dev);
 	struct tb_xdomain *xd = tb_service_parent(svc);

+	tb_service_debugfs_remove(svc);
 	ida_simple_remove(&xd->service_ids, svc->id);
 	kfree(svc->key);
 	kfree(svc);
@@ -892,6 +893,8 @@ static void enumerate_services(struct tb_xdomain *xd)
 		svc->dev.parent = &xd->dev;
 		dev_set_name(&svc->dev, "%s.%d", dev_name(&xd->dev), svc->id);

+		tb_service_debugfs_init(svc);
+
 		if (device_register(&svc->dev)) {
 			put_device(&svc->dev);
 			break;
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 0a747f92847e..a844fd5d96ab 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -350,6 +350,9 @@ void tb_unregister_protocol_handler(struct tb_protocol_handler *handler);
 * @prtcvers: Protocol version from the properties directory
 * @prtcrevs: Protocol software revision from the properties directory
 * @prtcstns: Protocol settings mask from the properties directory
+ * @debugfs_dir: Pointer to the service debugfs directory. Always created
+ *		 when debugfs is enabled. Can be used by service drivers to
+ *		 add their own entries under the service.
 *
 * Each domain exposes set of services it supports as collection of
 * properties. For each service there will be one corresponding
@@ -363,6 +366,7 @@ struct tb_service {
 	u32 prtcvers;
 	u32 prtcrevs;
 	u32 prtcstns;
+	struct dentry *debugfs_dir;
 };

 static inline struct tb_service *tb_service_get(struct tb_service *svc)
-- cgit

From afe704a2d0618ebdb559b5ddb059f6cdbfc78783 Mon Sep 17 00:00:00 2001
From: Mika Westerberg
Date: Mon, 19 Oct 2020 19:15:20 +0300
Subject: thunderbolt: Add support for end-to-end flow control

The USB4 spec defines end-to-end (E2E) flow control that can be used
between hosts to prevent overflow of an RX ring. We previously had this
partially implemented, but that code was removed with commit 53f13319d131
("thunderbolt: Get rid of E2E workaround"), with the idea that we would
add it back properly if the need ever arose. Now that we are going to add
a DMA traffic test driver (in subsequent patches) this becomes useful.
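As a rough sketch of what this enables (using the reworked ring allocation API described next; the ring size, PDF masks, and error handling here are illustrative only and not part of this patch), a driver could allocate an E2E-coupled ring pair along these lines:

	struct tb_ring *tx, *rx;

	tx = tb_ring_alloc_tx(nhi, -1, 64, RING_FLAG_E2E);
	if (!tx)
		return -ENOMEM;

	/* the RX ring sends credit grant packets to the TX ring's HopID */
	rx = tb_ring_alloc_rx(nhi, -1, 64, RING_FLAG_E2E, tx->hop,
			      0xffff, 0xffff, NULL, NULL);
	if (!rx) {
		tb_ring_free(tx);
		return -ENOMEM;
	}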
For this reason we modify tb_ring_alloc_rx/tx() so that they accept RING_FLAG_E2E and configure the hardware ring accordingly. The RX side also requires passing TX HopID (e2e_tx_hop) used in the credit grant packets. Signed-off-by: Mika Westerberg Acked-by: Yehezkel Bernat Reviewed-by: Greg Kroah-Hartman --- drivers/net/thunderbolt.c | 2 +- drivers/thunderbolt/ctl.c | 4 ++-- drivers/thunderbolt/nhi.c | 36 ++++++++++++++++++++++++++++++++---- include/linux/thunderbolt.h | 8 +++++++- 4 files changed, 42 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c index 3160443ef3b9..d7b5f87eaa15 100644 --- a/drivers/net/thunderbolt.c +++ b/drivers/net/thunderbolt.c @@ -866,7 +866,7 @@ static int tbnet_open(struct net_device *dev) eof_mask = BIT(TBIP_PDF_FRAME_END); ring = tb_ring_alloc_rx(xd->tb->nhi, -1, TBNET_RING_SIZE, - RING_FLAG_FRAME, sof_mask, eof_mask, + RING_FLAG_FRAME, 0, sof_mask, eof_mask, tbnet_start_poll, net); if (!ring) { netdev_err(dev, "failed to allocate Rx ring\n"); diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index 9894b8f63064..1d86e27a0ef3 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -628,8 +628,8 @@ struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data) if (!ctl->tx) goto err; - ctl->rx = tb_ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, - 0xffff, NULL, NULL); + ctl->rx = tb_ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0, 0xffff, + 0xffff, NULL, NULL); if (!ctl->rx) goto err; diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 3f79baa54829..a69bc6b49405 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -483,7 +483,7 @@ err_unlock: static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, bool transmit, unsigned int flags, - u16 sof_mask, u16 eof_mask, + int e2e_tx_hop, u16 sof_mask, u16 eof_mask, void (*start_poll)(void *), void *poll_data) { @@ -506,6 +506,7 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, ring->is_tx = transmit; ring->size = size; ring->flags = flags; + ring->e2e_tx_hop = e2e_tx_hop; ring->sof_mask = sof_mask; ring->eof_mask = eof_mask; ring->head = 0; @@ -550,7 +551,7 @@ err_free_ring: struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags) { - return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0, NULL, NULL); + return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0, 0, NULL, NULL); } EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); @@ -560,6 +561,7 @@ EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); * @hop: HopID (ring) to allocate. Pass %-1 for automatic allocation. 
* @size: Number of entries in the ring * @flags: Flags for the ring + * @e2e_tx_hop: Transmit HopID when E2E is enabled in @flags * @sof_mask: Mask of PDF values that start a frame * @eof_mask: Mask of PDF values that end a frame * @start_poll: If not %NULL the ring will call this function when an @@ -568,10 +570,11 @@ EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); * @poll_data: Optional data passed to @start_poll */ struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask, + unsigned int flags, int e2e_tx_hop, + u16 sof_mask, u16 eof_mask, void (*start_poll)(void *), void *poll_data) { - return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask, + return tb_ring_alloc(nhi, hop, size, false, flags, e2e_tx_hop, sof_mask, eof_mask, start_poll, poll_data); } EXPORT_SYMBOL_GPL(tb_ring_alloc_rx); @@ -618,6 +621,31 @@ void tb_ring_start(struct tb_ring *ring) ring_iowrite32options(ring, sof_eof_mask, 4); ring_iowrite32options(ring, flags, 0); } + + /* + * Now that the ring valid bit is set we can configure E2E if + * enabled for the ring. + */ + if (ring->flags & RING_FLAG_E2E) { + if (!ring->is_tx) { + u32 hop; + + hop = ring->e2e_tx_hop << REG_RX_OPTIONS_E2E_HOP_SHIFT; + hop &= REG_RX_OPTIONS_E2E_HOP_MASK; + flags |= hop; + + dev_dbg(&ring->nhi->pdev->dev, + "enabling E2E for %s %d with TX HopID %d\n", + RING_TYPE(ring), ring->hop, ring->e2e_tx_hop); + } else { + dev_dbg(&ring->nhi->pdev->dev, "enabling E2E for %s %d\n", + RING_TYPE(ring), ring->hop); + } + + flags |= RING_FLAG_E2E_FLOW_CONTROL; + ring_iowrite32options(ring, flags, 0); + } + ring_interrupt_active(ring, true); ring->running = true; err: diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index a844fd5d96ab..034dccf93955 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -481,6 +481,8 @@ struct tb_nhi { * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise. * @vector: MSI-X vector number the ring uses (only set if @irq is > 0) * @flags: Ring specific flags + * @e2e_tx_hop: Transmit HopID when E2E is enabled. Only applicable to + * RX ring. For TX ring this should be set to %0. * @sof_mask: Bit mask used to detect start of frame PDF * @eof_mask: Bit mask used to detect end of frame PDF * @start_poll: Called when ring interrupt is triggered to start @@ -504,6 +506,7 @@ struct tb_ring { int irq; u8 vector; unsigned int flags; + int e2e_tx_hop; u16 sof_mask; u16 eof_mask; void (*start_poll)(void *data); @@ -514,6 +517,8 @@ struct tb_ring { #define RING_FLAG_NO_SUSPEND BIT(0) /* Configure the ring to be in frame mode */ #define RING_FLAG_FRAME BIT(1) +/* Enable end-to-end flow control */ +#define RING_FLAG_E2E BIT(2) struct ring_frame; typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled); @@ -562,7 +567,8 @@ struct ring_frame { struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags); struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask, + unsigned int flags, int e2e_tx_hop, + u16 sof_mask, u16 eof_mask, void (*start_poll)(void *), void *poll_data); void tb_ring_start(struct tb_ring *ring); void tb_ring_stop(struct tb_ring *ring); -- cgit From 029b42d8519cef70c4fb5fcaccd08f1053ed2bf0 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 27 Oct 2020 10:57:23 +0100 Subject: spi: introduce SPI_MODE_X_MASK macro Provide a macro to filter all SPI_MODE_0,1,2,3 mode in one run. 
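Since SPI_MODE_X_MASK is simply (SPI_CPOL|SPI_CPHA), clearing it drops all four SPI_MODE_0..SPI_MODE_3 settings in a single operation. A minimal sketch of the intended use, with a hypothetical driver that only supports mode 0:

	static int foo_spi_probe(struct spi_device *spi)
	{
		/* drop whatever CPOL/CPHA the core parsed from the devicetree */
		spi->mode &= ~SPI_MODE_X_MASK;
		spi->mode |= SPI_MODE_0;
		return spi_setup(spi);
	}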
The SPI core now parses the devicetree in the following call sequence:

  of_register_spi_device() -> of_spi_parse_dt()

So a driver does not need to parse the devicetree itself and gets the
prepared mode flags in its probe function. On the one hand this is good
for most drivers. On the other hand, some drivers need to filter the flags
provided by the SPI framework and apply flags that are known to work.
These drivers may use SPI_MODE_X_MASK to filter out the MODE flags and set
their own, known-good flags:

  spi->mode &= ~SPI_MODE_X_MASK;
  spi->mode |= SPI_MODE_0;

Signed-off-by: Oleksij Rempel
Link: https://lore.kernel.org/r/20201027095724.18654-2-o.rempel@pengutronix.de
Signed-off-by: Mark Brown
---
 include/linux/spi/spi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 99380c0825db..8097f27702f3 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -171,6 +171,7 @@ struct spi_device {
 #define	SPI_MODE_1	(0|SPI_CPHA)
 #define	SPI_MODE_2	(SPI_CPOL|0)
 #define	SPI_MODE_3	(SPI_CPOL|SPI_CPHA)
+#define	SPI_MODE_X_MASK	(SPI_CPOL|SPI_CPHA)
 #define	SPI_CS_HIGH	0x04		/* chipselect active high? */
 #define	SPI_LSB_FIRST	0x08		/* per-word bits-on-wire */
 #define	SPI_3WIRE	0x10		/* SI/SO signals shared */
-- cgit

From 476b485be03c37212fcf10d85510aae5d34316ef Mon Sep 17 00:00:00 2001
From: Jianxin Xiong
Date: Tue, 10 Nov 2020 13:41:17 -0800
Subject: dma-buf: Document that dma-buf size is fixed

The fact that the size of a dma-buf is invariant over the lifetime of the
buffer is mentioned in the comment of 'dma_buf_ops.mmap', but is not
documented where the field itself is defined. Add the missing
documentation.

Signed-off-by: Jianxin Xiong
Signed-off-by: Daniel Vetter
Link: https://patchwork.freedesktop.org/patch/msgid/1605044477-51833-7-git-send-email-jianxin.xiong@intel.com
---
 include/linux/dma-buf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 03875eaed51a..cf72699cb2bc 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -273,7 +273,7 @@ struct dma_buf_ops {
 /**
 * struct dma_buf - shared buffer object
- * @size: size of the buffer
+ * @size: size of the buffer; invariant over the lifetime of the buffer.
 * @file: file pointer used for sharing buffers across, and for refcounting.
 * @attachments: list of dma_buf_attachment that denotes all devices attached,
 *               protected by dma_resv lock.
@@ -405,7 +405,7 @@ struct dma_buf_attachment {
 * @exp_name: name of the exporter - useful for debugging.
 * @owner: pointer to exporter module - used for refcounting kernel module
 * @ops: Attach allocator-defined dma buf ops to the new buffer
- * @size: Size of the buffer
+ * @size: Size of the buffer - invariant over the lifetime of the buffer
 * @flags: mode flags for the file
 * @resv: reservation-object, NULL to allocate default one
 * @priv: Attach private data of allocator to this buffer
-- cgit

From c0c5a60f0f1311bcf08bbe735122096d6326fb5b Mon Sep 17 00:00:00 2001
From: Vincent Bernat
Date: Sat, 7 Nov 2020 20:35:13 +0100
Subject: net: evaluate net.ipvX.conf.all.ignore_routes_with_linkdown

Introduced in 0eeb075fad73, the "ignore_routes_with_linkdown" sysctl
ignores a route whose interface is down. It is provided as a
per-interface sysctl. However, while an "all" variant is exposed, it was
a noop since it was never evaluated. We use the usual "or" logic for this
kind of sysctl.
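That is, the effective per-device value becomes the logical OR of the global setting and the per-interface setting. A sketch of the semantics (the exact macro expansion lives in inetdevice.h):

	/* effective value consulted during the FIB lookup */
	bool ignore = IN_DEV_ORCONF(in_dev, IGNORE_ROUTES_WITH_LINKDOWN);
	/* i.e. conf.all.ignore_routes_with_linkdown ||
	 *      conf.<iface>.ignore_routes_with_linkdown
	 */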
Tested with:

    ip link add type veth # veth0 + veth1
    ip link add type veth # veth2 + veth3
    ip link set up dev veth0
    ip link set up dev veth1 # link-status paired with veth0
    ip link set up dev veth2
    ip link set up dev veth3 # link-status paired with veth2

    # First available path
    ip -4 addr add 203.0.113.${uts#H}/24 dev veth0
    ip -6 addr add 2001:db8:1::${uts#H}/64 dev veth0

    # Second available path
    ip -4 addr add 192.0.2.${uts#H}/24 dev veth2
    ip -6 addr add 2001:db8:2::${uts#H}/64 dev veth2

    # More specific route through first path
    ip -4 route add 198.51.100.0/25 via 203.0.113.254 # via veth0
    ip -6 route add 2001:db8:3::/56 via 2001:db8:1::ff # via veth0

    # Less specific route through second path
    ip -4 route add 198.51.100.0/24 via 192.0.2.254 # via veth2
    ip -6 route add 2001:db8:3::/48 via 2001:db8:2::ff # via veth2

    # H1: enable on "all"; H2: enable on "veth0"; H3: control, both unset
    for v in ipv4 ipv6; do
      case $uts in
        H1) sysctl -qw net.${v}.conf.all.ignore_routes_with_linkdown=1 ;;
        H2) sysctl -qw net.${v}.conf.veth0.ignore_routes_with_linkdown=1 ;;
      esac
    done

    set -xe
    # When veth0 is up, best route is through veth0
    ip -o route get 198.51.100.1 | grep -Fw veth0
    ip -o route get 2001:db8:3::1 | grep -Fw veth0

    # When veth0 is down, best route should be through veth2 on H1/H2,
    # but through veth0 on H3 (where the sysctl is unset)
    ip link set down dev veth1 # down veth0
    ip route show
    [ $uts != H3 ] || ip -o route get 198.51.100.1 | grep -Fw veth0
    [ $uts != H3 ] || ip -o route get 2001:db8:3::1 | grep -Fw veth0
    [ $uts = H3 ] || ip -o route get 198.51.100.1 | grep -Fw veth2
    [ $uts = H3 ] || ip -o route get 2001:db8:3::1 | grep -Fw veth2

Without this patch, the last two lines would fail on H1 (the one using
the "all" sysctl). With the patch, everything succeeds as expected.

Also document the sysctl in `ip-sysctl.rst`.

Fixes: 0eeb075fad73 ("net: ipv4 sysctl option to ignore routes when nexthop link is down")
Signed-off-by: Vincent Bernat
Signed-off-by: Jakub Kicinski
---
 Documentation/networking/ip-sysctl.rst | 3 +++
 include/linux/inetdevice.h             | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 2aaf40b2d2cd..dd2b12a32b73 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1554,6 +1554,9 @@ igmpv3_unsolicited_report_interval - INTEGER

	Default: 1000 (1 seconds)

+ignore_routes_with_linkdown - BOOLEAN
+	Ignore routes whose link is down when performing a FIB lookup.
+
 promote_secondaries - BOOLEAN
	When a primary IP address is removed from this interface
	promote a corresponding secondary IP address instead of
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 3515ca64e638..3bbcddd22df8 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -126,7 +126,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
					IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS)))
 #define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \
-	IN_DEV_CONF_GET((in_dev), IGNORE_ROUTES_WITH_LINKDOWN)
+	IN_DEV_ORCONF((in_dev), IGNORE_ROUTES_WITH_LINKDOWN)
 #define IN_DEV_ARPFILTER(in_dev)	IN_DEV_ORCONF((in_dev), ARPFILTER)
 #define IN_DEV_ARP_ACCEPT(in_dev)	IN_DEV_ORCONF((in_dev), ARP_ACCEPT)
-- cgit

From 1af5318c00a8acc33a90537af49b3f23f72a2c4b Mon Sep 17 00:00:00 2001
From: Vincent Bernat
Date: Sat, 7 Nov 2020 20:35:14 +0100
Subject: net: evaluate net.ipv4.conf.all.proxy_arp_pvlan

Introduced in 65324144b50b, the "proxy_arp_pvlan" sysctl is a
per-interface sysctl to tune proxy ARP support for private VLANs. While
the "all" variant is exposed, it was a noop and never evaluated. We use
the usual "or" logic for this kind of sysctl.

Fixes: 65324144b50b ("net: RFC3069, private VLAN proxy arp support")
Signed-off-by: Vincent Bernat
Signed-off-by: Jakub Kicinski
---
 include/linux/inetdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 3bbcddd22df8..53aa0343bf69 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -105,7 +105,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 #define IN_DEV_LOG_MARTIANS(in_dev)	IN_DEV_ORCONF((in_dev), LOG_MARTIANS)
 #define IN_DEV_PROXY_ARP(in_dev)	IN_DEV_ORCONF((in_dev), PROXY_ARP)
-#define IN_DEV_PROXY_ARP_PVLAN(in_dev)	IN_DEV_CONF_GET(in_dev, PROXY_ARP_PVLAN)
+#define IN_DEV_PROXY_ARP_PVLAN(in_dev)	IN_DEV_ORCONF((in_dev), PROXY_ARP_PVLAN)
 #define IN_DEV_SHARED_MEDIA(in_dev)	IN_DEV_ORCONF((in_dev), SHARED_MEDIA)
 #define IN_DEV_TX_REDIRECTS(in_dev)	IN_DEV_ORCONF((in_dev), SEND_REDIRECTS)
 #define IN_DEV_SEC_REDIRECTS(in_dev)	IN_DEV_ORCONF((in_dev), \
-- cgit

From 1f78ae99790812481b7944cf40008aed5fd2ef18 Mon Sep 17 00:00:00 2001
From: Fabio Estevam
Date: Tue, 10 Nov 2020 18:48:40 -0300
Subject: serial: imx: Remove unused platform data support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since 5.10-rc1 i.MX is a devicetree-only platform, and the existing
platform data support in this driver was only useful for old
non-devicetree platforms. Get rid of the platform data support since it
is no longer used.
Reviewed-by: Fugang Duan Acked-by: Uwe Kleine-König Signed-off-by: Fabio Estevam Link: https://lore.kernel.org/r/20201110214840.16768-1-festevam@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/imx.c | 32 +++----------------------------- include/linux/platform_data/serial-imx.h | 15 --------------- 2 files changed, 3 insertions(+), 44 deletions(-) delete mode 100644 include/linux/platform_data/serial-imx.h (limited to 'include/linux') diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 1731d9728865..7ce38ade9a8e 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -30,7 +30,6 @@ #include #include -#include #include #include "serial_mctrl_gpio.h" @@ -2191,10 +2190,9 @@ static struct uart_driver imx_uart_uart_driver = { .cons = IMX_CONSOLE, }; -#ifdef CONFIG_OF /* - * This function returns 1 iff pdev isn't a device instatiated by dt, 0 iff it - * could successfully get all information from dt or a negative errno. + * This function returns 0 iff it could successfully get all information + * from dt or a negative errno. */ static int imx_uart_probe_dt(struct imx_port *sport, struct platform_device *pdev) @@ -2232,28 +2230,6 @@ static int imx_uart_probe_dt(struct imx_port *sport, return 0; } -#else -static inline int imx_uart_probe_dt(struct imx_port *sport, - struct platform_device *pdev) -{ - return 1; -} -#endif - -static void imx_uart_probe_pdata(struct imx_port *sport, - struct platform_device *pdev) -{ - struct imxuart_platform_data *pdata = dev_get_platdata(&pdev->dev); - - sport->port.line = pdev->id; - sport->devdata = (struct imx_uart_data *) pdev->id_entry->driver_data; - - if (!pdata) - return; - - if (pdata->flags & IMXUART_HAVE_RTSCTS) - sport->have_rtscts = 1; -} static enum hrtimer_restart imx_trigger_start_tx(struct hrtimer *t) { @@ -2295,9 +2271,7 @@ static int imx_uart_probe(struct platform_device *pdev) return -ENOMEM; ret = imx_uart_probe_dt(sport, pdev); - if (ret > 0) - imx_uart_probe_pdata(sport, pdev); - else if (ret < 0) + if (ret < 0) return ret; if (sport->port.line >= ARRAY_SIZE(imx_uart_ports)) { diff --git a/include/linux/platform_data/serial-imx.h b/include/linux/platform_data/serial-imx.h deleted file mode 100644 index 0844b21372c7..000000000000 --- a/include/linux/platform_data/serial-imx.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2008 by Sascha Hauer - */ - -#ifndef ASMARM_ARCH_UART_H -#define ASMARM_ARCH_UART_H - -#define IMXUART_HAVE_RTSCTS (1<<0) - -struct imxuart_platform_data { - unsigned int flags; -}; - -#endif -- cgit From 5e844cc37a5cbaa460e68f9a989d321d63088a89 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 11 Nov 2020 20:07:10 +0100 Subject: spi: Introduce device-managed SPI controller allocation SPI driver probing currently comprises two steps, whereas removal comprises only one step: spi_alloc_master() spi_register_controller() spi_unregister_controller() That's because spi_unregister_controller() calls device_unregister() instead of device_del(), thereby releasing the reference on the spi_controller which was obtained by spi_alloc_master(). An SPI driver's private data is contained in the same memory allocation as the spi_controller struct. Thus, once spi_unregister_controller() has been called, the private data is inaccessible. But some drivers need to access it after spi_unregister_controller() to perform further teardown steps. 
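With the devm_ variants introduced below, a typical probe path becomes the following sketch (the driver and its private struct are hypothetical):

	static int foo_spi_probe(struct platform_device *pdev)
	{
		struct spi_controller *ctlr;

		/* reference released automatically after the driver unbinds */
		ctlr = devm_spi_alloc_master(&pdev->dev, sizeof(struct foo_priv));
		if (!ctlr)
			return -ENOMEM;

		/* ... configure ctlr ... */

		return devm_spi_register_controller(&pdev->dev, ctlr);
	}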
Introduce devm_spi_alloc_master() and devm_spi_alloc_slave(), which release a reference on the spi_controller struct only after the driver has unbound, thereby keeping the memory allocation accessible. Change spi_unregister_controller() to not release a reference if the spi_controller was allocated by one of these new devm functions. The present commit is small enough to be backportable to stable. It allows fixing drivers which use the private data in their ->remove() hook after it's been freed. It also allows fixing drivers which neglect to release a reference on the spi_controller in the probe error path. Long-term, most SPI drivers shall be moved over to the devm functions introduced herein. The few that can't shall be changed in a treewide commit to explicitly release the last reference on the controller. That commit shall amend spi_unregister_controller() to no longer release a reference, thereby completing the migration. As a result, the behaviour will be less surprising and more consistent with subsystems such as IIO, which also includes the private data in the allocation of the generic iio_dev struct, but calls device_del() in iio_device_unregister(). Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/272bae2ef08abd21388c98e23729886663d19192.1605121038.git.lukas@wunner.de Signed-off-by: Mark Brown --- drivers/spi/spi.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/spi/spi.h | 19 ++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 7566482c052c..05c75f890ace 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2442,6 +2442,49 @@ struct spi_controller *__spi_alloc_controller(struct device *dev, } EXPORT_SYMBOL_GPL(__spi_alloc_controller); +static void devm_spi_release_controller(struct device *dev, void *ctlr) +{ + spi_controller_put(*(struct spi_controller **)ctlr); +} + +/** + * __devm_spi_alloc_controller - resource-managed __spi_alloc_controller() + * @dev: physical device of SPI controller + * @size: how much zeroed driver-private data to allocate + * @slave: whether to allocate an SPI master (false) or SPI slave (true) + * Context: can sleep + * + * Allocate an SPI controller and automatically release a reference on it + * when @dev is unbound from its driver. Drivers are thus relieved from + * having to call spi_controller_put(). + * + * The arguments to this function are identical to __spi_alloc_controller(). + * + * Return: the SPI controller structure on success, else NULL. 
+ */ +struct spi_controller *__devm_spi_alloc_controller(struct device *dev, + unsigned int size, + bool slave) +{ + struct spi_controller **ptr, *ctlr; + + ptr = devres_alloc(devm_spi_release_controller, sizeof(*ptr), + GFP_KERNEL); + if (!ptr) + return NULL; + + ctlr = __spi_alloc_controller(dev, size, slave); + if (ctlr) { + *ptr = ctlr; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return ctlr; +} +EXPORT_SYMBOL_GPL(__devm_spi_alloc_controller); + #ifdef CONFIG_OF static int of_spi_get_gpio_numbers(struct spi_controller *ctlr) { @@ -2778,6 +2821,11 @@ int devm_spi_register_controller(struct device *dev, } EXPORT_SYMBOL_GPL(devm_spi_register_controller); +static int devm_spi_match_controller(struct device *dev, void *res, void *ctlr) +{ + return *(struct spi_controller **)res == ctlr; +} + static int __unregister(struct device *dev, void *null) { spi_unregister_device(to_spi_device(dev)); @@ -2819,7 +2867,15 @@ void spi_unregister_controller(struct spi_controller *ctlr) list_del(&ctlr->list); mutex_unlock(&board_lock); - device_unregister(&ctlr->dev); + device_del(&ctlr->dev); + + /* Release the last reference on the controller if its driver + * has not yet been converted to devm_spi_alloc_master/slave(). + */ + if (!devres_find(ctlr->dev.parent, devm_spi_release_controller, + devm_spi_match_controller, ctlr)) + put_device(&ctlr->dev); + /* free bus id */ mutex_lock(&board_lock); if (found == ctlr) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 99380c0825db..b390fdac1587 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -734,6 +734,25 @@ static inline struct spi_controller *spi_alloc_slave(struct device *host, return __spi_alloc_controller(host, size, true); } +struct spi_controller *__devm_spi_alloc_controller(struct device *dev, + unsigned int size, + bool slave); + +static inline struct spi_controller *devm_spi_alloc_master(struct device *dev, + unsigned int size) +{ + return __devm_spi_alloc_controller(dev, size, false); +} + +static inline struct spi_controller *devm_spi_alloc_slave(struct device *dev, + unsigned int size) +{ + if (!IS_ENABLED(CONFIG_SPI_SLAVE)) + return NULL; + + return __devm_spi_alloc_controller(dev, size, true); +} + extern int spi_register_controller(struct spi_controller *ctlr); extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); -- cgit From cd2c40ff90b0e385c18f881ab5e17f7137864223 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 29 Oct 2020 15:27:36 -0700 Subject: platform/chrome: cros_ec: Import Type C host commands Import the EC_CMD_TYPEC_STATUS and EC_CMD_TYPEC_DISCOVERY Chrome OS EC host commands from the EC code base [1]. These commands can be used by the application processor to query Power Delivery (PD) discovery information concerning connected Type C peripherals. Also add the EC_FEATURE_TYPEC_CMD feature flag, which is used to determine whether these commands are supported by the EC. 
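As a sketch of the intended use (not part of this patch), a kernel consumer could query port status roughly as follows, assuming the existing cros_ec_cmd_xfer_status() helper and an ec_dev pointer obtained elsewhere:

	struct ec_params_typec_status params = { .port = 0 };
	struct ec_response_typec_status resp;
	struct cros_ec_command *msg;
	int ret;

	msg = kzalloc(sizeof(*msg) + max(sizeof(params), sizeof(resp)),
		      GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	msg->command = EC_CMD_TYPEC_STATUS;
	msg->outsize = sizeof(params);
	msg->insize = sizeof(resp);
	memcpy(msg->data, &params, sizeof(params));

	ret = cros_ec_cmd_xfer_status(ec_dev, msg);
	if (ret >= 0)
		memcpy(&resp, msg->data, sizeof(resp));
	kfree(msg);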
[1]: https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/master/include/ec_commands.h Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201029222738.482366-5-pmalani@chromium.org --- include/linux/platform_data/cros_ec_commands.h | 155 +++++++++++++++++++++++++ 1 file changed, 155 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 1fcfe9e63cb9..7f54fdcdd8cb 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -1284,6 +1284,8 @@ enum ec_feature_code { EC_FEATURE_SCP = 39, /* The MCU is an Integrated Sensor Hub */ EC_FEATURE_ISH = 40, + /* New TCPMv2 TYPEC_ prefaced commands supported */ + EC_FEATURE_TYPEC_CMD = 41, }; #define EC_FEATURE_MASK_0(event_code) BIT(event_code % 32) @@ -5528,6 +5530,159 @@ struct ec_response_regulator_get_voltage { uint32_t voltage_mv; } __ec_align4; +/* + * Gather all discovery information for the given port and partner type. + * + * Note that if discovery has not yet completed, only the currently completed + * responses will be filled in. If the discovery data structures are changed + * in the process of the command running, BUSY will be returned. + * + * VDO field sizes are set to the maximum possible number of VDOs a VDM may + * contain, while the number of SVIDs here is selected to fit within the PROTO2 + * maximum parameter size. + */ +#define EC_CMD_TYPEC_DISCOVERY 0x0131 + +enum typec_partner_type { + TYPEC_PARTNER_SOP = 0, + TYPEC_PARTNER_SOP_PRIME = 1, +}; + +struct ec_params_typec_discovery { + uint8_t port; + uint8_t partner_type; /* enum typec_partner_type */ +} __ec_align1; + +struct svid_mode_info { + uint16_t svid; + uint16_t mode_count; /* Number of modes partner sent */ + uint32_t mode_vdo[6]; /* Max VDOs allowed after VDM header is 6 */ +}; + +struct ec_response_typec_discovery { + uint8_t identity_count; /* Number of identity VDOs partner sent */ + uint8_t svid_count; /* Number of SVIDs partner sent */ + uint16_t reserved; + uint32_t discovery_vdo[6]; /* Max VDOs allowed after VDM header is 6 */ + struct svid_mode_info svids[0]; +} __ec_align1; + +/* + * Gather all status information for a port. + * + * Note: this covers many of the return fields from the deprecated + * EC_CMD_USB_PD_CONTROL command, except those that are redundant with the + * discovery data. The "enum pd_cc_states" is defined with the deprecated + * EC_CMD_USB_PD_CONTROL command. + * + * This also combines in the EC_CMD_USB_PD_MUX_INFO flags. + */ +#define EC_CMD_TYPEC_STATUS 0x0133 + +/* + * Power role. + * + * Note this is also used for PD header creation, and values align to those in + * the Power Delivery Specification Revision 3.0 (See + * 6.2.1.1.4 Port Power Role). + */ +enum pd_power_role { + PD_ROLE_SINK = 0, + PD_ROLE_SOURCE = 1 +}; + +/* + * Data role. + * + * Note this is also used for PD header creation, and the first two values + * align to those in the Power Delivery Specification Revision 3.0 (See + * 6.2.1.1.6 Port Data Role). + */ +enum pd_data_role { + PD_ROLE_UFP = 0, + PD_ROLE_DFP = 1, + PD_ROLE_DISCONNECTED = 2, +}; + +enum pd_vconn_role { + PD_ROLE_VCONN_OFF = 0, + PD_ROLE_VCONN_SRC = 1, +}; + +/* + * Note: BIT(0) may be used to determine whether the polarity is CC1 or CC2, + * regardless of whether a debug accessory is connected. 
+ */ +enum tcpc_cc_polarity { + /* + * _CCx: is used to indicate the polarity while not connected to + * a Debug Accessory. Only one CC line will assert a resistor and + * the other will be open. + */ + POLARITY_CC1 = 0, + POLARITY_CC2 = 1, + + /* + * _CCx_DTS is used to indicate the polarity while connected to a + * SRC Debug Accessory. Assert resistors on both lines. + */ + POLARITY_CC1_DTS = 2, + POLARITY_CC2_DTS = 3, + + /* + * The current TCPC code relies on these specific POLARITY values. + * Adding in a check to verify if the list grows for any reason + * that this will give a hint that other places need to be + * adjusted. + */ + POLARITY_COUNT +}; + +#define PD_STATUS_EVENT_SOP_DISC_DONE BIT(0) +#define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE BIT(1) + +struct ec_params_typec_status { + uint8_t port; +} __ec_align1; + +struct ec_response_typec_status { + uint8_t pd_enabled; /* PD communication enabled - bool */ + uint8_t dev_connected; /* Device connected - bool */ + uint8_t sop_connected; /* Device is SOP PD capable - bool */ + uint8_t source_cap_count; /* Number of Source Cap PDOs */ + + uint8_t power_role; /* enum pd_power_role */ + uint8_t data_role; /* enum pd_data_role */ + uint8_t vconn_role; /* enum pd_vconn_role */ + uint8_t sink_cap_count; /* Number of Sink Cap PDOs */ + + uint8_t polarity; /* enum tcpc_cc_polarity */ + uint8_t cc_state; /* enum pd_cc_states */ + uint8_t dp_pin; /* DP pin mode (MODE_DP_IN_[A-E]) */ + uint8_t mux_state; /* USB_PD_MUX* - encoded mux state */ + + char tc_state[32]; /* TC state name */ + + uint32_t events; /* PD_STATUS_EVENT bitmask */ + + /* + * BCD PD revisions for partners + * + * The format has the PD major reversion in the upper nibble, and PD + * minor version in the next nibble. Following two nibbles are + * currently 0. + * ex. PD 3.2 would map to 0x3200 + * + * PD major/minor will be 0 if no PD device is connected. + */ + uint16_t sop_revision; + uint16_t sop_prime_revision; + + uint32_t source_cap_pdos[7]; /* Max 7 PDOs can be present */ + + uint32_t sink_cap_pdos[7]; /* Max 7 PDOs can be present */ +} __ec_align1; + /*****************************************************************************/ /* The command range 0x200-0x2FF is reserved for Rotor. */ -- cgit From 7e890c37c25c7cbca37ff0ab292873d8146e713b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Nov 2020 17:50:04 +0100 Subject: block: add a return value to set_capacity_revalidate_and_notify Return if the function ended up sending an uevent or not. 
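A sketch of how a caller can use the new return value (the surrounding driver context is hypothetical):

	if (!set_capacity_revalidate_and_notify(disk, new_size, true)) {
		/*
		 * The core skipped the notification (e.g. the old or new
		 * size was zero), so send a change event ourselves.
		 */
		kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
	}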
Cc: stable@vger.kernel.org # v5.9 Signed-off-by: Christoph Hellwig Reviewed-by: Petr Vorel Signed-off-by: Jens Axboe --- block/genhd.c | 5 ++++- include/linux/genhd.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 0a273211fec2..9387f050c248 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -49,7 +49,7 @@ static void disk_release_events(struct gendisk *disk); * Set disk capacity and notify if the size is not currently * zero and will not be set to zero */ -void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, +bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, bool update_bdev) { sector_t capacity = get_capacity(disk); @@ -62,7 +62,10 @@ void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, char *envp[] = { "RESIZE=1", NULL }; kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + return true; } + + return false; } EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 38f23d757013..03da3f603d30 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -315,7 +315,7 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); -void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, +bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, bool update_bdev); /* drivers/char/random.c */ -- cgit From af0c351cc34857ad7b254850b9392d99da46be9e Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 10 Nov 2020 20:50:02 +0100 Subject: usbnet: switch to core handling of rx/tx byte/packet counters Use netdev->tstats instead of a member of usbnet for storing a pointer to the per-cpu counters. This allows us to use core functionality for statistics handling. 
Signed-off-by: Heiner Kallweit Signed-off-by: Jakub Kicinski --- drivers/net/usb/usbnet.c | 23 +++++++---------------- include/linux/usb/usbnet.h | 6 ++---- 2 files changed, 9 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 6062dc27870e..1447da1d5729 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -304,7 +304,7 @@ static void __usbnet_status_stop_force(struct usbnet *dev) */ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb) { - struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64); + struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->net->tstats); unsigned long flags; int status; @@ -980,15 +980,6 @@ int usbnet_set_link_ksettings(struct net_device *net, } EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings); -void usbnet_get_stats64(struct net_device *net, struct rtnl_link_stats64 *stats) -{ - struct usbnet *dev = netdev_priv(net); - - netdev_stats_to_stats64(stats, &net->stats); - dev_fetch_sw_netstats(stats, dev->stats64); -} -EXPORT_SYMBOL_GPL(usbnet_get_stats64); - u32 usbnet_get_link (struct net_device *net) { struct usbnet *dev = netdev_priv(net); @@ -1220,7 +1211,7 @@ static void tx_complete (struct urb *urb) struct usbnet *dev = entry->dev; if (urb->status == 0) { - struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64); + struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->net->tstats); unsigned long flags; flags = u64_stats_update_begin_irqsave(&stats64->syncp); @@ -1596,7 +1587,7 @@ void usbnet_disconnect (struct usb_interface *intf) usb_free_urb(dev->interrupt); kfree(dev->padding_pkt); - free_percpu(dev->stats64); + free_percpu(net->tstats); free_netdev(net); } EXPORT_SYMBOL_GPL(usbnet_disconnect); @@ -1608,7 +1599,7 @@ static const struct net_device_ops usbnet_netdev_ops = { .ndo_tx_timeout = usbnet_tx_timeout, .ndo_set_rx_mode = usbnet_set_rx_mode, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; @@ -1671,8 +1662,8 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) dev->driver_info = info; dev->driver_name = name; - dev->stats64 = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->stats64) + net->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!net->tstats) goto out0; dev->msg_enable = netif_msg_init (msg_level, NETIF_MSG_DRV @@ -1812,7 +1803,7 @@ out1: */ cancel_work_sync(&dev->kevent); del_timer_sync(&dev->delay); - free_percpu(dev->stats64); + free_percpu(net->tstats); out0: free_netdev(net); out: diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 2e4f7721fc4e..1f6dfa977e7f 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -65,8 +65,6 @@ struct usbnet { struct usb_anchor deferred; struct tasklet_struct bh; - struct pcpu_sw_netstats __percpu *stats64; - struct work_struct kevent; unsigned long flags; # define EVENT_TX_HALT 0 @@ -285,7 +283,7 @@ extern int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags); extern void usbnet_status_stop(struct usbnet *dev); extern void usbnet_update_max_qlen(struct usbnet *dev); -extern void usbnet_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats); + +#define usbnet_get_stats64 dev_get_tstats64 #endif /* __LINUX_USB_USBNET_H */ -- cgit From 323955a0498ccaa1263e369b91efd8f4310768b6 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: 
Tue, 10 Nov 2020 20:51:03 +0100 Subject: net: usb: switch to dev_get_tstats64 and remove usbnet_get_stats64 alias MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace usbnet_get_stats64() with new identical core function dev_get_tstats64() in all users and remove usbnet_get_stats64(). Signed-off-by: Heiner Kallweit Acked-by: Bjørn Mork Signed-off-by: Jakub Kicinski --- drivers/net/usb/aqc111.c | 2 +- drivers/net/usb/asix_devices.c | 6 +++--- drivers/net/usb/ax88172a.c | 2 +- drivers/net/usb/ax88179_178a.c | 2 +- drivers/net/usb/cdc_mbim.c | 2 +- drivers/net/usb/cdc_ncm.c | 2 +- drivers/net/usb/dm9601.c | 2 +- drivers/net/usb/int51x1.c | 2 +- drivers/net/usb/mcs7830.c | 2 +- drivers/net/usb/qmi_wwan.c | 2 +- drivers/net/usb/rndis_host.c | 2 +- drivers/net/usb/sierra_net.c | 2 +- drivers/net/usb/smsc75xx.c | 2 +- drivers/net/usb/smsc95xx.c | 2 +- drivers/net/usb/sr9700.c | 2 +- drivers/net/usb/sr9800.c | 2 +- drivers/net/wireless/rndis_wlan.c | 2 +- include/linux/usb/usbnet.h | 2 -- 18 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/aqc111.c b/drivers/net/usb/aqc111.c index 0717c18015c9..73b97f4cc1ec 100644 --- a/drivers/net/usb/aqc111.c +++ b/drivers/net/usb/aqc111.c @@ -641,7 +641,7 @@ static const struct net_device_ops aqc111_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_change_mtu = aqc111_change_mtu, .ndo_set_mac_address = aqc111_set_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index ef548beba684..6e13d8165852 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -194,7 +194,7 @@ static const struct net_device_ops ax88172_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = asix_ioctl, @@ -580,7 +580,7 @@ static const struct net_device_ops ax88772_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = asix_ioctl, @@ -1050,7 +1050,7 @@ static const struct net_device_ops ax88178_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = asix_set_multicast, diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c index fd3a04d98dc1..b404c9462dce 100644 --- a/drivers/net/usb/ax88172a.c +++ b/drivers/net/usb/ax88172a.c @@ -120,7 +120,7 @@ static const struct net_device_ops ax88172a_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = 
phy_do_ioctl_running, diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 5541f3faedbc..d650b39b6e5d 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1031,7 +1031,7 @@ static const struct net_device_ops ax88179_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_change_mtu = ax88179_change_mtu, .ndo_set_mac_address = ax88179_set_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index eb100eb33de3..5db66272fc82 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -98,7 +98,7 @@ static const struct net_device_ops cdc_mbim_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_change_mtu = cdc_ncm_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 746353c51f6d..2bac57d5e8d5 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -793,7 +793,7 @@ static const struct net_device_ops cdc_ncm_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_set_rx_mode = usbnet_set_rx_mode, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_change_mtu = cdc_ncm_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c index 915ac75b55fc..b5d2ac55a874 100644 --- a/drivers/net/usb/dm9601.c +++ b/drivers/net/usb/dm9601.c @@ -343,7 +343,7 @@ static const struct net_device_ops dm9601_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = dm9601_ioctl, .ndo_set_rx_mode = dm9601_set_multicast, diff --git a/drivers/net/usb/int51x1.c b/drivers/net/usb/int51x1.c index cb5bc1a7fa5a..ed05f992c612 100644 --- a/drivers/net/usb/int51x1.c +++ b/drivers/net/usb/int51x1.c @@ -133,7 +133,7 @@ static const struct net_device_ops int51x1_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = int51x1_set_multicast, diff --git a/drivers/net/usb/mcs7830.c b/drivers/net/usb/mcs7830.c index 09bfa6a4dfbc..fc512b780d15 100644 --- a/drivers/net/usb/mcs7830.c +++ b/drivers/net/usb/mcs7830.c @@ -462,7 +462,7 @@ static const struct net_device_ops mcs7830_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = mcs7830_ioctl, .ndo_set_rx_mode = mcs7830_set_multicast, diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index b9d74d9a7301..afeb09b9624e 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -575,7 +575,7 @@ 
static const struct net_device_ops qmi_wwan_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = qmi_wwan_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c index 6fa7a009a24a..6609d21ef894 100644 --- a/drivers/net/usb/rndis_host.c +++ b/drivers/net/usb/rndis_host.c @@ -279,7 +279,7 @@ static const struct net_device_ops rndis_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/usb/sierra_net.c b/drivers/net/usb/sierra_net.c index 0abd257b634c..55a244eca5ca 100644 --- a/drivers/net/usb/sierra_net.c +++ b/drivers/net/usb/sierra_net.c @@ -184,7 +184,7 @@ static const struct net_device_ops sierra_net_device_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index 8689835a5214..4353b370249f 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -1435,7 +1435,7 @@ static const struct net_device_ops smsc75xx_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_change_mtu = smsc75xx_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index ea0d5f04dc3a..4c8ee1cff4d4 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -1041,7 +1041,7 @@ static const struct net_device_ops smsc95xx_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = smsc95xx_ioctl, diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c index e04c8054c2cf..878557ad03ad 100644 --- a/drivers/net/usb/sr9700.c +++ b/drivers/net/usb/sr9700.c @@ -308,7 +308,7 @@ static const struct net_device_ops sr9700_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = sr9700_ioctl, .ndo_set_rx_mode = sr9700_set_multicast, diff --git a/drivers/net/usb/sr9800.c b/drivers/net/usb/sr9800.c index 681e0def6356..da56735d7755 100644 --- a/drivers/net/usb/sr9800.c +++ b/drivers/net/usb/sr9800.c @@ -681,7 +681,7 @@ static const struct net_device_ops sr9800_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, - .ndo_get_stats64 = usbnet_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = sr_set_mac_address, .ndo_validate_addr = 
eth_validate_addr,
 	.ndo_do_ioctl		= sr_ioctl,
diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c
index 75b5d545b49e..9fe77556858e 100644
--- a/drivers/net/wireless/rndis_wlan.c
+++ b/drivers/net/wireless/rndis_wlan.c
@@ -3379,7 +3379,7 @@ static const struct net_device_ops rndis_wlan_netdev_ops = {
 	.ndo_stop		= usbnet_stop,
 	.ndo_start_xmit		= usbnet_start_xmit,
 	.ndo_tx_timeout		= usbnet_tx_timeout,
-	.ndo_get_stats64	= usbnet_get_stats64,
+	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_rx_mode	= rndis_wlan_set_multicast_list,
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 1f6dfa977e7f..88a7673894d5 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -284,6 +284,4 @@ extern void usbnet_status_stop(struct usbnet *dev);

 extern void usbnet_update_max_qlen(struct usbnet *dev);

-#define usbnet_get_stats64 dev_get_tstats64
-
 #endif /* __LINUX_USB_USBNET_H */
-- cgit

From 6d94e741a8ff818e5518da8257f5ca0aaed1f269 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Tue, 10 Nov 2020 19:12:11 -0800
Subject: bpf: Support for pointers beyond pkt_end.

This patch adds verifier support for recognizing inlined branch
conditions. LLVM knows that such a branch always evaluates to the same
value, but the verifier could not track it, so valid programs were
rejected. The potential LLVM workaround, https://reviews.llvm.org/D87428,
can have undesired side effects, since LLVM doesn't know that
skb->data/data_end are being compared: LLVM would have to introduce an
extra boolean variable and use an inline_asm trick to emit assembly that
is easier for the verifier. Instead, teach the verifier to recognize that
in

  r1 = skb->data;
  r1 += 10;
  r2 = skb->data_end;
  if (r1 > r2) {
	here r1 points beyond packet_end and a subsequent
	if (r1 > r2) // always evaluates to "true".
  }

Signed-off-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
Tested-by: Jiri Olsa
Acked-by: John Fastabend
Link: https://lore.kernel.org/bpf/20201111031213.25109-2-alexei.starovoitov@gmail.com
---
 include/linux/bpf_verifier.h |   2 +-
 kernel/bpf/verifier.c        | 129 +++++++++++++++++++++++++++++++++++--------
 2 files changed, 108 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e83ef6f6bf43..306869d4743b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -45,7 +45,7 @@ struct bpf_reg_state {
 	enum bpf_reg_type type;
 	union {
 		/* valid when type == PTR_TO_PACKET */
-		u16 range;
+		int range;

 		/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
 		 *   PTR_TO_MAP_VALUE_OR_NULL
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 10da26e55130..7b1f85aa9741 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2739,7 +2739,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 			regno);
 		return -EACCES;
 	}
-	err = __check_mem_access(env, regno, off, size, reg->range,
+
+	err = reg->range < 0 ?
-EINVAL : + __check_mem_access(env, regno, off, size, reg->range, zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); @@ -4697,6 +4699,32 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) __clear_all_pkt_pointers(env, vstate->frame[i]); } +enum { + AT_PKT_END = -1, + BEYOND_PKT_END = -2, +}; + +static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open) +{ + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regn]; + + if (reg->type != PTR_TO_PACKET) + /* PTR_TO_PACKET_META is not supported yet */ + return; + + /* The 'reg' is pkt > pkt_end or pkt >= pkt_end. + * How far beyond pkt_end it goes is unknown. + * if (!range_open) it's the case of pkt >= pkt_end + * if (range_open) it's the case of pkt > pkt_end + * hence this pointer is at least 1 byte bigger than pkt_end + */ + if (range_open) + reg->range = BEYOND_PKT_END; + else + reg->range = AT_PKT_END; +} + static void release_reg_references(struct bpf_verifier_env *env, struct bpf_func_state *state, int ref_obj_id) @@ -6708,7 +6736,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) static void __find_good_pkt_pointers(struct bpf_func_state *state, struct bpf_reg_state *dst_reg, - enum bpf_reg_type type, u16 new_range) + enum bpf_reg_type type, int new_range) { struct bpf_reg_state *reg; int i; @@ -6733,8 +6761,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, enum bpf_reg_type type, bool range_right_open) { - u16 new_range; - int i; + int new_range, i; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -6985,6 +7012,67 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, return is_branch64_taken(reg, val, opcode); } +static int flip_opcode(u32 opcode) +{ + /* How can we transform "a b" into "b a"? */ + static const u8 opcode_flip[16] = { + /* these stay the same */ + [BPF_JEQ >> 4] = BPF_JEQ, + [BPF_JNE >> 4] = BPF_JNE, + [BPF_JSET >> 4] = BPF_JSET, + /* these swap "lesser" and "greater" (L and G in the opcodes) */ + [BPF_JGE >> 4] = BPF_JLE, + [BPF_JGT >> 4] = BPF_JLT, + [BPF_JLE >> 4] = BPF_JGE, + [BPF_JLT >> 4] = BPF_JGT, + [BPF_JSGE >> 4] = BPF_JSLE, + [BPF_JSGT >> 4] = BPF_JSLT, + [BPF_JSLE >> 4] = BPF_JSGE, + [BPF_JSLT >> 4] = BPF_JSGT + }; + return opcode_flip[opcode >> 4]; +} + +static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg, + u8 opcode) +{ + struct bpf_reg_state *pkt; + + if (src_reg->type == PTR_TO_PACKET_END) { + pkt = dst_reg; + } else if (dst_reg->type == PTR_TO_PACKET_END) { + pkt = src_reg; + opcode = flip_opcode(opcode); + } else { + return -1; + } + + if (pkt->range >= 0) + return -1; + + switch (opcode) { + case BPF_JLE: + /* pkt <= pkt_end */ + fallthrough; + case BPF_JGT: + /* pkt > pkt_end */ + if (pkt->range == BEYOND_PKT_END) + /* pkt has at last one extra byte beyond pkt_end */ + return opcode == BPF_JGT; + break; + case BPF_JLT: + /* pkt < pkt_end */ + fallthrough; + case BPF_JGE: + /* pkt >= pkt_end */ + if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END) + return opcode == BPF_JGE; + break; + } + return -1; +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. 
@@ -7148,23 +7236,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, u64 val, u32 val32, u8 opcode, bool is_jmp32) { - /* How can we transform "a b" into "b a"? */ - static const u8 opcode_flip[16] = { - /* these stay the same */ - [BPF_JEQ >> 4] = BPF_JEQ, - [BPF_JNE >> 4] = BPF_JNE, - [BPF_JSET >> 4] = BPF_JSET, - /* these swap "lesser" and "greater" (L and G in the opcodes) */ - [BPF_JGE >> 4] = BPF_JLE, - [BPF_JGT >> 4] = BPF_JLT, - [BPF_JLE >> 4] = BPF_JGE, - [BPF_JLT >> 4] = BPF_JGT, - [BPF_JSGE >> 4] = BPF_JSLE, - [BPF_JSGT >> 4] = BPF_JSLT, - [BPF_JSLE >> 4] = BPF_JSGE, - [BPF_JSLT >> 4] = BPF_JSGT - }; - opcode = opcode_flip[opcode >> 4]; + opcode = flip_opcode(opcode); /* This uses zero as "not present in table"; luckily the zero opcode, * BPF_JA, can't get here. */ @@ -7346,6 +7418,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, false); + mark_pkt_end(other_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7353,6 +7426,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end > pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, true); + mark_pkt_end(this_branch, insn->src_reg, false); } else { return false; } @@ -7365,6 +7439,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, true); + mark_pkt_end(this_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7372,6 +7447,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end < pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, false); + mark_pkt_end(other_branch, insn->src_reg, true); } else { return false; } @@ -7384,6 +7460,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, true); + mark_pkt_end(other_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7391,6 +7468,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, false); + mark_pkt_end(this_branch, insn->src_reg, true); } else { return false; } @@ -7403,6 +7481,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, false); + mark_pkt_end(this_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7410,6 +7489,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, true); + mark_pkt_end(other_branch, insn->src_reg, false); } else { return false; } @@ -7509,6 +7589,10 @@ static int check_cond_jmp_op(struct 
bpf_verifier_env *env, src_reg->var_off.value, opcode, is_jmp32); + } else if (reg_is_pkt_pointer_any(dst_reg) && + reg_is_pkt_pointer_any(src_reg) && + !is_jmp32) { + pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode); } if (pred >= 0) { @@ -7517,7 +7601,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!__is_pointer_value(false, dst_reg)) err = mark_chain_precision(env, insn->dst_reg); - if (BPF_SRC(insn->code) == BPF_X && !err) + if (BPF_SRC(insn->code) == BPF_X && !err && + !__is_pointer_value(false, src_reg)) err = mark_chain_precision(env, insn->src_reg); if (err) return err; -- cgit From 8d0dd23c6c78d140ed2132f523592ddb4cea839f Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Thu, 12 Nov 2020 16:56:57 -0500 Subject: syscalls: Fix file comments for syscalls implemented in kernel/sys.c The relevant syscalls were previously moved from kernel/timer.c to kernel/sys.c, but the comments weren't updated to reflect this change. Fixing these comments messes up the alphabetical ordering of syscalls by filename. This could be fixed by merging the two groups of kernel/sys.c syscalls, but that would require reordering the syscalls and renumbering them to maintain the numerical order in unistd.h. Signed-off-by: Tal Zussman Link: https://lore.kernel.org/r/20201112215657.GA4539@charmander' Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 2 +- include/uapi/asm-generic/unistd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 37bea07c12f2..629870fbb2c9 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -744,7 +744,7 @@ asmlinkage long sys_settimeofday(struct __kernel_old_timeval __user *tv, asmlinkage long sys_adjtimex(struct __kernel_timex __user *txc_p); asmlinkage long sys_adjtimex_time32(struct old_timex32 __user *txc_p); -/* kernel/timer.c */ +/* kernel/sys.c */ asmlinkage long sys_getpid(void); asmlinkage long sys_getppid(void); asmlinkage long sys_getuid(void); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 2056318988f7..fc48c64700eb 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -517,7 +517,7 @@ __SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday) __SC_3264(__NR_adjtimex, sys_adjtimex_time32, sys_adjtimex) #endif -/* kernel/timer.c */ +/* kernel/sys.c */ #define __NR_getpid 172 __SYSCALL(__NR_getpid, sys_getpid) #define __NR_getppid 173 -- cgit From a609c58086e381c13bdad1ba97e6510a13d465e7 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 12 Nov 2020 10:58:55 +0000 Subject: tty: serial: 8250: 8250_port: Move prototypes to shared location MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 kernel build warning(s): drivers/tty/serial/8250/8250_port.c:349:14: warning: no previous prototype for ‘au_serial_in’ [-Wmissing-prototypes] drivers/tty/serial/8250/8250_port.c:359:6: warning: no previous prototype for ‘au_serial_out’ [-Wmissing-prototypes] Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Mike Hudson Cc: linux-serial@vger.kernel.org Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20201112105857.2078977-3-lee.jones@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_early.c | 3 --- include/linux/serial_8250.h | 5 +++++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git 
a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c index 70d7826788f5..c171ce6db691 100644 --- a/drivers/tty/serial/8250/8250_early.c +++ b/drivers/tty/serial/8250/8250_early.c @@ -204,9 +204,6 @@ OF_EARLYCON_DECLARE(omap8250, "ti,omap4-uart", early_omap8250_setup); #ifdef CONFIG_SERIAL_8250_RT288X -unsigned int au_serial_in(struct uart_port *p, int offset); -void au_serial_out(struct uart_port *p, int offset, int value); - static int __init early_au_setup(struct earlycon_device *dev, const char *opt) { dev->port.serial_in = au_serial_in; diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 2b70f736b091..9e655055112d 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -187,4 +187,9 @@ extern void serial8250_set_isa_configurator(void (*v) (int port, struct uart_port *up, u32 *capabilities)); +#ifdef CONFIG_SERIAL_8250_RT288X +unsigned int au_serial_in(struct uart_port *p, int offset); +void au_serial_out(struct uart_port *p, int offset, int value); +#endif + #endif -- cgit From 423f16108c9d832bd96059d5c882c8ef6d76eb96 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 13 Nov 2020 00:59:29 +0000 Subject: bpf: Augment the set of sleepable LSM hooks Update the set of sleepable hooks with the ones that do not trigger a warning with might_fault() when exercised with the correct kernel config options enabled, i.e. DEBUG_ATOMIC_SLEEP=y LOCKDEP=y PROVE_LOCKING=y This means that a sleepable LSM eBPF program can be attached to these LSM hooks. A new helper method bpf_lsm_is_sleepable_hook is added and the set is maintained locally in bpf_lsm.c Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201113005930.541956-2-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 7 +++++ kernel/bpf/bpf_lsm.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 16 +--------- 3 files changed, 89 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 73226181b744..0d1c33ace398 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -27,6 +27,8 @@ extern struct lsm_blob_sizes bpf_lsm_blob_sizes; int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +bool bpf_lsm_is_sleepable_hook(u32 btf_id); + static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { @@ -54,6 +56,11 @@ void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ +static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) +{ + return false; +} + static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index e92c51bebb47..aed74b853415 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -13,6 +13,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -72,6 +73,86 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +/* The set of hooks which are called without pagefaults disabled and are allowed + * to "sleep" and thus can be used for sleeable BPF programs. 
+ */ +BTF_SET_START(sleepable_lsm_hooks) +BTF_ID(func, bpf_lsm_bpf) +BTF_ID(func, bpf_lsm_bpf_map) +BTF_ID(func, bpf_lsm_bpf_map_alloc_security) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog) +BTF_ID(func, bpf_lsm_bprm_check_security) +BTF_ID(func, bpf_lsm_bprm_committed_creds) +BTF_ID(func, bpf_lsm_bprm_committing_creds) +BTF_ID(func, bpf_lsm_bprm_creds_for_exec) +BTF_ID(func, bpf_lsm_bprm_creds_from_file) +BTF_ID(func, bpf_lsm_capget) +BTF_ID(func, bpf_lsm_capset) +BTF_ID(func, bpf_lsm_cred_prepare) +BTF_ID(func, bpf_lsm_file_ioctl) +BTF_ID(func, bpf_lsm_file_lock) +BTF_ID(func, bpf_lsm_file_open) +BTF_ID(func, bpf_lsm_file_receive) +BTF_ID(func, bpf_lsm_inet_conn_established) +BTF_ID(func, bpf_lsm_inode_create) +BTF_ID(func, bpf_lsm_inode_free_security) +BTF_ID(func, bpf_lsm_inode_getattr) +BTF_ID(func, bpf_lsm_inode_getxattr) +BTF_ID(func, bpf_lsm_inode_mknod) +BTF_ID(func, bpf_lsm_inode_need_killpriv) +BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_readlink) +BTF_ID(func, bpf_lsm_inode_rename) +BTF_ID(func, bpf_lsm_inode_rmdir) +BTF_ID(func, bpf_lsm_inode_setattr) +BTF_ID(func, bpf_lsm_inode_setxattr) +BTF_ID(func, bpf_lsm_inode_symlink) +BTF_ID(func, bpf_lsm_inode_unlink) +BTF_ID(func, bpf_lsm_kernel_module_request) +BTF_ID(func, bpf_lsm_kernfs_init_security) +BTF_ID(func, bpf_lsm_key_free) +BTF_ID(func, bpf_lsm_mmap_file) +BTF_ID(func, bpf_lsm_netlink_send) +BTF_ID(func, bpf_lsm_path_notify) +BTF_ID(func, bpf_lsm_release_secctx) +BTF_ID(func, bpf_lsm_sb_alloc_security) +BTF_ID(func, bpf_lsm_sb_eat_lsm_opts) +BTF_ID(func, bpf_lsm_sb_kern_mount) +BTF_ID(func, bpf_lsm_sb_mount) +BTF_ID(func, bpf_lsm_sb_remount) +BTF_ID(func, bpf_lsm_sb_set_mnt_opts) +BTF_ID(func, bpf_lsm_sb_show_options) +BTF_ID(func, bpf_lsm_sb_statfs) +BTF_ID(func, bpf_lsm_sb_umount) +BTF_ID(func, bpf_lsm_settime) +BTF_ID(func, bpf_lsm_socket_accept) +BTF_ID(func, bpf_lsm_socket_bind) +BTF_ID(func, bpf_lsm_socket_connect) +BTF_ID(func, bpf_lsm_socket_create) +BTF_ID(func, bpf_lsm_socket_getpeername) +BTF_ID(func, bpf_lsm_socket_getpeersec_dgram) +BTF_ID(func, bpf_lsm_socket_getsockname) +BTF_ID(func, bpf_lsm_socket_getsockopt) +BTF_ID(func, bpf_lsm_socket_listen) +BTF_ID(func, bpf_lsm_socket_post_create) +BTF_ID(func, bpf_lsm_socket_recvmsg) +BTF_ID(func, bpf_lsm_socket_sendmsg) +BTF_ID(func, bpf_lsm_socket_shutdown) +BTF_ID(func, bpf_lsm_socket_socketpair) +BTF_ID(func, bpf_lsm_syslog) +BTF_ID(func, bpf_lsm_task_alloc) +BTF_ID(func, bpf_lsm_task_getsecid) +BTF_ID(func, bpf_lsm_task_prctl) +BTF_ID(func, bpf_lsm_task_setscheduler) +BTF_ID(func, bpf_lsm_task_to_inode) +BTF_SET_END(sleepable_lsm_hooks) + +bool bpf_lsm_is_sleepable_hook(u32 btf_id) +{ + return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b1f85aa9741..fb2943ea715d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11562,20 +11562,6 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name) return -EINVAL; } -/* non exhaustive list of sleepable bpf_lsm_*() functions */ -BTF_SET_START(btf_sleepable_lsm_hooks) -#ifdef CONFIG_BPF_LSM -BTF_ID(func, bpf_lsm_bprm_committed_creds) -#else -BTF_ID_UNUSED -#endif -BTF_SET_END(btf_sleepable_lsm_hooks) - -static int check_sleepable_lsm_hook(u32 btf_id) -{ - return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); -} - /* list of non-sleepable functions that are otherwise on * ALLOW_ERROR_INJECTION 
list */ @@ -11797,7 +11783,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, /* LSM progs check that they are attached to bpf_lsm_*() funcs. * Only some of them are sleepable. */ - if (check_sleepable_lsm_hook(btf_id)) + if (bpf_lsm_is_sleepable_hook(btf_id)) ret = 0; break; default: -- cgit From d19ad0775dcd64b49eecf4fa79c17959ebfbd26b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 17:42:17 -0400 Subject: ftrace: Have the callbacks receive a struct ftrace_regs instead of pt_regs In preparation for having the arguments of a function passed to callbacks attached to functions by default, change the default callback prototype to receive a struct ftrace_regs as the fourth parameter instead of a pt_regs. Callbacks that set the FL_SAVE_REGS flag in their ftrace_ops flags will now need to get the pt_regs via the ftrace_get_regs() helper call. If this is called by a callback whose ftrace_ops did not have the FL_SAVE_REGS flag set, that helper function will return NULL. This will allow the ftrace_regs to hold just enough to get the parameters and stack pointer, without the worry that callbacks may have a pt_regs that is not completely filled. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/csky/kernel/probes/ftrace.c | 4 +++- arch/nds32/kernel/ftrace.c | 4 ++-- arch/parisc/kernel/ftrace.c | 8 +++++--- arch/powerpc/kernel/kprobes-ftrace.c | 4 +++- arch/s390/kernel/ftrace.c | 4 +++- arch/x86/kernel/kprobes/ftrace.c | 3 ++- fs/pstore/ftrace.c | 2 +- include/linux/ftrace.h | 16 ++++++++++++++-- include/linux/kprobes.h | 2 +- kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 27 +++++++++++++++------------ kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions.c | 9 ++++----- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 20 +++++++++++--------- kernel/trace/trace_stack.c | 2 +- 18 files changed, 71 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index f30b179924ef..ae2b1c7b3b5c 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -11,17 +11,19 @@ int arch_check_ftrace_location(struct kprobe *p) /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { int bit; bool lr_saver = false; struct kprobe *p; struct kprobe_ctlblk *kcb; + struct pt_regs *regs; bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (!p) { diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index 3763b3f8c3db..414f8a780cc3 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -10,7 +10,7 @@ extern void (*ftrace_trace_function)(unsigned long, unsigned long, extern void ftrace_graph_caller(void); noinline void __naked ftrace_stub(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { __asm__ (""); /* avoid to optimize as pure function */ } @@ -38,7 +38,7 @@ EXPORT_SYMBOL(_mcount); #else /* CONFIG_DYNAMIC_FTRACE */ noinline void __naked
ftrace_stub(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { __asm__ (""); /* avoid to optimize as pure function */ } diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 1c5d3732bda2..0a1e75af5382 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -51,7 +51,7 @@ static void __hot prepare_ftrace_return(unsigned long *parent, void notrace __hot ftrace_function_trampoline(unsigned long parent, unsigned long self_addr, unsigned long org_sp_gr3, - struct pt_regs *regs) + struct ftrace_regs *fregs) { #ifndef CONFIG_DYNAMIC_FTRACE extern ftrace_func_t ftrace_trace_function; @@ -61,7 +61,7 @@ void notrace __hot ftrace_function_trampoline(unsigned long parent, if (function_trace_op->flags & FTRACE_OPS_FL_ENABLED && ftrace_trace_function != ftrace_stub) ftrace_trace_function(self_addr, parent, - function_trace_op, regs); + function_trace_op, fregs); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (dereference_function_descriptor(ftrace_graph_return) != @@ -204,9 +204,10 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, #ifdef CONFIG_KPROBES_ON_FTRACE void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe_ctlblk *kcb; + struct pt_regs *regs; struct kprobe *p; int bit; @@ -214,6 +215,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index fdfee39938ea..660138f6c4b2 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -14,16 +14,18 @@ /* Ftrace callback handler for kprobes */ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe *p; struct kprobe_ctlblk *kcb; + struct pt_regs *regs; int bit; bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 657c1ab45408..67b80f4412f9 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -198,9 +198,10 @@ int ftrace_disable_ftrace_graph_caller(void) #ifdef CONFIG_KPROBES_ON_FTRACE void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe_ctlblk *kcb; + struct pt_regs *regs; struct kprobe *p; int bit; @@ -208,6 +209,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 954d930a7127..373e5fa3ce1f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -14,8 +14,9 @@ /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned 
long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct kprobe *p; struct kprobe_ctlblk *kcb; int bit; diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index adb0935eb062..5939595f0115 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -26,7 +26,7 @@ static u64 pstore_ftrace_stamp; static void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, - struct pt_regs *regs) + struct ftrace_regs *fregs) { int bit; unsigned long flags; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8dde9c17aaa5..24e1fa52337d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -90,8 +90,20 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, struct ftrace_ops; +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) +{ + if (!fregs) + return NULL; + + return &fregs->regs; +} + typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); @@ -259,7 +271,7 @@ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); extern void ftrace_stub(unsigned long a0, unsigned long a1, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); #else /* !CONFIG_FUNCTION_TRACER */ /* diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 629abaf25681..be73350955e4 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -345,7 +345,7 @@ static inline void wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs); + struct ftrace_ops *ops, struct ftrace_regs *fregs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); #endif diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 875c5dbbdd33..f89f9e7e9b07 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -40,8 +40,9 @@ struct klp_ops *klp_find_ops(void *old_func) static void notrace klp_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *fops, - struct pt_regs *regs) + struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct klp_ops *ops; struct klp_func *func; int patch_state; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3db64fb0cce8..67888311784e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -121,7 +121,7 @@ struct ftrace_ops global_ops; #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); #else /* See comment below, where ftrace_ops_list_func is defined */ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); @@ -140,7 +140,7 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; int pid; @@ -154,7 +154,7 
@@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, return; } - op->saved_func(ip, parent_ip, op, regs); + op->saved_func(ip, parent_ip, op, fregs); } static void ftrace_sync_ipi(void *data) @@ -754,7 +754,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) static void function_profile_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -2143,6 +2143,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { /* * If there's only one user (direct_ops helper) @@ -2368,8 +2369,9 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) } static void call_direct_funcs(unsigned long ip, unsigned long pip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); unsigned long addr; addr = ftrace_find_rec_direct(ip); @@ -4292,7 +4294,7 @@ static int __init ftrace_mod_cmd_init(void) core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct ftrace_probe_ops *probe_ops; struct ftrace_func_probe *probe; @@ -6911,8 +6913,9 @@ void ftrace_reset_array_ops(struct trace_array *tr) static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ignored, struct pt_regs *regs) + struct ftrace_ops *ignored, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct ftrace_ops *op; int bit; @@ -6945,7 +6948,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, pr_warn("op=%p %pS\n", op, op); goto out; } - op->func(ip, parent_ip, op, regs); + op->func(ip, parent_ip, op, fregs); } } while_for_each_ftrace_op(op); out: @@ -6968,9 +6971,9 @@ out: */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { - __ftrace_ops_list_func(ip, parent_ip, NULL, regs); + __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } NOKPROBE_SYMBOL(ftrace_ops_list_func); #else @@ -6987,7 +6990,7 @@ NOKPROBE_SYMBOL(ftrace_ops_no_ops); * this function will be called by the mcount trampoline. 
*/ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { int bit; @@ -6998,7 +7001,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) - op->func(ip, parent_ip, op, regs); + op->func(ip, parent_ip, op, fregs); preempt_enable_notrace(); trace_clear_recursion(bit); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 1b202e28dfaa..a71181655958 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -432,7 +432,7 @@ NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *pt_regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_entry *entry; struct perf_event *event; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f4b459bb6d33..98d194d8460e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3673,7 +3673,7 @@ static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *regs) { struct trace_buffer *buffer; struct ring_buffer_event *event; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 646eda6c44a5..c5095dd28e20 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -23,10 +23,10 @@ static void tracing_start_function_trace(struct trace_array *tr); static void tracing_stop_function_trace(struct trace_array *tr); static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); static struct tracer_flags func_flags; /* Our option */ @@ -89,7 +89,6 @@ void ftrace_destroy_function_files(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; - /* * Instance trace_arrays get their ops allocated * at instance creation. 
Unless it failed @@ -129,7 +128,7 @@ static void function_trace_start(struct trace_array *tr) static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; struct trace_array_cpu *data; @@ -178,7 +177,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 10bbb0f381d5..d06aab4dcbb8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -138,7 +138,7 @@ static int func_prolog_dec(struct trace_array *tr, */ static void irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 97b10bb31a1f..c0181066dbe9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -212,7 +212,7 @@ static void wakeup_print_header(struct seq_file *s) */ static void wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 8ee3c0bb5d8a..5ed081c6471c 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -107,7 +107,7 @@ static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe1_cnt++; } @@ -116,7 +116,7 @@ static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe2_cnt++; } @@ -125,7 +125,7 @@ static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe3_cnt++; } @@ -134,7 +134,7 @@ static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_global_cnt++; } @@ -143,7 +143,7 @@ static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_dyn_cnt++; } @@ -414,7 +414,7 @@ static int trace_selftest_recursion_cnt; static void trace_selftest_test_recursion_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { /* * This function is registered without the recursion safe flag. 
@@ -429,7 +429,7 @@ static void trace_selftest_test_recursion_safe_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { /* * We said we would provide our own recursion. By calling @@ -548,9 +548,11 @@ static enum { static void trace_selftest_test_regs_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { - if (pt_regs) + struct pt_regs *regs = ftrace_get_regs(fregs); + + if (regs) trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; else trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 969db526a563..63c285042051 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -290,7 +290,7 @@ static void check_stack(unsigned long ip, unsigned long *stack) static void stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { unsigned long stack; -- cgit From 02a474ca266a47ea8f4d5a11f4ffa120f83730ad Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Oct 2020 10:55:55 -0400 Subject: ftrace/x86: Allow for arguments to be passed in to ftrace_regs by default Currently, the only way to get access to the registers of a function via an ftrace callback is to set the "FL_SAVE_REGS" bit in the ftrace_ops. But as this saves all regs as if a breakpoint were to trigger (for use with kprobes), it is expensive. The regs are already saved on the stack for the default ftrace callbacks, as that is required; otherwise a function being traced would get the wrong arguments and possibly crash. And on x86, the arguments are already stored where they would be in a pt_regs structure, so the same code can be used for both the regs version of a callback and the default one, and it makes sense to always pass that information to all functions. An architecture that does this (as x86_64 now does) sets HAVE_DYNAMIC_FTRACE_WITH_ARGS, which lets the generic code know that it can access the arguments without having to set the REGS flag. This also includes having the stack pointer saved, which can be used for accessing arguments on the stack, as well as having the function graph tracer not require its own trampoline!
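To make the new callback contract concrete, here is a minimal sketch of a callback under this scheme; the names my_callback and my_ops are invented for illustration (they are not part of this patch), and the sketch assumes that either FTRACE_OPS_FL_SAVE_REGS or the architecture's HAVE_DYNAMIC_FTRACE_WITH_ARGS support fills in the register contents:

    #include <linux/ftrace.h>
    #include <linux/ptrace.h>

    static void my_callback(unsigned long ip, unsigned long parent_ip,
                            struct ftrace_ops *op, struct ftrace_regs *fregs)
    {
            /* Returns NULL when a full pt_regs was not saved for this call */
            struct pt_regs *regs = ftrace_get_regs(fregs);

            if (!regs)
                    return;

            pr_info("traced %pS sp=%lx\n", (void *)ip,
                    kernel_stack_pointer(regs));
    }

    static struct ftrace_ops my_ops = {
            .func  = my_callback,
            .flags = FTRACE_OPS_FL_SAVE_REGS, /* request a full pt_regs */
    };

    /* attached with register_ftrace_function(&my_ops) and detached with
     * unregister_ftrace_function(&my_ops) */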
Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 15 +++++++++++++++ arch/x86/kernel/ftrace_64.S | 11 +++++++++-- include/linux/ftrace.h | 7 ++++++- kernel/trace/Kconfig | 9 +++++++++ 5 files changed, 40 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f6946b81f74a..478526aabe5d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -167,6 +167,7 @@ config X86 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 84b9449be080..e00fe88146e0 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -41,6 +41,21 @@ static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned regs->orig_ax = addr; } +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs * +arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + /* Only when FL_SAVE_REGS is set, cs will be non zero */ + if (!fregs->regs.cs) + return NULL; + return &fregs->regs; +} +#endif + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index ac3d5f22fe64..60e3b64f5ea6 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -140,12 +140,19 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* Stack - skipping return address of ftrace_caller */ + leaq MCOUNT_REG_SIZE+8(%rsp), %rcx + movq %rcx, RSP(%rsp) + SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx - /* regs go into 4th parameter (but make it NULL) */ - movq $0, %rcx + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + + /* Only ops with REGS flag set should have CS register set */ + movq $0, CS(%rsp) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 24e1fa52337d..588ea7023a7a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -90,16 +90,21 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, struct ftrace_ops; +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + struct ftrace_regs { struct pt_regs regs; }; +#define arch_ftrace_get_regs(fregs) (&(fregs)->regs) + +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) { if (!fregs) return NULL; - return &fregs->regs; + return arch_ftrace_get_regs(fregs); } typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6aa36ec73ccb..c9b64dea1216 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,6 +31,15 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS bool +config HAVE_DYNAMIC_FTRACE_WITH_ARGS + bool + help + If this is set, then arguments and stack can be found from + the pt_regs passed into the function callback regs parameter + by default, even without setting the REGS flag in the ftrace_ops. + This allows for use of regs_get_kernel_argument() and + kernel_stack_pointer(). 
+ config HAVE_FTRACE_MCOUNT_RECORD bool help -- cgit From 2860cd8a235375df3c8ec8039d9fe5eb2f658b86 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 17:15:27 -0400 Subject: livepatch: Use the default ftrace_ops instead of REGS when ARGS is available When CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS is available, the ftrace call will be able to set the ip of the calling function. This will improve the performance of live kernel patching where it does not need all the regs to be stored just to change the instruction pointer. If all archs that support live kernel patching also support HAVE_DYNAMIC_FTRACE_WITH_ARGS, then the architecture specific function klp_arch_set_pc() could be made generic. It is possible that an arch can support HAVE_DYNAMIC_FTRACE_WITH_ARGS but not HAVE_DYNAMIC_FTRACE_WITH_REGS and then have access to live patching. Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: live-patching@vger.kernel.org Acked-by: Peter Zijlstra (Intel) Acked-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- arch/powerpc/include/asm/livepatch.h | 4 +++- arch/s390/include/asm/livepatch.h | 5 ++++- arch/x86/include/asm/ftrace.h | 3 +++ arch/x86/include/asm/livepatch.h | 4 ++-- arch/x86/kernel/ftrace_64.S | 4 ++++ include/linux/ftrace.h | 7 +++++++ kernel/livepatch/Kconfig | 2 +- kernel/livepatch/patch.c | 9 +++++---- 8 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 4a3d5d25fed5..ae25e6e72997 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -12,8 +12,10 @@ #include #ifdef CONFIG_LIVEPATCH -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { + struct pt_regs *regs = ftrace_get_regs(fregs); + regs->nip = ip; } diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h index 818612b784cd..d578a8c76676 100644 --- a/arch/s390/include/asm/livepatch.h +++ b/arch/s390/include/asm/livepatch.h @@ -11,10 +11,13 @@ #ifndef ASM_LIVEPATCH_H #define ASM_LIVEPATCH_H +#include #include -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { + struct pt_regs *regs = ftrace_get_regs(fregs); + regs->psw.addr = ip; } diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index e00fe88146e0..9f3130f40807 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -54,6 +54,9 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) return NULL; return &fregs->regs; } + +#define ftrace_instruction_pointer_set(fregs, _ip) \ + do { (fregs)->regs.ip = (_ip); } while (0) #endif #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 1fde1ab6559e..7c5cc6660e4b 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -12,9 +12,9 @@ #include #include -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { - regs->ip = ip; + ftrace_instruction_pointer_set(fregs, ip); } #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 60e3b64f5ea6..0d54099c2a3a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ 
-157,6 +157,10 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + restore_mcount_regs /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 588ea7023a7a..9a8ce28e4485 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -97,6 +97,13 @@ struct ftrace_regs { }; #define arch_ftrace_get_regs(fregs) (&(fregs)->regs) +/* + * ftrace_instruction_pointer_set() is to be defined by the architecture + * if to allow setting of the instruction pointer from the ftrace_regs + * when HAVE_DYNAMIC_FTRACE_WITH_ARGS is set and it supports + * live kernel patching. + */ +#define ftrace_instruction_pointer_set(fregs, ip) do { } while (0) #endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 54102deb50ba..53d51ed619a3 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -6,7 +6,7 @@ config HAVE_LIVEPATCH config LIVEPATCH bool "Kernel Live Patching" - depends on DYNAMIC_FTRACE_WITH_REGS + depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS depends on MODULES depends on SYSFS depends on KALLSYMS_ALL diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index f89f9e7e9b07..e8029aea67f1 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -42,7 +42,6 @@ static void notrace klp_ftrace_handler(unsigned long ip, struct ftrace_ops *fops, struct ftrace_regs *fregs) { - struct pt_regs *regs = ftrace_get_regs(fregs); struct klp_ops *ops; struct klp_func *func; int patch_state; @@ -118,7 +117,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(regs, (unsigned long)func->new_func); + klp_arch_set_pc(fregs, (unsigned long)func->new_func); unlock: preempt_enable_notrace(); @@ -200,8 +199,10 @@ static int klp_patch_func(struct klp_func *func) return -ENOMEM; ops->fops.func = klp_ftrace_handler; - ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | - FTRACE_OPS_FL_DYNAMIC | + ops->fops.flags = FTRACE_OPS_FL_DYNAMIC | +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + FTRACE_OPS_FL_SAVE_REGS | +#endif FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_PERMANENT; -- cgit From e7018751d2e603381ae6028ba4dd21ec45ce38bb Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Fri, 13 Nov 2020 14:12:31 -0300 Subject: usb: host: ehci-mxc: Remove the driver The ehci-mxc driver was only used by i.MX non-DT platforms. Since 5.10-rc1, i.MX has been converted to a DT-only platform and all board files are gone. Remove the ehci-mxc driver as there are no more users at all. 
Acked-by: Alan Stern Signed-off-by: Fabio Estevam Link: https://lore.kernel.org/r/20201113171231.2205-1-festevam@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/Kconfig | 7 - drivers/usb/host/Makefile | 1 - drivers/usb/host/ehci-mxc.c | 213 ----------------------------- include/linux/platform_data/usb-ehci-mxc.h | 14 -- 4 files changed, 235 deletions(-) delete mode 100644 drivers/usb/host/ehci-mxc.c delete mode 100644 include/linux/platform_data/usb-ehci-mxc.h (limited to 'include/linux') diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig index a633e9c15f7e..31e59309da1f 100644 --- a/drivers/usb/host/Kconfig +++ b/drivers/usb/host/Kconfig @@ -213,13 +213,6 @@ config USB_EHCI_FSL help Variation of ARC USB block used in some Freescale chips. -config USB_EHCI_MXC - tristate "Support for Freescale i.MX on-chip EHCI USB controller" - depends on ARCH_MXC || COMPILE_TEST - select USB_EHCI_ROOT_HUB_TT - help - Variation of ARC USB block used in some Freescale chips. - config USB_EHCI_HCD_NPCM7XX tristate "Support for Nuvoton NPCM7XX on-chip EHCI USB controller" depends on (USB_EHCI_HCD && ARCH_NPCM7XX) || COMPILE_TEST diff --git a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile index d03f6f0764d2..c1b08703af10 100644 --- a/drivers/usb/host/Makefile +++ b/drivers/usb/host/Makefile @@ -40,7 +40,6 @@ obj-$(CONFIG_USB_PCI) += pci-quirks.o obj-$(CONFIG_USB_EHCI_HCD) += ehci-hcd.o obj-$(CONFIG_USB_EHCI_PCI) += ehci-pci.o obj-$(CONFIG_USB_EHCI_HCD_PLATFORM) += ehci-platform.o -obj-$(CONFIG_USB_EHCI_MXC) += ehci-mxc.o obj-$(CONFIG_USB_EHCI_HCD_NPCM7XX) += ehci-npcm7xx.o obj-$(CONFIG_USB_EHCI_HCD_OMAP) += ehci-omap.o obj-$(CONFIG_USB_EHCI_HCD_ORION) += ehci-orion.o diff --git a/drivers/usb/host/ehci-mxc.c b/drivers/usb/host/ehci-mxc.c deleted file mode 100644 index dc2676320527..000000000000 --- a/drivers/usb/host/ehci-mxc.c +++ /dev/null @@ -1,213 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Copyright (c) 2008 Sascha Hauer , Pengutronix - * Copyright (c) 2009 Daniel Mack - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ehci.h" - -#define DRIVER_DESC "Freescale On-Chip EHCI Host driver" - -static const char hcd_name[] = "ehci-mxc"; - -#define ULPI_VIEWPORT_OFFSET 0x170 - -struct ehci_mxc_priv { - struct clk *usbclk, *ahbclk, *phyclk; -}; - -static struct hc_driver __read_mostly ehci_mxc_hc_driver; - -static const struct ehci_driver_overrides ehci_mxc_overrides __initconst = { - .extra_priv_size = sizeof(struct ehci_mxc_priv), -}; - -static int ehci_mxc_drv_probe(struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - struct mxc_usbh_platform_data *pdata = dev_get_platdata(dev); - struct usb_hcd *hcd; - struct resource *res; - int irq, ret; - struct ehci_mxc_priv *priv; - struct ehci_hcd *ehci; - - if (!pdata) { - dev_err(dev, "No platform data given, bailing out.\n"); - return -EINVAL; - } - - irq = platform_get_irq(pdev, 0); - if (irq < 0) - return irq; - - hcd = usb_create_hcd(&ehci_mxc_hc_driver, dev, dev_name(dev)); - if (!hcd) - return -ENOMEM; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - hcd->regs = devm_ioremap_resource(dev, res); - if (IS_ERR(hcd->regs)) { - ret = PTR_ERR(hcd->regs); - goto err_alloc; - } - hcd->rsrc_start = res->start; - hcd->rsrc_len = resource_size(res); - - hcd->has_tt = 1; - ehci = hcd_to_ehci(hcd); - priv = (struct ehci_mxc_priv *) ehci->priv; - - /* enable clocks */ - priv->usbclk = devm_clk_get(dev, "ipg"); - if 
(IS_ERR(priv->usbclk)) { - ret = PTR_ERR(priv->usbclk); - goto err_alloc; - } - clk_prepare_enable(priv->usbclk); - - priv->ahbclk = devm_clk_get(dev, "ahb"); - if (IS_ERR(priv->ahbclk)) { - ret = PTR_ERR(priv->ahbclk); - goto err_clk_ahb; - } - clk_prepare_enable(priv->ahbclk); - - /* "dr" device has its own clock on i.MX51 */ - priv->phyclk = devm_clk_get(dev, "phy"); - if (IS_ERR(priv->phyclk)) - priv->phyclk = NULL; - if (priv->phyclk) - clk_prepare_enable(priv->phyclk); - - /* call platform specific init function */ - if (pdata->init) { - ret = pdata->init(pdev); - if (ret) { - dev_err(dev, "platform init failed\n"); - goto err_init; - } - /* platforms need some time to settle changed IO settings */ - mdelay(10); - } - - /* EHCI registers start at offset 0x100 */ - ehci->caps = hcd->regs + 0x100; - ehci->regs = hcd->regs + 0x100 + - HC_LENGTH(ehci, ehci_readl(ehci, &ehci->caps->hc_capbase)); - - /* set up the PORTSCx register */ - ehci_writel(ehci, pdata->portsc, &ehci->regs->port_status[0]); - - /* is this really needed? */ - msleep(10); - - /* Initialize the transceiver */ - if (pdata->otg) { - pdata->otg->io_priv = hcd->regs + ULPI_VIEWPORT_OFFSET; - ret = usb_phy_init(pdata->otg); - if (ret) { - dev_err(dev, "unable to init transceiver, probably missing\n"); - ret = -ENODEV; - goto err_add; - } - ret = otg_set_vbus(pdata->otg->otg, 1); - if (ret) { - dev_err(dev, "unable to enable vbus on transceiver\n"); - goto err_add; - } - } - - platform_set_drvdata(pdev, hcd); - - ret = usb_add_hcd(hcd, irq, IRQF_SHARED); - if (ret) - goto err_add; - - device_wakeup_enable(hcd->self.controller); - return 0; - -err_add: - if (pdata && pdata->exit) - pdata->exit(pdev); -err_init: - if (priv->phyclk) - clk_disable_unprepare(priv->phyclk); - - clk_disable_unprepare(priv->ahbclk); -err_clk_ahb: - clk_disable_unprepare(priv->usbclk); -err_alloc: - usb_put_hcd(hcd); - return ret; -} - -static int ehci_mxc_drv_remove(struct platform_device *pdev) -{ - struct mxc_usbh_platform_data *pdata = dev_get_platdata(&pdev->dev); - struct usb_hcd *hcd = platform_get_drvdata(pdev); - struct ehci_hcd *ehci = hcd_to_ehci(hcd); - struct ehci_mxc_priv *priv = (struct ehci_mxc_priv *) ehci->priv; - - usb_remove_hcd(hcd); - - if (pdata && pdata->exit) - pdata->exit(pdev); - - if (pdata && pdata->otg) - usb_phy_shutdown(pdata->otg); - - clk_disable_unprepare(priv->usbclk); - clk_disable_unprepare(priv->ahbclk); - - if (priv->phyclk) - clk_disable_unprepare(priv->phyclk); - - usb_put_hcd(hcd); - return 0; -} - -MODULE_ALIAS("platform:mxc-ehci"); - -static struct platform_driver ehci_mxc_driver = { - .probe = ehci_mxc_drv_probe, - .remove = ehci_mxc_drv_remove, - .shutdown = usb_hcd_platform_shutdown, - .driver = { - .name = "mxc-ehci", - }, -}; - -static int __init ehci_mxc_init(void) -{ - if (usb_disabled()) - return -ENODEV; - - pr_info("%s: " DRIVER_DESC "\n", hcd_name); - - ehci_init_driver(&ehci_mxc_hc_driver, &ehci_mxc_overrides); - return platform_driver_register(&ehci_mxc_driver); -} -module_init(ehci_mxc_init); - -static void __exit ehci_mxc_cleanup(void) -{ - platform_driver_unregister(&ehci_mxc_driver); -} -module_exit(ehci_mxc_cleanup); - -MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_AUTHOR("Sascha Hauer"); -MODULE_LICENSE("GPL"); diff --git a/include/linux/platform_data/usb-ehci-mxc.h b/include/linux/platform_data/usb-ehci-mxc.h deleted file mode 100644 index ad9794d09bc8..000000000000 --- a/include/linux/platform_data/usb-ehci-mxc.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ 
-#ifndef __INCLUDE_ASM_ARCH_MXC_EHCI_H -#define __INCLUDE_ASM_ARCH_MXC_EHCI_H - -struct mxc_usbh_platform_data { - int (*init)(struct platform_device *pdev); - int (*exit)(struct platform_device *pdev); - - unsigned int portsc; - struct usb_phy *otg; -}; - -#endif /* __INCLUDE_ASM_ARCH_MXC_EHCI_H */ - -- cgit From 56c62080d5b57dac2c2cdd4a83571450e38ca763 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 13 Nov 2020 22:27:04 +0100 Subject: usb: hcd.h: Remove RUN_CONTEXT The last user of RUN_CONTEXT was removed in commit 97c17beb3b668 ("[PATCH] ehci-hcd (1/2): portability (2.4), tasklet,") in the history.git repo. There are no users of RUN_CONTEXT, remove it. Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20201113212704.2243807-1-bigeasy@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 3dbb42c637c1..96281cd50ff6 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -734,10 +734,6 @@ static inline void usbmon_urb_complete(struct usb_bus *bus, struct urb *urb, /* random stuff */ -#define RUN_CONTEXT (in_irq() ? "in_irq" \ - : (in_interrupt() ? "in_interrupt" : "can sleep")) - - /* This rwsem is for use only by the hub driver and ehci-hcd. * Nobody else should touch it. */ -- cgit From 8dedcc3eee3aceb37832176f0a1b03d5687acda3 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 24 Sep 2020 11:41:55 +0300 Subject: iio: core: centralize ioctl() calls to the main chardev The aim of this is to improve the organization of ioctl() calls in the IIO core a bit. Currently the chardev is split across IIO core sub-modules/files. The main chardev has to be able to handle ioctl() calls, and if we need to add buffer ioctl() calls, this would complicate things. The 'industrialio-core.c' file will provide an 'iio_device_ioctl()' which will iterate over a list of ioctls registered with the IIO device. These can be event ioctl() or buffer ioctl() calls, or something else. Each ioctl() handler will have to return an IIO_IOCTL_UNHANDLED code (which is a positive 1) if the ioctl() did not handle the call in any way. This eliminates any potential ambiguities about negative error codes, which should fail the call altogether. If any ioctl() returns 0, the call is considered serviced successfully and the loop exits. This change also moves the handling of the IIO_GET_EVENT_FD_IOCTL command inside 'industrialio-event.c', where this is better suited.
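As a sketch of how a sub-module hooks into this mechanism (the names my_ioctl, my_ioctl_handler and IIO_MY_EXAMPLE_IOCTL below are invented for illustration; only struct iio_ioctl_handler, IIO_IOCTL_UNHANDLED and the register/unregister helpers come from this patch):

    /* assumes the definitions added to drivers/iio/iio_core.h by this patch */
    static long my_ioctl(struct iio_dev *indio_dev, struct file *filp,
                         unsigned int cmd, unsigned long arg)
    {
            if (cmd != IIO_MY_EXAMPLE_IOCTL)
                    /* not ours; let the core try the next registered handler */
                    return IIO_IOCTL_UNHANDLED;

            /* service the command here; return 0 on success, or a negative
             * errno to fail the call altogether */
            return 0;
    }

    static struct iio_ioctl_handler my_ioctl_handler = {
            .ioctl = my_ioctl,
    };

    /* at setup time: */
    iio_device_ioctl_handler_register(indio_dev, &my_ioctl_handler);
    /* at teardown time: */
    iio_device_ioctl_handler_unregister(&my_ioctl_handler);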
This patch is a combination of 2 other patches from an older series: Patch 1: iio: core: add simple centralized mechanism for ioctl() handlers Link: https://lore.kernel.org/linux-iio/20200427131100.50845-6-alexandru.ardelean@analog.com/ Patch 2: iio: core: use new common ioctl() mechanism Link: https://lore.kernel.org/linux-iio/20200427131100.50845-7-alexandru.ardelean@analog.com/ Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20200924084155.99406-1-alexandru.ardelean@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/iio_core.h | 15 ++++++++++- drivers/iio/industrialio-core.c | 56 ++++++++++++++++++++++++++++++---------- drivers/iio/industrialio-event.c | 28 +++++++++++++++++++- include/linux/iio/iio-opaque.h | 2 ++ 4 files changed, 85 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/iio_core.h b/drivers/iio/iio_core.h index fd9a5f1d5e51..fced02cadcc3 100644 --- a/drivers/iio/iio_core.h +++ b/drivers/iio/iio_core.h @@ -17,6 +17,20 @@ struct iio_dev; extern struct device_type iio_device_type; +#define IIO_IOCTL_UNHANDLED 1 +struct iio_ioctl_handler { + struct list_head entry; + long (*ioctl)(struct iio_dev *indio_dev, struct file *filp, + unsigned int cmd, unsigned long arg); +}; + +long iio_device_ioctl(struct iio_dev *indio_dev, struct file *filp, + unsigned int cmd, unsigned long arg); + +void iio_device_ioctl_handler_register(struct iio_dev *indio_dev, + struct iio_ioctl_handler *h); +void iio_device_ioctl_handler_unregister(struct iio_ioctl_handler *h); + int __iio_add_chan_devattr(const char *postfix, struct iio_chan_spec const *chan, ssize_t (*func)(struct device *dev, @@ -74,7 +88,6 @@ static inline void iio_buffer_wakeup_poll(struct iio_dev *indio_dev) {} int iio_device_register_eventset(struct iio_dev *indio_dev); void iio_device_unregister_eventset(struct iio_dev *indio_dev); void iio_device_wakeup_eventset(struct iio_dev *indio_dev); -int iio_event_getfd(struct iio_dev *indio_dev); struct iio_event_interface; bool iio_event_enabled(const struct iio_event_interface *ev_int); diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 0402be441ffb..e53c771d66eb 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1612,6 +1612,7 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv) } dev_set_name(&dev->dev, "iio:device%d", dev->id); INIT_LIST_HEAD(&iio_dev_opaque->buffer_list); + INIT_LIST_HEAD(&iio_dev_opaque->ioctl_handlers); return dev; } @@ -1705,26 +1706,47 @@ static int iio_chrdev_release(struct inode *inode, struct file *filp) return 0; } -/* Somewhat of a cross file organization violation - ioctls here are actually - * event related */ +void iio_device_ioctl_handler_register(struct iio_dev *indio_dev, + struct iio_ioctl_handler *h) +{ + struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); + + list_add_tail(&h->entry, &iio_dev_opaque->ioctl_handlers); +} + +void iio_device_ioctl_handler_unregister(struct iio_ioctl_handler *h) +{ + list_del(&h->entry); +} + static long iio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct iio_dev *indio_dev = filp->private_data; - int __user *ip = (int __user *)arg; - int fd; + struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); + struct iio_ioctl_handler *h; + int ret = -ENODEV; + + mutex_lock(&indio_dev->info_exist_lock); + /** + * The NULL check here is required to prevent crashing when a device + * is being removed while userspace would 
still have open file handles + * to try to access this device. + */ if (!indio_dev->info) - return -ENODEV; - - if (cmd == IIO_GET_EVENT_FD_IOCTL) { - fd = iio_event_getfd(indio_dev); - if (fd < 0) - return fd; - if (copy_to_user(ip, &fd, sizeof(fd))) - return -EFAULT; - return 0; + goto out_unlock; + + ret = -EINVAL; + list_for_each_entry(h, &iio_dev_opaque->ioctl_handlers, entry) { + ret = h->ioctl(indio_dev, filp, cmd, arg); + if (ret != IIO_IOCTL_UNHANDLED) + break; } - return -EINVAL; + +out_unlock: + mutex_unlock(&indio_dev->info_exist_lock); + + return ret; } static const struct file_operations iio_buffer_fileops = { @@ -1841,6 +1863,9 @@ EXPORT_SYMBOL(__iio_device_register); **/ void iio_device_unregister(struct iio_dev *indio_dev) { + struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); + struct iio_ioctl_handler *h, *t; + cdev_device_del(&indio_dev->chrdev, &indio_dev->dev); mutex_lock(&indio_dev->info_exist_lock); @@ -1851,6 +1876,9 @@ void iio_device_unregister(struct iio_dev *indio_dev) indio_dev->info = NULL; + list_for_each_entry_safe(h, t, &iio_dev_opaque->ioctl_handlers, entry) + list_del(&h->entry); + iio_device_wakeup_eventset(indio_dev); iio_buffer_wakeup_poll(indio_dev); diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index 99ba657b8568..7e532117ac55 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -31,6 +31,7 @@ * @flags: file operations related flags including busy flag. * @group: event interface sysfs attribute group * @read_lock: lock to protect kfifo read operations + * @ioctl_handler: handler for event ioctl() calls */ struct iio_event_interface { wait_queue_head_t wait; @@ -40,6 +41,7 @@ struct iio_event_interface { unsigned long flags; struct attribute_group group; struct mutex read_lock; + struct iio_ioctl_handler ioctl_handler; }; bool iio_event_enabled(const struct iio_event_interface *ev_int) @@ -187,7 +189,7 @@ static const struct file_operations iio_event_chrdev_fileops = { .llseek = noop_llseek, }; -int iio_event_getfd(struct iio_dev *indio_dev) +static int iio_event_getfd(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_event_interface *ev_int = iio_dev_opaque->event_interface; @@ -473,6 +475,24 @@ static void iio_setup_ev_int(struct iio_event_interface *ev_int) mutex_init(&ev_int->read_lock); } +static long iio_event_ioctl(struct iio_dev *indio_dev, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int __user *ip = (int __user *)arg; + int fd; + + if (cmd == IIO_GET_EVENT_FD_IOCTL) { + fd = iio_event_getfd(indio_dev); + if (fd < 0) + return fd; + if (copy_to_user(ip, &fd, sizeof(fd))) + return -EFAULT; + return 0; + } + + return IIO_IOCTL_UNHANDLED; +} + static const char *iio_event_group_name = "events"; int iio_device_register_eventset(struct iio_dev *indio_dev) { @@ -526,6 +546,10 @@ int iio_device_register_eventset(struct iio_dev *indio_dev) ev_int->group.attrs[attrn++] = &p->dev_attr.attr; indio_dev->groups[indio_dev->groupcounter++] = &ev_int->group; + ev_int->ioctl_handler.ioctl = iio_event_ioctl; + iio_device_ioctl_handler_register(&iio_dev_opaque->indio_dev, + &ev_int->ioctl_handler); + return 0; error_free_setup_event_lines: @@ -558,6 +582,8 @@ void iio_device_unregister_eventset(struct iio_dev *indio_dev) if (ev_int == NULL) return; + + iio_device_ioctl_handler_unregister(&ev_int->ioctl_handler); iio_free_chan_devattr_list(&ev_int->dev_attr_list); kfree(ev_int->group.attrs); 
kfree(ev_int); diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index f2e94196d31f..07c5a8e52ca8 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -11,6 +11,7 @@ * @channel_attr_list: keep track of automatically created channel * attributes * @chan_attr_group: group for all attrs in base directory + * @ioctl_handlers: ioctl handlers registered with the core handler * @debugfs_dentry: device specific debugfs dentry * @cached_reg_addr: cached register address for debugfs reads * @read_buf: read buffer to be used for the initial reg read @@ -22,6 +23,7 @@ struct iio_dev_opaque { struct list_head buffer_list; struct list_head channel_attr_list; struct attribute_group chan_attr_group; + struct list_head ioctl_handlers; #if defined(CONFIG_DEBUG_FS) struct dentry *debugfs_dentry; unsigned cached_reg_addr; -- cgit From 3347acc6fcd4ee71ad18a9ff9d9dac176b517329 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Fri, 13 Nov 2020 22:51:59 -0800 Subject: compiler.h: fix barrier_data() on clang Commit 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") neglected to copy barrier_data() from compiler-gcc.h into compiler-clang.h. The definition in compiler-gcc.h was really to work around clang's more aggressive optimization, so this broke barrier_data() on clang, and consequently memzero_explicit() as well. For example, this results in at least the memzero_explicit() call in lib/crypto/sha256.c:sha256_transform() being optimized away by clang. Fix this by moving the definition of barrier_data() into compiler.h. Also move the gcc/clang definition of barrier() into compiler.h; __memory_barrier() is icc-specific (and barrier() is already defined using it in compiler-intel.h) and doesn't belong in compiler.h. [rdunlap@infradead.org: fix ALPHA builds when SMP is not enabled] Link: https://lkml.kernel.org/r/20201101231835.4589-1-rdunlap@infradead.org Fixes: 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") Signed-off-by: Arvind Sankar Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Tested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Cc: Link: https://lkml.kernel.org/r/20201014212631.207844-1-nivedita@alum.mit.edu Signed-off-by: Linus Torvalds --- include/asm-generic/barrier.h | 1 + include/linux/compiler-clang.h | 6 ------ include/linux/compiler-gcc.h | 19 ------------------- include/linux/compiler.h | 18 ++++++++++++++++-- 4 files changed, 17 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 798027bb89be..640f09479bdf 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -13,6 +13,7 @@ #ifndef __ASSEMBLY__ +#include #include #ifndef nop diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 230604e7f057..dd7233c48bf3 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -60,12 +60,6 @@ #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif -/* The following are for compatibility with GCC, from compiler-gcc.h, * and may be redefined here because they should not be shared with other * compilers, like ICC.
- */ -#define barrier() __asm__ __volatile__("" : : : "memory") - #if __has_feature(shadow_call_stack) # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5deb37024574..74c6c0486eed 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -15,25 +15,6 @@ # error Sorry, your version of GCC is too old - please use 4.9 or newer. #endif -/* Optimization barrier */ - -/* The "volatile" is due to gcc bugs */ -#define barrier() __asm__ __volatile__("": : :"memory") -/* - * This version is i.e. to prevent dead stores elimination on @ptr - * where gcc and llvm may behave differently when otherwise using - * normal barrier(): while gcc behavior gets along with a normal - * barrier(), llvm needs an explicit input variable to be assumed - * clobbered. The issue is as follows: while the inline asm might - * access any memory it wants, the compiler could have fit all of - * @ptr into memory registers instead, and since @ptr never escaped - * from that, it proved that the inline asm wasn't touching any of - * it. This version works well with both compilers, i.e. we're telling - * the compiler that the inline asm absolutely may see the contents - * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 - */ -#define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") - /* * This macro obfuscates arithmetic on a variable address so that gcc * shouldn't recognize the original var, and make assumptions about it. diff --git a/include/linux/compiler.h b/include/linux/compiler.h index e512f5505dad..b8fe0c23cfff 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -80,11 +80,25 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Optimization barrier */ #ifndef barrier -# define barrier() __memory_barrier() +/* The "volatile" is due to gcc bugs */ +# define barrier() __asm__ __volatile__("": : :"memory") #endif #ifndef barrier_data -# define barrier_data(ptr) barrier() +/* + * This version is i.e. to prevent dead stores elimination on @ptr + * where gcc and llvm may behave differently when otherwise using + * normal barrier(): while gcc behavior gets along with a normal + * barrier(), llvm needs an explicit input variable to be assumed + * clobbered. The issue is as follows: while the inline asm might + * access any memory it wants, the compiler could have fit all of + * @ptr into memory registers instead, and since @ptr never escaped + * from that, it proved that the inline asm wasn't touching any of + * it. This version works well with both compilers, i.e. we're telling + * the compiler that the inline asm absolutely may see the contents + * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 + */ +# define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") #endif /* workaround for GCC PR82365 if needed */ -- cgit From 8b21ca0218d29cc6bb7028125c7e5a10dfb4730c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 Nov 2020 22:52:13 -0800 Subject: mm: memcontrol: fix missing wakeup polling thread When we poll swap.events, we can miss being woken up when a swap event occurs, because the notification is sent on memory.events rather than on swap.events.
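For reference, the affected userspace pattern looks roughly like this (a sketch; the cgroup path is illustrative and assumes cgroup v2):

	int fd = open("/sys/fs/cgroup/test/memory.swap.events", O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI };

	/* Before this fix, poll() could block forever here: the swap
	 * event counters were bumped, but the notification went to
	 * memory.events instead of this file. */
	poll(&pfd, 1, -1);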
Fixes: f3a53a3a1e5b ("mm, memcontrol: implement memory.swap.events") Signed-off-by: Muchun Song Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Roman Gushchin Cc: Michal Hocko Cc: Yafang Shao Cc: Chris Down Cc: Tejun Heo Link: https://lkml.kernel.org/r/20201105161936.98312-1-songmuchun@bytedance.com Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e391e3c56de5..a80c59af2c60 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -900,12 +900,19 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { + bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || + event == MEMCG_SWAP_FAIL; + atomic_long_inc(&memcg->memory_events_local[event]); - cgroup_file_notify(&memcg->events_local_file); + if (!swap_event) + cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); - cgroup_file_notify(&memcg->events_file); + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; -- cgit From 30d6f8c15d2cd877c1f3d47d8a1064649ebe58e2 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 21 Oct 2020 18:21:46 +0200 Subject: clk: add api to get clk consumer from clk_hw clk_register() is deprecated. Using the 'clk' member of struct clk_hw is discouraged. With this constraint, it is difficult for a driver to register clocks using the clk_hw API and then use the clock with the consumer API. This adds a simple helper, clk_hw_get_clk(), to get a struct clk from a struct clk_hw. Like other clk_get() variants, each call to this helper must be balanced with a call to clk_put(). To make life easier on the consumers, a memory managed version is provided as well.
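A sketch of the intended usage from a provider driver (the pll name, priv structure and surrounding variables are illustrative):

	ret = devm_clk_hw_register(dev, &priv->pll_hw);
	if (ret)
		return ret;

	/* act on our own clock through the consumer API */
	clk = clk_hw_get_clk(&priv->pll_hw, NULL);
	if (IS_ERR(clk))
		return PTR_ERR(clk);
	clk_set_rate(clk, 24000000);
	clk_put(clk); /* balance the clk_hw_get_clk() */

	/* or let devres issue the clk_put() on driver detach */
	clk = devm_clk_hw_get_clk(dev, &priv->pll_hw, NULL);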
Cc: Martin Blumenstingl Signed-off-by: Jerome Brunet Link: https://lore.kernel.org/r/20201021162147.563655-3-jbrunet@baylibre.com Tested-by: Kevin Hilman [sboyd@kernel.org: Fix kernel-doc] Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 61 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 5 ++++ 2 files changed, 66 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index f70dc0ef1cdd..48931f442de8 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3667,6 +3667,24 @@ struct clk *clk_hw_create_clk(struct device *dev, struct clk_hw *hw, return clk; } +/** + * clk_hw_get_clk - get clk consumer given an clk_hw + * @hw: clk_hw associated with the clk being consumed + * @con_id: connection ID string on device + * + * Returns: new clk consumer + * This is the function to be used by providers which need + * to get a consumer clk and act on the clock element + * Calls to this function must be balanced with calls clk_put() + */ +struct clk *clk_hw_get_clk(struct clk_hw *hw, const char *con_id) +{ + struct device *dev = hw->core->dev; + + return clk_hw_create_clk(dev, hw, dev_name(dev), con_id); +} +EXPORT_SYMBOL(clk_hw_get_clk); + static int clk_cpy_name(const char **dst_p, const char *src, bool must_exist) { const char *dst; @@ -4187,6 +4205,49 @@ void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw) } EXPORT_SYMBOL_GPL(devm_clk_hw_unregister); +static void devm_clk_release(struct device *dev, void *res) +{ + clk_put(*(struct clk **)res); +} + +/** + * devm_clk_hw_get_clk - resource managed clk_hw_get_clk() + * @dev: device that is registering this clock + * @hw: clk_hw associated with the clk being consumed + * @con_id: connection ID string on device + * + * Managed clk_hw_get_clk(). Clocks got with this function are + * automatically clk_put() on driver detach. See clk_put() + * for more information. 
+ */ +struct clk *devm_clk_hw_get_clk(struct device *dev, struct clk_hw *hw, + const char *con_id) +{ + struct clk *clk; + struct clk **clkp; + + /* This should not happen because it would mean we have drivers + * passing around clk_hw pointers instead of having the caller use + * proper clk_get() style APIs + */ + WARN_ON_ONCE(dev != hw->core->dev); + + clkp = devres_alloc(devm_clk_release, sizeof(*clkp), GFP_KERNEL); + if (!clkp) + return ERR_PTR(-ENOMEM); + + clk = clk_hw_get_clk(hw, con_id); + if (!IS_ERR(clk)) { + *clkp = clk; + devres_add(dev, clkp); + } else { + devres_free(clkp); + } + + return clk; +} +EXPORT_SYMBOL_GPL(devm_clk_hw_get_clk); + /* * clkdev helpers */ diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 03a5de5f99f4..86b707520ec0 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1088,6 +1088,11 @@ static inline struct clk_hw *__clk_get_hw(struct clk *clk) return (struct clk_hw *)clk; } #endif + +struct clk *clk_hw_get_clk(struct clk_hw *hw, const char *con_id); +struct clk *devm_clk_hw_get_clk(struct device *dev, struct clk_hw *hw, + const char *con_id); + unsigned int clk_hw_get_num_parents(const struct clk_hw *hw); struct clk_hw *clk_hw_get_parent(const struct clk_hw *hw); struct clk_hw *clk_hw_get_parent_by_index(const struct clk_hw *hw, -- cgit From 6d30d50d037dfa092f9d5d1fffa348ab4abb7163 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 21 Oct 2020 18:38:46 +0200 Subject: clk: add devm variant of clk_notifier_register Add a memory managed variant of clk_notifier_register() to make life easier on clock consumers using notifiers Signed-off-by: Jerome Brunet Link: https://lore.kernel.org/r/20201021163847.595189-2-jbrunet@baylibre.com Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/clk.h | 10 ++++++++++ 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 48931f442de8..6cf59e3c31b4 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -4395,6 +4395,42 @@ int clk_notifier_unregister(struct clk *clk, struct notifier_block *nb) } EXPORT_SYMBOL_GPL(clk_notifier_unregister); +struct clk_notifier_devres { + struct clk *clk; + struct notifier_block *nb; +}; + +static void devm_clk_notifier_release(struct device *dev, void *res) +{ + struct clk_notifier_devres *devres = res; + + clk_notifier_unregister(devres->clk, devres->nb); +} + +int devm_clk_notifier_register(struct device *dev, struct clk *clk, + struct notifier_block *nb) +{ + struct clk_notifier_devres *devres; + int ret; + + devres = devres_alloc(devm_clk_notifier_release, + sizeof(*devres), GFP_KERNEL); + + if (!devres) + return -ENOMEM; + + ret = clk_notifier_register(clk, nb); + if (!ret) { + devres->clk = clk; + devres->nb = nb; + } else { + devres_free(devres); + } + + return ret; +} +EXPORT_SYMBOL_GPL(devm_clk_notifier_register); + #ifdef CONFIG_OF static void clk_core_reparent_orphans(void) { diff --git a/include/linux/clk.h b/include/linux/clk.h index 7fd6a1febcf4..f53afdf8198b 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -109,6 +109,16 @@ int clk_notifier_register(struct clk *clk, struct notifier_block *nb); */ int clk_notifier_unregister(struct clk *clk, struct notifier_block *nb); +/** + * devm_clk_notifier_register - register a managed rate-change notifier callback + * @dev: device for clock "consumer" + * @clk: clock whose rate we are interested in + * @nb: notifier block with callback function 
pointer + * + * Returns 0 on success, -EERROR otherwise + */ +int devm_clk_notifier_register(struct device *dev, struct clk *clk, struct notifier_block *nb); + /** * clk_get_accuracy - obtain the clock accuracy in ppb (parts per billion) * for a clock source. -- cgit From e6fb7aee486c7fbd4d94f4894feaa6f0424c1740 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 21 Oct 2020 18:38:47 +0200 Subject: clk: meson: g12: use devm variant to register notifiers Until now, nothing was done to unregister the dvfs clock notifiers of the Amlogic g12 SoC family. This is not great but this driver was not really expected to be unloaded. With the ongoing effort to build everything as module for this platform, this needs to be cleanly handled. Signed-off-by: Jerome Brunet Link: https://lore.kernel.org/r/20201021163847.595189-3-jbrunet@baylibre.com Signed-off-by: Stephen Boyd --- drivers/clk/meson/g12a.c | 34 ++++++++++++++++++++-------------- include/linux/clk.h | 10 +++++++++- 2 files changed, 29 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/meson/g12a.c b/drivers/clk/meson/g12a.c index 235dcf72e34a..108e4491b1e2 100644 --- a/drivers/clk/meson/g12a.c +++ b/drivers/clk/meson/g12a.c @@ -5171,8 +5171,8 @@ static int meson_g12a_dvfs_setup_common(struct device *dev, g12a_cpu_clk_postmux0_nb_data.xtal = xtal; notifier_clk = devm_clk_hw_get_clk(dev, &g12a_cpu_clk_postmux0.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, - &g12a_cpu_clk_postmux0_nb_data.nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_postmux0_nb_data.nb); if (ret) { dev_err(dev, "failed to register the cpu_clk_postmux0 notifier\n"); return ret; @@ -5181,7 +5181,8 @@ static int meson_g12a_dvfs_setup_common(struct device *dev, /* Setup clock notifier for cpu_clk_dyn mux */ notifier_clk = devm_clk_hw_get_clk(dev, &g12a_cpu_clk_dyn.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_mux_nb); if (ret) { dev_err(dev, "failed to register the cpu_clk_dyn notifier\n"); return ret; @@ -5207,7 +5208,8 @@ static int meson_g12b_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for cpu_clk mux */ notifier_clk = devm_clk_hw_get_clk(dev, &g12b_cpu_clk.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_mux_nb); if (ret) { dev_err(dev, "failed to register the cpu_clk notifier\n"); return ret; @@ -5216,8 +5218,8 @@ static int meson_g12b_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for sys1_pll */ notifier_clk = devm_clk_hw_get_clk(dev, &g12b_sys1_pll.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, - &g12b_cpu_clk_sys1_pll_nb_data.nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12b_cpu_clk_sys1_pll_nb_data.nb); if (ret) { dev_err(dev, "failed to register the sys1_pll notifier\n"); return ret; @@ -5229,8 +5231,8 @@ static int meson_g12b_dvfs_setup(struct platform_device *pdev) g12b_cpub_clk_postmux0_nb_data.xtal = xtal; notifier_clk = devm_clk_hw_get_clk(dev, &g12b_cpub_clk_postmux0.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, - &g12b_cpub_clk_postmux0_nb_data.nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12b_cpub_clk_postmux0_nb_data.nb); if (ret) { dev_err(dev, "failed to register the cpub_clk_postmux0 notifier\n"); return ret; @@ -5238,7 +5240,8 @@ static int 
meson_g12b_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for cpub_clk_dyn mux */ notifier_clk = devm_clk_hw_get_clk(dev, &g12b_cpub_clk_dyn.hw, "dvfs"); - ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_mux_nb); if (ret) { dev_err(dev, "failed to register the cpub_clk_dyn notifier\n"); return ret; @@ -5246,7 +5249,8 @@ static int meson_g12b_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for cpub_clk mux */ notifier_clk = devm_clk_hw_get_clk(dev, &g12b_cpub_clk.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_mux_nb); if (ret) { dev_err(dev, "failed to register the cpub_clk notifier\n"); return ret; @@ -5254,8 +5258,8 @@ static int meson_g12b_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for sys_pll */ notifier_clk = devm_clk_hw_get_clk(dev, &g12a_sys_pll.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, - &g12b_cpub_clk_sys_pll_nb_data.nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12b_cpub_clk_sys_pll_nb_data.nb); if (ret) { dev_err(dev, "failed to register the sys_pll notifier\n"); return ret; @@ -5277,7 +5281,8 @@ static int meson_g12a_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for cpu_clk mux */ notifier_clk = devm_clk_hw_get_clk(dev, &g12a_cpu_clk.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_mux_nb); if (ret) { dev_err(dev, "failed to register the cpu_clk notifier\n"); return ret; @@ -5285,7 +5290,8 @@ static int meson_g12a_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for sys_pll */ notifier_clk = devm_clk_hw_get_clk(dev, &g12a_sys_pll.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, &g12a_sys_pll_nb_data.nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_sys_pll_nb_data.nb); if (ret) { dev_err(dev, "failed to register the sys_pll notifier\n"); return ret; diff --git a/include/linux/clk.h b/include/linux/clk.h index f53afdf8198b..4ac766dc3daf 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -117,7 +117,8 @@ int clk_notifier_unregister(struct clk *clk, struct notifier_block *nb); * * Returns 0 on success, -EERROR otherwise */ -int devm_clk_notifier_register(struct device *dev, struct clk *clk, struct notifier_block *nb); +int devm_clk_notifier_register(struct device *dev, struct clk *clk, + struct notifier_block *nb); /** * clk_get_accuracy - obtain the clock accuracy in ppb (parts per billion) @@ -196,6 +197,13 @@ static inline int clk_notifier_unregister(struct clk *clk, return -ENOTSUPP; } +static inline int devm_clk_notifier_register(struct device *dev, + struct clk *clk, + struct notifier_block *nb) +{ + return -ENOTSUPP; +} + static inline long clk_get_accuracy(struct clk *clk) { return -ENOTSUPP; -- cgit From f296dcd629aa412a80a53215e46087f53af87f08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 14 Nov 2020 22:01:45 +0100 Subject: genirq: Remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ Commit bb9d812643d8 ("arch: remove tile port") removed the last user of this cruft two years ago... 
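For out-of-tree code that still used this facility, the replacement hinted at by the removed Kconfig help text is the irq domain API, roughly along these lines (a sketch, not a drop-in conversion; np, my_domain_ops and priv are placeholders):

	/* instead of irq_alloc_hwirq()/arch_setup_hwirq(): */
	domain = irq_domain_add_linear(np, nr_hwirqs, &my_domain_ops, priv);
	virq = irq_create_mapping(domain, hwirq);

	/* instead of irq_free_hwirq()/arch_teardown_hwirq(): */
	irq_dispose_mapping(virq);
	irq_domain_remove(domain);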
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87eekvac06.fsf@nanos.tec.linutronix.de --- include/linux/irq.h | 15 --------------- kernel/irq/Kconfig | 5 ----- kernel/irq/irqdesc.c | 51 --------------------------------------------------- 3 files changed, 71 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c54365309e97..79ce314a603b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -954,21 +954,6 @@ static inline void irq_free_desc(unsigned int irq) irq_free_descs(irq, 1); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ -unsigned int irq_alloc_hwirqs(int cnt, int node); -static inline unsigned int irq_alloc_hwirq(int node) -{ - return irq_alloc_hwirqs(1, node); -} -void irq_free_hwirqs(unsigned int from, int cnt); -static inline void irq_free_hwirq(unsigned int irq) -{ - return irq_free_hwirqs(irq, 1); -} -int arch_setup_hwirq(unsigned int irq, int node); -void arch_teardown_hwirq(unsigned int irq); -#endif - #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq); #endif diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 10a5aff4eecc..f2cda6b0057f 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -26,11 +26,6 @@ config GENERIC_IRQ_SHOW_LEVEL config GENERIC_IRQ_EFFECTIVE_AFF_MASK bool -# Facility to allocate a hardware interrupt. This is legacy support -# and should not be used in new code. Use irq domains instead. -config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ - bool - # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1a7723604399..e810eb9906ea 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -810,57 +810,6 @@ unlock: } EXPORT_SYMBOL_GPL(__irq_alloc_descs); -#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ -/** - * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware - * @cnt: number of interrupts to allocate - * @node: node on which to allocate - * - * Returns an interrupt number > 0 or 0, if the allocation fails. - */ -unsigned int irq_alloc_hwirqs(int cnt, int node) -{ - int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL); - - if (irq < 0) - return 0; - - for (i = irq; cnt > 0; i++, cnt--) { - if (arch_setup_hwirq(i, node)) - goto err; - irq_clear_status_flags(i, _IRQ_NOREQUEST); - } - return irq; - -err: - for (i--; i >= irq; i--) { - irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); - arch_teardown_hwirq(i); - } - irq_free_descs(irq, cnt); - return 0; -} -EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); - -/** - * irq_free_hwirqs - Free irq descriptor and cleanup the hardware - * @from: Free from irq number - * @cnt: number of interrupts to free - * - */ -void irq_free_hwirqs(unsigned int from, int cnt) -{ - int i, j; - - for (i = from, j = cnt; j > 0; i++, j--) { - irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); - arch_teardown_hwirq(i); - } - irq_free_descs(from, cnt); -} -EXPORT_SYMBOL_GPL(irq_free_hwirqs); -#endif - /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search -- cgit From e906a546bd8653ed2e7a14cb300fd17952d7f862 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 14 Nov 2020 23:36:28 +0100 Subject: genirq/irqdomain: Make irq_domain_disassociate() static No users outside of the core code. 
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87a6vja7mb.fsf@nanos.tec.linutronix.de --- include/linux/irqdomain.h | 2 -- kernel/irq/irqdomain.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 77bf7d84c673..5701a8b01726 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -387,8 +387,6 @@ extern int irq_domain_associate(struct irq_domain *domain, unsigned int irq, extern void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count); -extern void irq_domain_disassociate(struct irq_domain *domain, - unsigned int irq); extern unsigned int irq_create_mapping(struct irq_domain *host, irq_hw_number_t hwirq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9c9cb8829f7a..3d7463fd6453 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -496,7 +496,7 @@ static void irq_domain_set_mapping(struct irq_domain *domain, } } -void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) +static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); irq_hw_number_t hwirq; -- cgit From c4d51a52c67a1e3a0fa3006e5ec21cdc07649cd6 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 27 Oct 2020 14:39:43 +0000 Subject: sched/wait: Add add_wait_queue_priority() This allows an exclusive wait_queue_entry to be added at the head of the queue, instead of the tail as normal. Thus, it gets to consume events first without allowing non-exclusive waiters to be woken at all. The (first) intended use is for KVM IRQFD, which currently has inconsistent behaviour depending on whether posted interrupts are available or not. If they are, KVM will bypass the eventfd completely and deliver interrupts directly to the appropriate vCPU. If not, events are delivered through the eventfd and userspace will receive them when polling on the eventfd. By using add_wait_queue_priority(), KVM will be able to consistently consume events within the kernel without accidentally exposing them to userspace when they're supposed to be bypassed. This, in turn, means that userspace doesn't have to jump through hoops to avoid listening on the erroneously noisy eventfd and injecting duplicate interrupts. 
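A sketch of how such a priority waiter would be set up (the callback and my_wait are illustrative; returning nonzero from an exclusive waiter stops the wakeup walk):

	static int my_wakeup(wait_queue_entry_t *wait, unsigned int mode,
			     int sync, void *key)
	{
		/* consume the event entirely inside the kernel */
		return 1; /* handled; no other waiter is woken */
	}

	init_waitqueue_func_entry(&my_wait, my_wakeup);
	/* queued at the head, ahead of all non-exclusive waiters */
	add_wait_queue_priority(wq_head, &my_wait);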
Signed-off-by: David Woodhouse Message-Id: <20201027143944.648769-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/wait.h | 12 +++++++++++- kernel/sched/wait.c | 17 ++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 27fb99cfeb02..fe10e8570a52 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -22,6 +22,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int #define WQ_FLAG_BOOKMARK 0x04 #define WQ_FLAG_CUSTOM 0x08 #define WQ_FLAG_DONE 0x10 +#define WQ_FLAG_PRIORITY 0x20 /* * A single wait-queue entry structure: @@ -164,11 +165,20 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_add(&wq_entry->entry, &wq_head->head); + struct list_head *head = &wq_head->head; + struct wait_queue_entry *wq; + + list_for_each_entry(wq, &wq_head->head, entry) { + if (!(wq->flags & WQ_FLAG_PRIORITY)) + break; + head = &wq->entry; + } + list_add(&wq_entry->entry, head); } /* diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 01f5d3020589..183cc6ae68a6 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -37,6 +37,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue } EXPORT_SYMBOL(add_wait_queue_exclusive); +void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -57,7 +68,11 @@ EXPORT_SYMBOL(remove_wait_queue); /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. + * number) then we wake that number of exclusive tasks, and potentially all + * the non-exclusive tasks. Normally, exclusive tasks will be at the end of + * the list and any non-exclusive tasks will be woken first. A priority task + * may be at the head of the list, and can consume the event without any other + * tasks being woken. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns -- cgit From 28f1326710555bbe666f64452d08f2d7dd657cae Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 27 Oct 2020 13:55:21 +0000 Subject: eventfd: Export eventfd_ctx_do_read() Where events are consumed in the kernel, for example by KVM's irqfd_wakeup() and VFIO's virqfd_wakeup(), they currently lack a mechanism to drain the eventfd's counter. 
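With this export, a wakeup handler such as KVM's irqfd_wakeup() can drain the count in place, roughly as follows (a simplified sketch; the actual KVM conversion is a separate patch):

	static int irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
				int sync, void *key)
	{
		struct kvm_kernel_irqfd *irqfd =
			container_of(wait, struct kvm_kernel_irqfd, wait);
		u64 cnt;

		if (key_to_poll(key) & EPOLLIN) {
			/* ctx->wqh.lock is held here, as the helper asserts */
			eventfd_ctx_do_read(irqfd->eventfd, &cnt);
			/* ... inject the interrupt ... */
		}
		return 0;
	}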
Since the wait queue is already locked while the wakeup functions are invoked, all they really need to do is call eventfd_ctx_do_read(). Add a check for the lock, and export it for them. Signed-off-by: David Woodhouse Message-Id: <20201027135523.646811-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- fs/eventfd.c | 5 ++++- include/linux/eventfd.h | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/eventfd.c b/fs/eventfd.c index df466ef81ddd..e265b6dd4f34 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -182,11 +182,14 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait) return events; } -static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { + lockdep_assert_held(&ctx->wqh.lock); + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; ctx->count -= *cnt; } +EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); /** * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index dc4fd8a6644d..fa0a524baed0 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -41,6 +41,7 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); +void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); DECLARE_PER_CPU(int, eventfd_wake_count); @@ -82,6 +83,11 @@ static inline bool eventfd_signal_count(void) return false; } +static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + +} + #endif #endif /* _LINUX_EVENTFD_H */ -- cgit From 2f5414423ef577e9e8bdb227f32d0abdd34e4274 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 6 Nov 2020 05:25:09 -0500 Subject: KVM: remove kvm_clear_guest_page kvm_clear_guest_page is not used anymore after "KVM: X86: Don't track dirty for KVM_SET_[TSS_ADDR|IDENTITY_MAP_ADDR]", except from kvm_clear_guest. We can just inline it in its sole user. 
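For reference, the helper that bounds each iteration of that user's loop is next_segment() from virt/kvm/kvm_main.c; each pass covers at most the remainder of the current page, which is why the inlined call in the diff below writes seg bytes per iteration rather than the full len:

	static int next_segment(unsigned long len, int offset)
	{
		if (len > PAGE_SIZE - offset)
			return PAGE_SIZE - offset;
		else
			return len;
	}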
Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 11 ++--------- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7f2e2a09ebbd..66a4324f329d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -792,7 +792,6 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, offset_in_page(__gpa), v); \ }) -int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2541a17ff1c4..1c7514579861 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2616,23 +2616,16 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, } EXPORT_SYMBOL_GPL(kvm_read_guest_cached); -int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) -{ - const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); - - return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_clear_guest_page); - int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) { + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_clear_guest_page(kvm, gfn, offset, seg); + ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg); if (ret < 0) return ret; offset = 0; -- cgit From 28bd726aa404c0da8fd6852fe69bb4538a103b71 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Sep 2020 21:20:34 -0400 Subject: KVM: Pass in kvm pointer into mark_page_dirty_in_slot() The context will be needed to implement the kvm dirty ring.
Signed-off-by: Peter Xu Message-Id: <20201001012044.5151-5-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 30 +++++++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 66a4324f329d..ca7c1459a8e3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -797,7 +797,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); -void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1c7514579861..68598fdba226 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2196,7 +2196,8 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) } EXPORT_SYMBOL_GPL(kvm_vcpu_map); -static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, +static void __kvm_unmap_gfn(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_host_map *map, struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) @@ -2221,7 +2222,7 @@ static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, #endif if (dirty) - mark_page_dirty_in_slot(memslot, map->gfn); + mark_page_dirty_in_slot(kvm, memslot, map->gfn); if (cache) cache->dirty |= dirty; @@ -2235,7 +2236,7 @@ static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) { - __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, + __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map, cache, dirty, atomic); return 0; } @@ -2243,8 +2244,8 @@ EXPORT_SYMBOL_GPL(kvm_unmap_gfn); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) { - __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, - dirty, false); + __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), + map, NULL, dirty, false); } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); @@ -2418,7 +2419,8 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); -static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, +static int __kvm_write_guest_page(struct kvm *kvm, + struct kvm_memory_slot *memslot, gfn_t gfn, const void *data, int offset, int len) { int r; @@ -2430,7 +2432,7 @@ static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, r = __copy_to_user((void __user *)addr + offset, data, len); if (r) return -EFAULT; - mark_page_dirty_in_slot(memslot, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); return 0; } @@ -2439,7 +2441,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - return __kvm_write_guest_page(slot, gfn, data, offset, len); + return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_write_guest_page); @@ -2448,7 +2450,7 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, { struct kvm_memory_slot *slot = 
kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __kvm_write_guest_page(slot, gfn, data, offset, len); + return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); @@ -2567,7 +2569,7 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, r = __copy_to_user((void __user *)ghc->hva + offset, data, len); if (r) return -EFAULT; - mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); + mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT); return 0; } @@ -2636,7 +2638,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn) +void mark_page_dirty_in_slot(struct kvm *kvm, + struct kvm_memory_slot *memslot, + gfn_t gfn) { if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; @@ -2651,7 +2655,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) struct kvm_memory_slot *memslot; memslot = gfn_to_memslot(kvm, gfn); - mark_page_dirty_in_slot(memslot, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); } EXPORT_SYMBOL_GPL(mark_page_dirty); @@ -2660,7 +2664,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memory_slot *memslot; memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - mark_page_dirty_in_slot(memslot, gfn); + mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); } EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); -- cgit From fb04a1eddb1a65b6588a021bdc132270d5ae48bb Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Sep 2020 21:22:22 -0400 Subject: KVM: X86: Implement ring-based dirty memory tracking This patch is heavily based on previous work from Lei Cao and Paolo Bonzini . [1] KVM currently uses large bitmaps to track dirty memory. These bitmaps are copied to userspace when userspace queries KVM for its dirty page information. The use of bitmaps is mostly sufficient for live migration, as large parts of memory are dirtied from one log-dirty pass to another. However, in a checkpointing system, the number of dirty pages is small and in fact it is often bounded---the VM is paused when it has dirtied a pre-defined number of pages. Traversing a large, sparsely populated bitmap to find set bits is time-consuming, as is copying the bitmap to user-space. A similar issue arises for live migration when the guest memory is huge while the page dirty procedure is trivial. In that case, for each dirty sync we need to pull the whole dirty bitmap to userspace and analyse every bit even if it's mostly zeros. The preferred data structure for the above scenarios is a dense list of guest frame numbers (GFN). This patch series stores the dirty list in kernel memory that can be memory mapped into userspace to allow speedy harvesting. This patch enables the dirty ring for X86 only. However it should be easily extended to other archs as well.
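A userspace harvesting loop following the protocol documented below looks roughly like this (a simplified sketch; real code needs acquire/release ordering on the flags field, and ring_mmap_addr, fetch_index, ring_entries and collect_dirty_page() are placeholders):

	struct kvm_dirty_gfn *gfns = ring_mmap_addr; /* mmap()ed per-vcpu ring */
	struct kvm_dirty_gfn *e;

	for (;;) {
		e = &gfns[fetch_index % ring_entries];
		if (!(e->flags & KVM_DIRTY_GFN_F_DIRTY))
			break; /* nothing more to harvest */
		collect_dirty_page(e->slot, e->offset);
		e->flags |= KVM_DIRTY_GFN_F_RESET; /* 01b -> 1Xb */
		fetch_index++;
	}
	ioctl(vm_fd, KVM_RESET_DIRTY_RINGS, 0);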
[1] https://patchwork.kernel.org/patch/10471409/ Signed-off-by: Lei Cao Signed-off-by: Paolo Bonzini Signed-off-by: Peter Xu Message-Id: <20201001012222.5767-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 93 +++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 3 + arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/Makefile | 3 +- arch/x86/kvm/mmu/mmu.c | 8 ++ arch/x86/kvm/mmu/tdp_mmu.c | 2 +- arch/x86/kvm/vmx/vmx.c | 7 ++ arch/x86/kvm/x86.c | 9 ++ include/linux/kvm_dirty_ring.h | 103 +++++++++++++++++++++ include/linux/kvm_host.h | 13 +++ include/trace/events/kvm.h | 63 +++++++++++++ include/uapi/linux/kvm.h | 53 +++++++++++ virt/kvm/dirty_ring.c | 194 ++++++++++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 113 ++++++++++++++++++++++- 14 files changed, 662 insertions(+), 3 deletions(-) create mode 100644 include/linux/kvm_dirty_ring.h create mode 100644 virt/kvm/dirty_ring.c (limited to 'include/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 81d54fe76a2d..e264ebc35e27 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -262,6 +262,18 @@ The KVM_RUN ioctl (cf.) communicates with userspace via a shared memory region. This ioctl returns the size of that region. See the KVM_RUN documentation for details. +Besides the size of the KVM_RUN communication region, other areas of +the VCPU file descriptor can be mmap-ed, including: + +- if KVM_CAP_COALESCED_MMIO is available, a page at + KVM_COALESCED_MMIO_PAGE_OFFSET * PAGE_SIZE; for historical reasons, + this page is included in the result of KVM_GET_VCPU_MMAP_SIZE. + KVM_CAP_COALESCED_MMIO is not documented yet. + +- if KVM_CAP_DIRTY_LOG_RING is available, a number of pages at + KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE. For more information on + KVM_CAP_DIRTY_LOG_RING, see section 8.3. + 4.6 KVM_SET_MEMORY_REGION ------------------------- @@ -6396,3 +6408,84 @@ When enabled, KVM will disable paravirtual features provided to the guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf (0x40000001). Otherwise, a guest may use the paravirtual features regardless of what has actually been exposed through the CPUID leaf. + + +8.29 KVM_CAP_DIRTY_LOG_RING +--------------------------- + +:Architectures: x86 +:Parameters: args[0] - size of the dirty log ring + +KVM is capable of tracking dirty memory using ring buffers that are +mmaped into userspace; there is one dirty ring per vcpu. + +The dirty ring is available to userspace as an array of +``struct kvm_dirty_gfn``. Each dirty entry is defined as:: + + struct kvm_dirty_gfn { + __u32 flags; + __u32 slot; /* as_id | slot_id */ + __u64 offset; + }; + +The following values are defined for the flags field to define the +current state of the entry:: + + #define KVM_DIRTY_GFN_F_DIRTY BIT(0) + #define KVM_DIRTY_GFN_F_RESET BIT(1) + #define KVM_DIRTY_GFN_F_MASK 0x3 + +Userspace should call KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM +ioctl to enable this capability for the new guest and set the size of +the rings. Enabling the capability is only allowed before creating any +vCPU, and the size of the ring must be a power of two. The larger the +ring buffer, the less likely the ring is full and the VM is forced to +exit to userspace. The optimal size depends on the workload, but it is +recommended that it be at least 64 KiB (4096 entries).
+ +Just like for dirty page bitmaps, the buffer tracks writes to +all user memory regions for which the KVM_MEM_LOG_DIRTY_PAGES flag was +set in KVM_SET_USER_MEMORY_REGION. Once a memory region is registered +with the flag set, userspace can start harvesting dirty pages from the +ring buffer. + +An entry in the ring buffer can be unused (flag bits ``00``), +dirty (flag bits ``01``) or harvested (flag bits ``1X``). The +state machine for the entry is as follows:: + + dirtied harvested reset + 00 -----------> 01 -------------> 1X -------+ + ^ | + | | + +------------------------------------------+ + +To harvest the dirty pages, userspace accesses the mmaped ring buffer +to read the dirty GFNs. If the flags field has the DIRTY bit set (at this stage +the RESET bit must be cleared), then it means this GFN is a dirty GFN. +The userspace should harvest this GFN and mark the flags from state +``01b`` to ``1Xb`` (bit 0 will be ignored by KVM, but bit 1 must be set +to show that this GFN is harvested and waiting for a reset), and move +on to the next GFN. The userspace should continue to do this until the +flags of a GFN have the DIRTY bit cleared, meaning that it has harvested +all the dirty GFNs that were available. + +It's not necessary for userspace to harvest all the dirty GFNs at once. +However it must collect the dirty GFNs in sequence, i.e., the userspace +program cannot skip one dirty GFN to collect the one next to it. + +After processing one or more entries in the ring buffer, userspace +calls the VM ioctl KVM_RESET_DIRTY_RINGS to notify the kernel about +it, so that the kernel will reprotect those collected GFNs. +Therefore, the ioctl must be called *before* reading the content of +the dirty pages. + +The dirty ring can get full. When it happens, the KVM_RUN of the +vcpu will return with exit reason KVM_EXIT_DIRTY_RING_FULL. + +The dirty ring interface has a major difference compared to the +KVM_GET_DIRTY_LOG interface in that, when reading the dirty ring from +userspace, it's still possible that the kernel has not yet flushed the +processor's dirty page buffers into the kernel buffer (with dirty bitmaps, the +flushing is done by the KVM_GET_DIRTY_LOG ioctl). To achieve that, one +needs to kick the vcpu out of KVM_RUN using a signal. The resulting +vmexit ensures that all dirty GFNs are flushed to the dirty rings.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 69e94aa716e9..f002cdb13a0b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1232,6 +1232,7 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); + int (*cpu_dirty_log_size)(void); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; @@ -1744,4 +1745,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define GET_SMSTATE(type, buf, offset) \ (*(type *)((buf) + (offset) - 0x7e00)) +int kvm_cpu_dirty_log_size(void); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 89e5f3d1bba8..8e76d3701db3 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -12,6 +12,7 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 #define DE_VECTOR 0 #define DB_VECTOR 1 diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b804444e16d4..4bd14ab01323 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -10,7 +10,8 @@ endif KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o + $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \ + $(KVM)/dirty_ring.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5bb1939b65d8..12e5cfe0995e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1289,6 +1289,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } +int kvm_cpu_dirty_log_size(void) +{ + if (kvm_x86_ops.cpu_dirty_log_size) + return kvm_x86_ops.cpu_dirty_log_size(); + + return 0; +} + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index ff28a5c6abd6..cffa51c6049e 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -185,7 +185,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, if ((!is_writable_pte(old_spte) || pfn_changed) && is_writable_pte(new_spte)) { slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); - mark_page_dirty_in_slot(slot, gfn); + mark_page_dirty_in_slot(kvm, slot, gfn); } } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 46b32aa43811..2b6d538454a6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7583,6 +7583,11 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } +static int vmx_cpu_dirty_log_size(void) +{ + return enable_pml ? 
PML_ENTITY_NUM : 0; +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .hardware_unsetup = hardware_unsetup, @@ -7712,6 +7717,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .migrate_timers = vmx_migrate_timers, .msr_filter_changed = vmx_msr_filter_changed, + .cpu_dirty_log_size = vmx_cpu_dirty_log_size, }; static __init int hardware_setup(void) @@ -7829,6 +7835,7 @@ static __init int hardware_setup(void) vmx_x86_ops.slot_disable_log_dirty = NULL; vmx_x86_ops.flush_log_dirty = NULL; vmx_x86_ops.enable_log_dirty_pt_masked = NULL; + vmx_x86_ops.cpu_dirty_log_size = NULL; } if (!cpu_has_vmx_preemption_timer()) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4ac726526f8..6c704a597b7c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8754,6 +8754,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; + /* Forbid vmenter if vcpu dirty ring is soft-full */ + if (unlikely(vcpu->kvm->dirty_ring_size && + kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) { + vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; + trace_kvm_dirty_ring_exit(vcpu); + r = 0; + goto out; + } + if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h new file mode 100644 index 000000000000..120e5e90fa1d --- /dev/null +++ b/include/linux/kvm_dirty_ring.h @@ -0,0 +1,103 @@ +#ifndef KVM_DIRTY_RING_H +#define KVM_DIRTY_RING_H + +#include + +/** + * kvm_dirty_ring: KVM internal dirty ring structure + * + * @dirty_index: free running counter that points to the next slot in + * dirty_ring->dirty_gfns, where a new dirty page should go + * @reset_index: free running counter that points to the next dirty page + * in dirty_ring->dirty_gfns for which dirty trap needs to + * be reenabled + * @size: size of the compact list, dirty_ring->dirty_gfns + * @soft_limit: when the number of dirty pages in the list reaches this + * limit, vcpu that owns this ring should exit to userspace + * to allow userspace to harvest all the dirty pages + * @dirty_gfns: the array to keep the dirty gfns + * @index: index of this dirty ring + */ +struct kvm_dirty_ring { + u32 dirty_index; + u32 reset_index; + u32 size; + u32 soft_limit; + struct kvm_dirty_gfn *dirty_gfns; + int index; +}; + +#if (KVM_DIRTY_LOG_PAGE_OFFSET == 0) +/* + * If KVM_DIRTY_LOG_PAGE_OFFSET not defined, kvm_dirty_ring.o should + * not be included as well, so define these nop functions for the arch. 
+ */ +static inline u32 kvm_dirty_ring_get_rsvd_entries(void) +{ + return 0; +} + +static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, + int index, u32 size) +{ + return 0; +} + +static inline struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm) +{ + return NULL; +} + +static inline int kvm_dirty_ring_reset(struct kvm *kvm, + struct kvm_dirty_ring *ring) +{ + return 0; +} + +static inline void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, + u32 slot, u64 offset) +{ +} + +static inline struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, + u32 offset) +{ + return NULL; +} + +static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) +{ +} + +static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) +{ + return true; +} + +#else /* KVM_DIRTY_LOG_PAGE_OFFSET == 0 */ + +u32 kvm_dirty_ring_get_rsvd_entries(void); +int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); +struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm); + +/* + * called with kvm->slots_lock held, returns the number of + * processed pages. + */ +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring); + +/* + * returns =0: successfully pushed + * <0: unable to push, need to wait + */ +void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset); + +/* for use in vm_operations_struct */ +struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset); + +void kvm_dirty_ring_free(struct kvm_dirty_ring *ring); +bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring); + +#endif /* KVM_DIRTY_LOG_PAGE_OFFSET == 0 */ + +#endif /* KVM_DIRTY_RING_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ca7c1459a8e3..864b156391c8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -34,6 +34,7 @@ #include #include +#include #ifndef KVM_MAX_VCPU_ID #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS @@ -319,6 +320,7 @@ struct kvm_vcpu { bool preempted; bool ready; struct kvm_vcpu_arch arch; + struct kvm_dirty_ring dirty_ring; }; static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) @@ -505,6 +507,7 @@ struct kvm { struct srcu_struct irq_srcu; pid_t userspace_pid; unsigned int max_halt_poll_ns; + u32 dirty_ring_size; }; #define kvm_err(fmt, ...) \ @@ -1477,4 +1480,14 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ +/* + * This defines how many reserved entries we want to keep before we + * kick the vcpu to the userspace to avoid dirty ring full. This + * value can be tuned to higher if e.g. PML is enabled on the host. 
+ */ +#define KVM_DIRTY_RING_RSVD_ENTRIES 64 + +/* Max number of entries allowed for each kvm dirty ring */ +#define KVM_DIRTY_RING_MAX_ENTRIES 65536 + #endif diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 26cfb0fa8e7e..49d7d0fe29f6 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -399,6 +399,69 @@ TRACE_EVENT(kvm_halt_poll_ns, #define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \ trace_kvm_halt_poll_ns(false, vcpu_id, new, old) +TRACE_EVENT(kvm_dirty_ring_push, + TP_PROTO(struct kvm_dirty_ring *ring, u32 slot, u64 offset), + TP_ARGS(ring, slot, offset), + + TP_STRUCT__entry( + __field(int, index) + __field(u32, dirty_index) + __field(u32, reset_index) + __field(u32, slot) + __field(u64, offset) + ), + + TP_fast_assign( + __entry->index = ring->index; + __entry->dirty_index = ring->dirty_index; + __entry->reset_index = ring->reset_index; + __entry->slot = slot; + __entry->offset = offset; + ), + + TP_printk("ring %d: dirty 0x%x reset 0x%x " + "slot %u offset 0x%llx (used %u)", + __entry->index, __entry->dirty_index, + __entry->reset_index, __entry->slot, __entry->offset, + __entry->dirty_index - __entry->reset_index) +); + +TRACE_EVENT(kvm_dirty_ring_reset, + TP_PROTO(struct kvm_dirty_ring *ring), + TP_ARGS(ring), + + TP_STRUCT__entry( + __field(int, index) + __field(u32, dirty_index) + __field(u32, reset_index) + ), + + TP_fast_assign( + __entry->index = ring->index; + __entry->dirty_index = ring->dirty_index; + __entry->reset_index = ring->reset_index; + ), + + TP_printk("ring %d: dirty 0x%x reset 0x%x (used %u)", + __entry->index, __entry->dirty_index, __entry->reset_index, + __entry->dirty_index - __entry->reset_index) +); + +TRACE_EVENT(kvm_dirty_ring_exit, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + + TP_STRUCT__entry( + __field(int, vcpu_id) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + ), + + TP_printk("vcpu %d", __entry->vcpu_id) +); + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 204afbe1240e..886802b8ffba 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -250,6 +250,7 @@ struct kvm_hyperv_exit { #define KVM_EXIT_ARM_NISV 28 #define KVM_EXIT_X86_RDMSR 29 #define KVM_EXIT_X86_WRMSR 30 +#define KVM_EXIT_DIRTY_RING_FULL 31 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -1054,6 +1055,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #define KVM_CAP_SYS_HYPERV_CPUID 191 +#define KVM_CAP_DIRTY_LOG_RING 192 #ifdef KVM_CAP_IRQ_ROUTING @@ -1558,6 +1560,9 @@ struct kvm_pv_cmd { /* Available with KVM_CAP_X86_MSR_FILTER */ #define KVM_X86_SET_MSR_FILTER _IOW(KVMIO, 0xc6, struct kvm_msr_filter) +/* Available with KVM_CAP_DIRTY_LOG_RING */ +#define KVM_RESET_DIRTY_RINGS _IO(KVMIO, 0xc7) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ @@ -1711,4 +1716,52 @@ struct kvm_hyperv_eventfd { #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) #define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) +/* + * Arch needs to define the macro after implementing the dirty ring + * feature. KVM_DIRTY_LOG_PAGE_OFFSET should be defined as the + * starting page offset of the dirty ring structures. 
+ */ +#ifndef KVM_DIRTY_LOG_PAGE_OFFSET +#define KVM_DIRTY_LOG_PAGE_OFFSET 0 +#endif + +/* + * KVM dirty GFN flags, defined as: + * + * |---------------+---------------+--------------| + * | bit 1 (reset) | bit 0 (dirty) | Status | + * |---------------+---------------+--------------| + * | 0 | 0 | Invalid GFN | + * | 0 | 1 | Dirty GFN | + * | 1 | X | GFN to reset | + * |---------------+---------------+--------------| + * + * Lifecycle of a dirty GFN goes like: + * + * dirtied harvested reset + * 00 -----------> 01 -------------> 1X -------+ + * ^ | + * | | + * +------------------------------------------+ + * + * The userspace program is only responsible for the 01->1X state + * conversion after harvesting an entry. Also, it must not skip any + * dirty bits, so that dirty bits are always harvested in sequence. + */ +#define KVM_DIRTY_GFN_F_DIRTY BIT(0) +#define KVM_DIRTY_GFN_F_RESET BIT(1) +#define KVM_DIRTY_GFN_F_MASK 0x3 + +/* + * KVM dirty rings should be mapped at KVM_DIRTY_LOG_PAGE_OFFSET of + * per-vcpu mmaped regions as an array of struct kvm_dirty_gfn. The + * size of the gfn buffer is decided by the first argument when + * enabling KVM_CAP_DIRTY_LOG_RING. + */ +struct kvm_dirty_gfn { + __u32 flags; + __u32 slot; + __u64 offset; +}; + #endif /* __LINUX_KVM_H */ diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c new file mode 100644 index 000000000000..9d01299563ee --- /dev/null +++ b/virt/kvm/dirty_ring.c @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM dirty ring implementation + * + * Copyright 2019 Red Hat, Inc. + */ +#include +#include +#include +#include +#include + +int __weak kvm_cpu_dirty_log_size(void) +{ + return 0; +} + +u32 kvm_dirty_ring_get_rsvd_entries(void) +{ + return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(); +} + +static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring) +{ + return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index); +} + +bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) +{ + return kvm_dirty_ring_used(ring) >= ring->soft_limit; +} + +static bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring) +{ + return kvm_dirty_ring_used(ring) >= ring->size; +} + +struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + WARN_ON_ONCE(vcpu->kvm != kvm); + + return &vcpu->dirty_ring; +} + +static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) +{ + struct kvm_memory_slot *memslot; + int as_id, id; + + as_id = slot >> 16; + id = (u16)slot; + + if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + return; + + memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id); + + if (!memslot || (offset + __fls(mask)) >= memslot->npages) + return; + + spin_lock(&kvm->mmu_lock); + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); + spin_unlock(&kvm->mmu_lock); +} + +int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) +{ + ring->dirty_gfns = vmalloc(size); + if (!ring->dirty_gfns) + return -ENOMEM; + memset(ring->dirty_gfns, 0, size); + + ring->size = size / sizeof(struct kvm_dirty_gfn); + ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(); + ring->dirty_index = 0; + ring->reset_index = 0; + ring->index = index; + + return 0; +} + +static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn) +{ + gfn->flags = 0; +} + +static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) +{ + gfn->flags = KVM_DIRTY_GFN_F_DIRTY; +} + +static inline 
bool kvm_dirty_gfn_invalid(struct kvm_dirty_gfn *gfn) +{ + return gfn->flags == 0; +} + +static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn) +{ + return gfn->flags & KVM_DIRTY_GFN_F_RESET; +} + +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring) +{ + u32 cur_slot, next_slot; + u64 cur_offset, next_offset; + unsigned long mask; + int count = 0; + struct kvm_dirty_gfn *entry; + bool first_round = true; + + /* This is only needed to make compilers happy */ + cur_slot = cur_offset = mask = 0; + + while (true) { + entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)]; + + if (!kvm_dirty_gfn_harvested(entry)) + break; + + next_slot = READ_ONCE(entry->slot); + next_offset = READ_ONCE(entry->offset); + + /* Update the flags to reflect that this GFN is reset */ + kvm_dirty_gfn_set_invalid(entry); + + ring->reset_index++; + count++; + /* + * Try to coalesce the reset operations when the guest is + * scanning pages in the same slot. + */ + if (!first_round && next_slot == cur_slot) { + s64 delta = next_offset - cur_offset; + + if (delta >= 0 && delta < BITS_PER_LONG) { + mask |= 1ull << delta; + continue; + } + + /* Backwards visit, careful about overflows! */ + if (delta > -BITS_PER_LONG && delta < 0 && + (mask << -delta >> -delta) == mask) { + cur_offset = next_offset; + mask = (mask << -delta) | 1; + continue; + } + } + kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); + cur_slot = next_slot; + cur_offset = next_offset; + mask = 1; + first_round = false; + } + + kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); + + trace_kvm_dirty_ring_reset(ring); + + return count; +} + +void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset) +{ + struct kvm_dirty_gfn *entry; + + /* It should never get full */ + WARN_ON_ONCE(kvm_dirty_ring_full(ring)); + + entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)]; + + entry->slot = slot; + entry->offset = offset; + /* + * Make sure the data is filled in before we publish this to + * the userspace program. There's no paired kernel-side reader. + */ + smp_wmb(); + kvm_dirty_gfn_set_dirtied(entry); + ring->dirty_index++; + trace_kvm_dirty_ring_push(ring, slot, offset); +} + +struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset) +{ + return vmalloc_to_page((void *)ring->dirty_gfns + offset * PAGE_SIZE); +} + +void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) +{ + vfree(ring->dirty_gfns); + ring->dirty_gfns = NULL; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 68598fdba226..78ef414512bf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -63,6 +63,8 @@ #define CREATE_TRACE_POINTS #include +#include + /* Worst case buffer size needed for holding an integer. 
*/ #define ITOA_MAX_LEN 12 @@ -415,6 +417,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { + kvm_dirty_ring_free(&vcpu->dirty_ring); kvm_arch_vcpu_destroy(vcpu); /* @@ -2644,8 +2647,13 @@ void mark_page_dirty_in_slot(struct kvm *kvm, { if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; + u32 slot = (memslot->as_id << 16) | memslot->id; - set_bit_le(rel_gfn, memslot->dirty_bitmap); + if (kvm->dirty_ring_size) + kvm_dirty_ring_push(kvm_dirty_ring_get(kvm), + slot, rel_gfn); + else + set_bit_le(rel_gfn, memslot->dirty_bitmap); } } EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); @@ -3005,6 +3013,17 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); +static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) +{ +#if KVM_DIRTY_LOG_PAGE_OFFSET > 0 + return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) && + (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET + + kvm->dirty_ring_size / PAGE_SIZE); +#else + return false; +#endif +} + static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) { struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; @@ -3020,6 +3039,10 @@ static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif + else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff)) + page = kvm_dirty_ring_get_page( + &vcpu->dirty_ring, + vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET); else return kvm_arch_vcpu_fault(vcpu, vmf); get_page(page); @@ -3033,6 +3056,14 @@ static const struct vm_operations_struct kvm_vcpu_vm_ops = { static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) { + struct kvm_vcpu *vcpu = file->private_data; + unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + + if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) || + kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) && + ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED))) + return -EINVAL; + vma->vm_ops = &kvm_vcpu_vm_ops; return 0; } @@ -3126,6 +3157,13 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_free_run_page; + if (kvm->dirty_ring_size) { + r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, + id, kvm->dirty_ring_size); + if (r) + goto arch_vcpu_destroy; + } + mutex_lock(&kvm->lock); if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; @@ -3159,6 +3197,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) unlock_vcpu_destroy: mutex_unlock(&kvm->lock); + kvm_dirty_ring_free(&vcpu->dirty_ring); +arch_vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); vcpu_free_run_page: free_page((unsigned long)vcpu->run); @@ -3631,12 +3671,78 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; + case KVM_CAP_DIRTY_LOG_RING: +#if KVM_DIRTY_LOG_PAGE_OFFSET > 0 + return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); +#else + return 0; +#endif default: break; } return kvm_vm_ioctl_check_extension(kvm, arg); } +static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) +{ + int r; + + if (!KVM_DIRTY_LOG_PAGE_OFFSET) + return -EINVAL; + + /* the size should be power of 2 */ + if (!size || (size & (size - 1))) + return -EINVAL; + + /* Should be bigger to keep the reserved entries, or a page */ + if (size < kvm_dirty_ring_get_rsvd_entries() * + sizeof(struct kvm_dirty_gfn) || size < 
PAGE_SIZE) + return -EINVAL; + + if (size > KVM_DIRTY_RING_MAX_ENTRIES * + sizeof(struct kvm_dirty_gfn)) + return -E2BIG; + + /* The ring size may only be set once */ + if (kvm->dirty_ring_size) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (kvm->created_vcpus) { + /* The value cannot be changed once vcpus are created */ + r = -EINVAL; + } else { + kvm->dirty_ring_size = size; + r = 0; + } + + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + int cleared = 0; + + if (!kvm->dirty_ring_size) + return -EINVAL; + + mutex_lock(&kvm->slots_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) + cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring); + + mutex_unlock(&kvm->slots_lock); + + if (cleared) + kvm_flush_remote_tlbs(kvm); + + return cleared; +} + int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -3667,6 +3773,8 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, kvm->max_halt_poll_ns = cap->args[0]; return 0; } + case KVM_CAP_DIRTY_LOG_RING: + return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); default: return kvm_vm_ioctl_enable_cap(kvm, cap); } @@ -3851,6 +3959,9 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_CHECK_EXTENSION: r = kvm_vm_ioctl_check_extension_generic(kvm, arg); break; + case KVM_RESET_DIRTY_RINGS: + r = kvm_vm_ioctl_reset_dirty_pages(kvm); + break; default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } -- cgit From 044c59c409b7fd753707dc437890e94d2b0bd819 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Sep 2020 21:22:26 -0400 Subject: KVM: Don't allocate dirty bitmap if dirty ring is enabled Because the kvm dirty ring and the kvm dirty log are used in a mutually exclusive way, let's avoid creating the dirty_bitmap when the kvm dirty ring is enabled. At the same time, since the dirty_bitmap is now conditionally created, we can't use it as a sign of "whether this memory slot enabled dirty tracking". Change such users to check against the kvm memory slot flags instead. Note that a kvm memory slot can still end up with its dirty_bitmap allocated: if the memory slots are created before the dirty rings are enabled and with the dirty tracking capability set, they'll still have the dirty_bitmap. However, this should not hurt much (e.g., the bitmaps will always be freed if they are there), and real users normally won't trigger it, because the dirty bit tracking flag is in most cases only applied to kvm slots right before migration starts, which is far later than kvm initialization (VM start). 
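For context, the ring interface this series adds can be driven from userspace roughly as follows. This is an illustrative, untested sketch based only on the uapi shown above: the fd handling, the cursor bookkeeping, the omitted error checks and the omitted memory barriers (the kernel publishes entries with an smp_wmb(), so a real VMM would read the flags with acquire semantics) are all assumptions, not part of the patches.

#include <linux/kvm.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

/* Enable the ring on the VM fd; ring_bytes must be a power of two. */
static int enable_dirty_ring(int vm_fd, uint32_t ring_bytes)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_DIRTY_LOG_RING;
	cap.args[0] = ring_bytes;
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

/* The ring sits KVM_DIRTY_LOG_PAGE_OFFSET pages into each vcpu mmap area. */
static struct kvm_dirty_gfn *map_dirty_ring(int vcpu_fd, uint32_t ring_bytes)
{
	return mmap(NULL, ring_bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
		    vcpu_fd, (off_t)KVM_DIRTY_LOG_PAGE_OFFSET * getpagesize());
}

/*
 * Harvest dirty entries strictly in order (the uapi comment above
 * forbids skipping), then ask KVM to re-protect the harvested pages.
 * nentries is ring_bytes / sizeof(struct kvm_dirty_gfn).
 */
static void harvest(int vm_fd, struct kvm_dirty_gfn *ring,
		    uint32_t nentries, uint32_t *cursor)
{
	while (ring[*cursor % nentries].flags & KVM_DIRTY_GFN_F_DIRTY) {
		struct kvm_dirty_gfn *e = &ring[*cursor % nentries];

		/* ... record e->slot and e->offset for migration ... */
		e->flags |= KVM_DIRTY_GFN_F_RESET;	/* 01 -> 1X */
		(*cursor)++;
	}
	ioctl(vm_fd, KVM_RESET_DIRTY_RINGS, 0);
}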
Signed-off-by: Peter Xu Message-Id: <20201001012226.5868-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- include/linux/kvm_host.h | 5 +++++ virt/kvm/kvm_main.c | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 12e5cfe0995e..5dfe0ede0e81 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -820,7 +820,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return NULL; - if (no_dirty_log && slot->dirty_bitmap) + if (no_dirty_log && kvm_slot_dirty_track_enabled(slot)) return NULL; return slot; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 864b156391c8..f3b1013fb22c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -351,6 +351,11 @@ struct kvm_memory_slot { u16 as_id; }; +static inline bool kvm_slot_dirty_track_enabled(struct kvm_memory_slot *slot) +{ + return slot->flags & KVM_MEM_LOG_DIRTY_PAGES; +} + static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) { return ALIGN(memslot->npages, BITS_PER_LONG) / 8; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 110aa5cc0c93..3abcb2ce5b7d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1365,7 +1365,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Allocate/free page dirty bitmap as needed */ if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) new.dirty_bitmap = NULL; - else if (!new.dirty_bitmap) { + else if (!new.dirty_bitmap && !kvm->dirty_ring_size) { r = kvm_alloc_dirty_bitmap(&new); if (r) return r; @@ -2657,7 +2657,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn) { - if (memslot && memslot->dirty_bitmap) { + if (memslot && kvm_slot_dirty_track_enabled(memslot)) { unsigned long rel_gfn = gfn - memslot->base_gfn; u32 slot = (memslot->as_id << 16) | memslot->id; -- cgit From 78a56e0494ad29feccd4c54c2b5682721f8cb988 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 4 Nov 2020 15:01:57 -0800 Subject: entry: Fix spelling/typo errors in irq entry code s/reguired/required/ s/Interupts/Interrupts/ s/quiescient/quiescent/ s/assemenbly/assembly/ Signed-off-by: Ira Weiny Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201104230157.3378023-1-ira.weiny@intel.com --- include/linux/entry-common.h | 4 ++-- kernel/entry/common.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 1a128baf3628..aab549026ab8 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -415,7 +415,7 @@ void irqentry_exit_cond_resched(void); * @state: Return value from matching call to irqentry_enter() * * Depending on the return target (kernel/user) this runs the necessary - * preemption and work checks if possible and reguired and returns to + * preemption and work checks if possible and required and returns to * the caller with interrupts disabled and no further work pending. * * This is the last action before returning to the low level ASM code which @@ -438,7 +438,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs); * @regs: Pointer to pt_regs (NMI entry regs) * @irq_state: Return value from matching call to irqentry_nmi_enter() * - * Last action before returning to the low level assmenbly code. 
+ * Last action before returning to the low level assembly code. * * Counterpart to irqentry_nmi_enter(). */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bc75c114c1b3..fa17baadf63e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -304,7 +304,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * If this entry hit the idle task invoke rcu_irq_enter() whether * RCU is watching or not. * - * Interupts can nest when the first interrupt invokes softirq + * Interrupts can nest when the first interrupt invokes softirq * processing on return which enables interrupts. * * Scheduler ticks in the idle task can mark quiescent state and @@ -315,7 +315,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * interrupt to invoke rcu_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interupt and eventually claim - * quiescient state and end grace periods prematurely. + * quiescent state and end grace periods prematurely. * * Unconditionally invoke rcu_irq_enter() so RCU state stays * consistent. -- cgit From 23d89aa0c2192f2d4582198b381d8805492c7925 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Wed, 11 Nov 2020 13:11:18 +0200 Subject: firmware: imx-dsp: Export functions to request/free channels In order to save power, we only need to request a channel when communication with the DSP is active. For this we export the following functions: - imx_dsp_request_channel, gets a channel with a given index - imx_dsp_free_channel, frees a channel with a given index Notice that we still request channels at probe to support devices that do not have PM callbacks implemented. More explanation of why requesting a channel has an effect on power savings: - requesting a mailbox channel will call the mailbox's startup function. - the startup function calls pm_runtime_get_sync, which increments the device usage count and keeps the device active. Specifically, the mailbox clock will always be ON while a mailbox channel is requested. 
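As a usage illustration (not part of the patch), a hypothetical DSP client driver could tie the channel lifetime to runtime PM, so the mailbox clock is only kept on while the DSP is actually in use. Only the two exported functions come from this change; the dsp_client structure and the runtime-PM wiring are assumptions for the sketch.

#include <linux/firmware/imx/dsp.h>
#include <linux/pm_runtime.h>

struct dsp_client {
	struct imx_dsp_ipc *ipc;
	int num_chans;
};

static int dsp_client_runtime_resume(struct device *dev)
{
	struct dsp_client *priv = dev_get_drvdata(dev);
	struct mbox_chan *ch;
	int i;

	/* Re-acquire the channels; this powers the mailbox back up. */
	for (i = 0; i < priv->num_chans; i++) {
		ch = imx_dsp_request_channel(priv->ipc, i);
		if (IS_ERR(ch))
			return PTR_ERR(ch);
	}
	return 0;
}

static int dsp_client_runtime_suspend(struct device *dev)
{
	struct dsp_client *priv = dev_get_drvdata(dev);
	int i;

	/* Dropping the channels lets the mailbox release its clock. */
	for (i = 0; i < priv->num_chans; i++)
		imx_dsp_free_channel(priv->ipc, i);
	return 0;
}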
Signed-off-by: Daniel Baluta Reviewed-by: Paul Olaru Signed-off-by: Shawn Guo --- drivers/firmware/imx/imx-dsp.c | 25 +++++++++++++++++++++++++ include/linux/firmware/imx/dsp.h | 10 ++++++++++ 2 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/imx/imx-dsp.c b/drivers/firmware/imx/imx-dsp.c index b6e95d6d34c0..a6c06d7476c3 100644 --- a/drivers/firmware/imx/imx-dsp.c +++ b/drivers/firmware/imx/imx-dsp.c @@ -60,6 +60,31 @@ static void imx_dsp_handle_rx(struct mbox_client *c, void *msg) } } +struct mbox_chan *imx_dsp_request_channel(struct imx_dsp_ipc *dsp_ipc, int idx) +{ + struct imx_dsp_chan *dsp_chan; + + if (idx >= DSP_MU_CHAN_NUM) + return ERR_PTR(-EINVAL); + + dsp_chan = &dsp_ipc->chans[idx]; + dsp_chan->ch = mbox_request_channel_byname(&dsp_chan->cl, dsp_chan->name); + return dsp_chan->ch; +} +EXPORT_SYMBOL(imx_dsp_request_channel); + +void imx_dsp_free_channel(struct imx_dsp_ipc *dsp_ipc, int idx) +{ + struct imx_dsp_chan *dsp_chan; + + if (idx >= DSP_MU_CHAN_NUM) + return; + + dsp_chan = &dsp_ipc->chans[idx]; + mbox_free_channel(dsp_chan->ch); +} +EXPORT_SYMBOL(imx_dsp_free_channel); + static int imx_dsp_setup_channels(struct imx_dsp_ipc *dsp_ipc) { struct device *dev = dsp_ipc->dev; diff --git a/include/linux/firmware/imx/dsp.h b/include/linux/firmware/imx/dsp.h index 7562099c9e46..4f7895a3b73c 100644 --- a/include/linux/firmware/imx/dsp.h +++ b/include/linux/firmware/imx/dsp.h @@ -55,6 +55,9 @@ static inline void *imx_dsp_get_data(struct imx_dsp_ipc *ipc) int imx_dsp_ring_doorbell(struct imx_dsp_ipc *dsp, unsigned int chan_idx); +struct mbox_chan *imx_dsp_request_channel(struct imx_dsp_ipc *ipc, int idx); +void imx_dsp_free_channel(struct imx_dsp_ipc *ipc, int idx); + #else static inline int imx_dsp_ring_doorbell(struct imx_dsp_ipc *ipc, @@ -63,5 +66,12 @@ static inline int imx_dsp_ring_doorbell(struct imx_dsp_ipc *ipc, return -ENOTSUPP; } +static inline struct mbox_chan *imx_dsp_request_channel(struct imx_dsp_ipc *ipc, int idx) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void imx_dsp_free_channel(struct imx_dsp_ipc *ipc, int idx) { } + #endif #endif /* _IMX_DSP_IPC_H */ -- cgit From cfeeea60af2f01c13b94d57a9bb1291e7bc181da Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 16 Nov 2020 12:57:13 +0200 Subject: bus: ti-sysc: Implement GPMC debug quirk to drop platform data We need to enable the no-reset-on-init quirk for GPMC if the config option CONFIG_OMAP_GPMC_DEBUG is set. Otherwise the GPMC driver code is unable to show the bootloader-configured timings. 
Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 10 ++++++++++ include/linux/platform_data/ti-sysc.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 88a5d22091f3..691cc39bfc5c 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -1383,6 +1383,8 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { SYSC_QUIRK_CLKDM_NOAUTO), SYSC_QUIRK("dwc3", 0x488c0000, 0, 0x10, -ENODEV, 0x500a0200, 0xffffffff, SYSC_QUIRK_CLKDM_NOAUTO), + SYSC_QUIRK("gpmc", 0, 0, 0x10, 0x14, 0x00000060, 0xffffffff, + SYSC_QUIRK_GPMC_DEBUG), SYSC_QUIRK("hdmi", 0, 0, 0x10, -ENODEV, 0x50030200, 0xffffffff, SYSC_QUIRK_OPT_CLKS_NEEDED), SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff, @@ -1818,6 +1820,14 @@ static void sysc_init_module_quirks(struct sysc *ddata) return; } +#ifdef CONFIG_OMAP_GPMC_DEBUG + if (ddata->cfg.quirks & SYSC_QUIRK_GPMC_DEBUG) { + ddata->cfg.quirks |= SYSC_QUIRK_NO_RESET_ON_INIT; + + return; + } +#endif + if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_I2C) { ddata->pre_reset_quirk = sysc_pre_reset_quirk_i2c; ddata->post_reset_quirk = sysc_post_reset_quirk_i2c; diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 240dce553a0b..fafc1beea504 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -50,6 +50,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_QUIRK_GPMC_DEBUG BIT(26) #define SYSC_MODULE_QUIRK_ENA_RESETDONE BIT(25) #define SYSC_MODULE_QUIRK_PRUSS BIT(24) #define SYSC_MODULE_QUIRK_DSS_RESET BIT(23) -- cgit From ead49373d2916080509f51fc6a4ee8f9bc021b9b Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 29 Oct 2020 09:57:16 +0800 Subject: mmc: core: Initial support for SD express card/host In the SD specification v7.10 the SD express card has been added. This new type of removable SD card can be managed via a PCIe/NVMe based interface, while also allowing backwards compatibility with the legacy SD interface. To keep the backwards compatibility, initialization is required to start through the legacy SD interface. If it turns out that both the mmc host and the SD card support the PCIe/NVMe interface, then a switch should be allowed. Therefore, let's introduce some basic support for this type of SD card to the mmc core. The mmc host should set MMC_CAP2_SD_EXP if it supports this interface, and MMC_CAP2_SD_EXP_1_2V if 1.2V is also supported, so as to inform the core about it. To deal with the switch to the PCIe/NVMe interface, the mmc host is required to implement a new host op, ->init_sd_express(). Based on the initial communication between the host and the card, host->ios.timing is set to either MMC_TIMING_SD_EXP or MMC_TIMING_SD_EXP_1_2V, depending on whether 1.2V is supported. In this way, the mmc host can check these values in its ->init_sd_express() op to know how to proceed with the handover. Note that, to manage card insert/removal, the mmc core sticks with using the ->get_cd() callback, which means it's the host's responsibility to make sure it provides valid data, even if the card may be managed by PCIe/NVMe at the moment. As long as the card seems to be present, the mmc core keeps the card powered on. 
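A hedged sketch of what the host-driver side might look like: only the MMC_CAP2_SD_EXP/MMC_CAP2_SD_EXP_1_2V flags, the MMC_TIMING_SD_EXP* values and the ->init_sd_express() callback come from this patch; the "foo" host and its handover stub are invented for illustration.

#include <linux/mmc/host.h>

static int foo_init_sd_express(struct mmc_host *mmc, struct mmc_ios *ios)
{
	/*
	 * The core has negotiated the switch via CMD8; ios->timing tells
	 * us whether 1.2V signalling was agreed on.
	 */
	bool low_voltage = ios->timing == MMC_TIMING_SD_EXP_1_2V;

	/*
	 * A real driver would gate the SD clock here and hand the card's
	 * lanes over to its PCIe/NVMe side; this stub only reports the
	 * negotiated voltage.
	 */
	dev_info(mmc_dev(mmc), "SD express handover, 1.2V=%d\n", low_voltage);
	return 0;
}

static void foo_setup_caps(struct mmc_host *mmc)
{
	/* Opt in; MMC_CAP2_SD_EXP_1_2V only makes sense on top of SD_EXP. */
	mmc->caps2 |= MMC_CAP2_SD_EXP | MMC_CAP2_SD_EXP_1_2V;
}

static const struct mmc_host_ops foo_ops = {
	.init_sd_express = foo_init_sd_express,
};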
Cc: Greg Kroah-Hartman Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Rui Feng Signed-off-by: Ulf Hansson Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/1603936636-3126-1-git-send-email-rui_feng@realsil.com.cn --- drivers/mmc/core/core.c | 15 +++++++++++++-- drivers/mmc/core/host.h | 6 ++++++ drivers/mmc/core/sd_ops.c | 49 +++++++++++++++++++++++++++++++++++++++++++++-- drivers/mmc/core/sd_ops.h | 1 + include/linux/mmc/host.h | 7 +++++++ 5 files changed, 74 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index d42037f0f10d..19f1ee57fb34 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2147,8 +2147,12 @@ static int mmc_rescan_try_freq(struct mmc_host *host, unsigned freq) mmc_go_idle(host); - if (!(host->caps2 & MMC_CAP2_NO_SD)) - mmc_send_if_cond(host, host->ocr_avail); + if (!(host->caps2 & MMC_CAP2_NO_SD)) { + if (mmc_send_if_cond_pcie(host, host->ocr_avail)) + goto out; + if (mmc_card_sd_express(host)) + return 0; + } /* Order's important: probe SDIO, then SD, then MMC */ if (!(host->caps2 & MMC_CAP2_NO_SDIO)) @@ -2163,6 +2167,7 @@ static int mmc_rescan_try_freq(struct mmc_host *host, unsigned freq) if (!mmc_attach_mmc(host)) return 0; +out: mmc_power_off(host); return -EIO; } @@ -2290,6 +2295,12 @@ void mmc_rescan(struct work_struct *work) goto out; } + /* If an SD express card is present, then leave it as is. */ + if (mmc_card_sd_express(host)) { + mmc_release_host(host); + goto out; + } + for (i = 0; i < ARRAY_SIZE(freqs); i++) { unsigned int freq = freqs[i]; if (freq > host->f_max) { diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 5e3b9534ffb2..ba407617ed23 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -77,5 +77,11 @@ static inline bool mmc_card_hs400es(struct mmc_card *card) return card->host->ios.enhanced_strobe; } +static inline bool mmc_card_sd_express(struct mmc_host *host) +{ + return host->ios.timing == MMC_TIMING_SD_EXP || + host->ios.timing == MMC_TIMING_SD_EXP_1_2V; +} + #endif diff --git a/drivers/mmc/core/sd_ops.c b/drivers/mmc/core/sd_ops.c index 22bf528294b9..d61ff811218c 100644 --- a/drivers/mmc/core/sd_ops.c +++ b/drivers/mmc/core/sd_ops.c @@ -158,7 +158,8 @@ int mmc_send_app_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr) return err; } -int mmc_send_if_cond(struct mmc_host *host, u32 ocr) +static int __mmc_send_if_cond(struct mmc_host *host, u32 ocr, u8 pcie_bits, + u32 *resp) { struct mmc_command cmd = {}; int err; @@ -171,7 +172,7 @@ int mmc_send_if_cond(struct mmc_host *host, u32 ocr) * SD 1.0 cards. */ cmd.opcode = SD_SEND_IF_COND; - cmd.arg = ((ocr & 0xFF8000) != 0) << 8 | test_pattern; + cmd.arg = ((ocr & 0xFF8000) != 0) << 8 | pcie_bits << 8 | test_pattern; cmd.flags = MMC_RSP_SPI_R7 | MMC_RSP_R7 | MMC_CMD_BCR; err = mmc_wait_for_cmd(host, &cmd, 0); @@ -186,6 +187,50 @@ int mmc_send_if_cond(struct mmc_host *host, u32 ocr) if (result_pattern != test_pattern) return -EIO; + if (resp) + *resp = cmd.resp[0]; + + return 0; +} + +int mmc_send_if_cond(struct mmc_host *host, u32 ocr) +{ + return __mmc_send_if_cond(host, ocr, 0, NULL); +} + +int mmc_send_if_cond_pcie(struct mmc_host *host, u32 ocr) +{ + u32 resp = 0; + u8 pcie_bits = 0; + int ret; + + if (host->caps2 & MMC_CAP2_SD_EXP) { + /* Probe card for SD express support via PCIe. */ + pcie_bits = 0x10; + if (host->caps2 & MMC_CAP2_SD_EXP_1_2V) + /* Probe also for 1.2V support. 
*/ + pcie_bits = 0x30; + } + + ret = __mmc_send_if_cond(host, ocr, pcie_bits, &resp); + if (ret) + return 0; + + /* Continue with the SD express init, if the card supports it. */ + resp &= 0x3000; + if (pcie_bits && resp) { + if (resp == 0x3000) + host->ios.timing = MMC_TIMING_SD_EXP_1_2V; + else + host->ios.timing = MMC_TIMING_SD_EXP; + + /* + * According to the spec the clock shall also be gated, but + * let's leave this to the host driver for more flexibility. + */ + return host->ops->init_sd_express(host, &host->ios); + } + return 0; } diff --git a/drivers/mmc/core/sd_ops.h b/drivers/mmc/core/sd_ops.h index 2194cabfcfc5..3ba7b3cf4652 100644 --- a/drivers/mmc/core/sd_ops.h +++ b/drivers/mmc/core/sd_ops.h @@ -16,6 +16,7 @@ struct mmc_host; int mmc_app_set_bus_width(struct mmc_card *card, int width); int mmc_send_app_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr); int mmc_send_if_cond(struct mmc_host *host, u32 ocr); +int mmc_send_if_cond_pcie(struct mmc_host *host, u32 ocr); int mmc_send_relative_addr(struct mmc_host *host, unsigned int *rca); int mmc_app_send_scr(struct mmc_card *card); int mmc_sd_switch(struct mmc_card *card, int mode, int group, diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index c079b932330f..01bba36545c5 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -60,6 +60,8 @@ struct mmc_ios { #define MMC_TIMING_MMC_DDR52 8 #define MMC_TIMING_MMC_HS200 9 #define MMC_TIMING_MMC_HS400 10 +#define MMC_TIMING_SD_EXP 11 +#define MMC_TIMING_SD_EXP_1_2V 12 unsigned char signal_voltage; /* signalling voltage (1.8V or 3.3V) */ @@ -173,6 +175,9 @@ struct mmc_host_ops { */ int (*multi_io_quirk)(struct mmc_card *card, unsigned int direction, int blk_size); + + /* Initialize an SD express card, mandatory for MMC_CAP2_SD_EXP. */ + int (*init_sd_express)(struct mmc_host *host, struct mmc_ios *ios); }; struct mmc_cqe_ops { @@ -358,6 +363,8 @@ struct mmc_host { #define MMC_CAP2_HS200_1_2V_SDR (1 << 6) /* can support */ #define MMC_CAP2_HS200 (MMC_CAP2_HS200_1_8V_SDR | \ MMC_CAP2_HS200_1_2V_SDR) +#define MMC_CAP2_SD_EXP (1 << 7) /* SD express via PCIe */ +#define MMC_CAP2_SD_EXP_1_2V (1 << 8) /* SD express 1.2V */ #define MMC_CAP2_CD_ACTIVE_HIGH (1 << 10) /* Card-detect signal active high */ #define MMC_CAP2_RO_ACTIVE_HIGH (1 << 11) /* Write-protect signal active high */ #define MMC_CAP2_NO_PRESCAN_POWERUP (1 << 14) /* Don't power up before scan */ -- cgit From 5afe802132f242f5520d2acac09ea05d31e3c7cf Mon Sep 17 00:00:00 2001 From: Rui Feng Date: Thu, 29 Oct 2020 09:57:48 +0800 Subject: misc: rtsx: Add SD Express mode support for RTS5261 RTS5261 supports SD mode and PCIe/NVMe mode. The workflow is as follows. 1. RTS5261 works in SD mode and sets the MMC_CAP2_SD_EXP flag. 2. If a card is plugged in, the host sends CMD8 to ask about the card's PCIe availability. 3. If the card has PCIe availability and WP is not set, init_sd_express() is invoked and RTS5261 switches to PCIe/NVMe mode. 4. The mmc driver hands the card over to the NVMe driver. 5. If the card is unplugged, RTS5261 switches back to SD mode. 
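For reference, the CMD8 bit juggling behind steps 2 and 3 can be written down as a standalone sketch; the bit positions mirror the mmc core change above (pcie_bits 0x10/0x30 shifted into the argument, response bits 13:12), while the helper names themselves are illustrative.

#include <stdbool.h>
#include <stdint.h>

#define SD_EXPRESS_PCIE_BIT	0x10	/* ask for PCIe availability */
#define SD_EXPRESS_1_2V_BIT	0x20	/* additionally ask for 1.2V */

/* Build the CMD8 query byte that the core shifts into arg bits 15:8. */
static inline uint8_t sd_express_query_bits(bool host_has_1_2v)
{
	return host_has_1_2v ? (SD_EXPRESS_PCIE_BIT | SD_EXPRESS_1_2V_BIT)
			     : SD_EXPRESS_PCIE_BIT;
}

/* Decode the card's R7 response: bits 13:12 echo the card's support. */
static inline bool card_supports_pcie(uint32_t r7)
{
	return (r7 & 0x3000) != 0;
}

static inline bool card_supports_pcie_1_2v(uint32_t r7)
{
	return (r7 & 0x3000) == 0x3000;
}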
Signed-off-by: Rui Feng Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/1603936668-3363-1-git-send-email-rui_feng@realsil.com.cn Signed-off-by: Ulf Hansson --- drivers/misc/cardreader/rts5261.c | 4 ++++ drivers/misc/cardreader/rts5261.h | 23 ----------------------- drivers/misc/cardreader/rtsx_pcr.c | 5 +++++ include/linux/rtsx_pci.h | 23 +++++++++++++++++++++++ 4 files changed, 32 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cardreader/rts5261.c b/drivers/misc/cardreader/rts5261.c index 471961487ff8..536c90d4fd76 100644 --- a/drivers/misc/cardreader/rts5261.c +++ b/drivers/misc/cardreader/rts5261.c @@ -738,8 +738,12 @@ void rts5261_init_params(struct rtsx_pcr *pcr) { struct rtsx_cr_option *option = &pcr->option; struct rtsx_hw_param *hw_param = &pcr->hw_param; + u8 val; pcr->extra_caps = EXTRA_CAPS_SD_SDR50 | EXTRA_CAPS_SD_SDR104; + rtsx_pci_read_register(pcr, RTS5261_FW_STATUS, &val); + if (!(val & RTS5261_EXPRESS_LINK_FAIL_MASK)) + pcr->extra_caps |= EXTRA_CAPS_SD_EXPRESS; pcr->num_slots = 1; pcr->ops = &rts5261_pcr_ops; diff --git a/drivers/misc/cardreader/rts5261.h b/drivers/misc/cardreader/rts5261.h index ebfdd236a553..8d80f0d5d5d6 100644 --- a/drivers/misc/cardreader/rts5261.h +++ b/drivers/misc/cardreader/rts5261.h @@ -65,23 +65,6 @@ #define RTS5261_FW_EXPRESS_TEST_MASK (0x01<<0) #define RTS5261_FW_EA_MODE_MASK (0x01<<5) -/* FW config register */ -#define RTS5261_FW_CFG0 0xFF54 -#define RTS5261_FW_ENTER_EXPRESS (0x01<<0) - -#define RTS5261_FW_CFG1 0xFF55 -#define RTS5261_SYS_CLK_SEL_MCU_CLK (0x01<<7) -#define RTS5261_CRC_CLK_SEL_MCU_CLK (0x01<<6) -#define RTS5261_FAKE_MCU_CLOCK_GATING (0x01<<5) -/*MCU_bus_mode_sel: 0=real 8051 1=fake mcu*/ -#define RTS5261_MCU_BUS_SEL_MASK (0x01<<4) -/*MCU_clock_sel:VerA 00=aux16M 01=aux400K 1x=REFCLK100M*/ -/*MCU_clock_sel:VerB 00=aux400K 01=aux16M 10=REFCLK100M*/ -#define RTS5261_MCU_CLOCK_SEL_MASK (0x03<<2) -#define RTS5261_MCU_CLOCK_SEL_16M (0x01<<2) -#define RTS5261_MCU_CLOCK_GATING (0x01<<1) -#define RTS5261_DRIVER_ENABLE_FW (0x01<<0) - /* FW status register */ #define RTS5261_FW_STATUS 0xFF56 #define RTS5261_EXPRESS_LINK_FAIL_MASK (0x01<<7) @@ -121,12 +104,6 @@ #define RTS5261_DV3318_19 (0x04<<4) #define RTS5261_DV3318_33 (0x07<<4) -#define RTS5261_LDO1_CFG0 0xFF72 -#define RTS5261_LDO1_OCP_THD_MASK (0x07<<5) -#define RTS5261_LDO1_OCP_EN (0x01<<4) -#define RTS5261_LDO1_OCP_LMT_THD_MASK (0x03<<2) -#define RTS5261_LDO1_OCP_LMT_EN (0x01<<1) - /* CRD6603-433 190319 request changed */ #define RTS5261_LDO1_OCP_THD_740 (0x00<<5) #define RTS5261_LDO1_OCP_THD_800 (0x01<<5) diff --git a/drivers/misc/cardreader/rtsx_pcr.c b/drivers/misc/cardreader/rtsx_pcr.c index 5d15607027e9..56f63fa7f646 100644 --- a/drivers/misc/cardreader/rtsx_pcr.c +++ b/drivers/misc/cardreader/rtsx_pcr.c @@ -990,6 +990,11 @@ static irqreturn_t rtsx_pci_isr(int irq, void *dev_id) } else { pcr->card_removed |= SD_EXIST; pcr->card_inserted &= ~SD_EXIST; + if (PCI_PID(pcr) == PID_5261) { + rtsx_pci_write_register(pcr, RTS5261_FW_STATUS, + RTS5261_EXPRESS_LINK_FAIL_MASK, 0); + pcr->extra_caps |= EXTRA_CAPS_SD_EXPRESS; + } } pcr->dma_error_count = 0; } diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index 745f5e73f99a..b47959f48ccd 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -658,6 +658,19 @@ #define PM_WAKE_EN 0x01 #define PM_CTRL4 0xFF47 +#define RTS5261_FW_CFG0 0xFF54 +#define RTS5261_FW_ENTER_EXPRESS (0x01 << 0) + +#define RTS5261_FW_CFG1 0xFF55 +#define RTS5261_SYS_CLK_SEL_MCU_CLK 
(0x01 << 7) +#define RTS5261_CRC_CLK_SEL_MCU_CLK (0x01 << 6) +#define RTS5261_FAKE_MCU_CLOCK_GATING (0x01 << 5) +#define RTS5261_MCU_BUS_SEL_MASK (0x01 << 4) +#define RTS5261_MCU_CLOCK_SEL_MASK (0x03 << 2) +#define RTS5261_MCU_CLOCK_SEL_16M (0x01 << 2) +#define RTS5261_MCU_CLOCK_GATING (0x01 << 1) +#define RTS5261_DRIVER_ENABLE_FW (0x01 << 0) + #define REG_CFG_OOBS_OFF_TIMER 0xFEA6 #define REG_CFG_OOBS_ON_TIMER 0xFEA7 #define REG_CFG_VCM_ON_TIMER 0xFEA8 @@ -701,6 +714,13 @@ #define RTS5260_DVCC_TUNE_MASK 0x70 #define RTS5260_DVCC_33 0x70 +/*RTS5261*/ +#define RTS5261_LDO1_CFG0 0xFF72 +#define RTS5261_LDO1_OCP_THD_MASK (0x07 << 5) +#define RTS5261_LDO1_OCP_EN (0x01 << 4) +#define RTS5261_LDO1_OCP_LMT_THD_MASK (0x03 << 2) +#define RTS5261_LDO1_OCP_LMT_EN (0x01 << 1) + #define LDO_VCC_CFG1 0xFF73 #define LDO_VCC_REF_TUNE_MASK 0x30 #define LDO_VCC_REF_1V2 0x20 @@ -741,6 +761,8 @@ #define RTS5260_AUTOLOAD_CFG4 0xFF7F #define RTS5260_MIMO_DISABLE 0x8A +/*RTS5261*/ +#define RTS5261_AUX_CLK_16M_EN (1 << 5) #define RTS5260_REG_GPIO_CTL0 0xFC1A #define RTS5260_REG_GPIO_MASK 0x01 @@ -1191,6 +1213,7 @@ struct rtsx_pcr { #define EXTRA_CAPS_MMC_HS200 (1 << 4) #define EXTRA_CAPS_MMC_8BIT (1 << 5) #define EXTRA_CAPS_NO_MMC (1 << 7) +#define EXTRA_CAPS_SD_EXPRESS (1 << 8) u32 extra_caps; #define IC_VER_A 0 -- cgit From 6b7b58f425c3359787483479d73c0bb98ffc65b8 Mon Sep 17 00:00:00 2001 From: Rui Feng Date: Tue, 3 Nov 2020 17:54:29 +0800 Subject: mmc: rtsx: Add test mode for RTS5261 This patch adds a test mode for RTS5261. If test mode is set, the reader will switch to SD Express mode mandatorily; this mode is used for factory testing only. Signed-off-by: Rui Feng Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/1604397269-2780-1-git-send-email-rui_feng@realsil.com.cn Signed-off-by: Ulf Hansson --- drivers/misc/cardreader/rts5261.h | 5 ----- drivers/mmc/host/rtsx_pci_sdmmc.c | 19 ++++++++++++++++--- include/linux/rtsx_pci.h | 4 ++++ 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cardreader/rts5261.h b/drivers/misc/cardreader/rts5261.h index 8d80f0d5d5d6..80179353bc46 100644 --- a/drivers/misc/cardreader/rts5261.h +++ b/drivers/misc/cardreader/rts5261.h @@ -60,11 +60,6 @@ /* DMACTL 0xFE2C */ #define RTS5261_DMA_PACK_SIZE_MASK 0xF0 -/* FW config info register */ -#define RTS5261_FW_CFG_INFO0 0xFF50 -#define RTS5261_FW_EXPRESS_TEST_MASK (0x01<<0) -#define RTS5261_FW_EA_MODE_MASK (0x01<<5) - /* FW status register */ #define RTS5261_FW_STATUS 0xFF56 #define RTS5261_EXPRESS_LINK_FAIL_MASK (0x01<<7) diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c index c584070b39fe..e9ea703f08c5 100644 --- a/drivers/mmc/host/rtsx_pci_sdmmc.c +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c @@ -47,6 +47,8 @@ struct realtek_pci_sdmmc { bool using_cookie; }; +static int sdmmc_init_sd_express(struct mmc_host *mmc, struct mmc_ios *ios); + static inline struct device *sdmmc_dev(struct realtek_pci_sdmmc *host) { return &(host->pdev->dev); @@ -898,6 +900,7 @@ static int sd_power_on(struct realtek_pci_sdmmc *host) struct mmc_host *mmc = host->mmc; int err; u32 val; + u8 test_mode; if (host->power_state == SDMMC_POWER_ON) return 0; @@ -925,6 +928,15 @@ static int sd_power_on(struct realtek_pci_sdmmc *host) return err; if (PCI_PID(pcr) == PID_5261) { + /* + * If test mode is set switch to SD Express mandatorily, + * this is only for factory testing. + */ + rtsx_pci_read_register(pcr, RTS5261_FW_CFG_INFO0, &test_mode); + if (test_mode & RTS5261_FW_EXPRESS_TEST_MASK) { + sdmmc_init_sd_express(mmc, NULL); + return 0; + } if (pcr->extra_caps & EXTRA_CAPS_SD_EXPRESS) mmc->caps2 |= MMC_CAP2_SD_EXP | MMC_CAP2_SD_EXP_1_2V; /* @@ -1354,11 +1366,12 @@ static int sdmmc_init_sd_express(struct mmc_host *mmc, struct mmc_ios *ios) RTS5261_AUX_CLK_16M_EN, RTS5261_AUX_CLK_16M_EN); rtsx_pci_write_register(pcr, RTS5261_FW_CFG0, RTS5261_FW_ENTER_EXPRESS, RTS5261_FW_ENTER_EXPRESS); + rtsx_pci_write_register(pcr, RTS5261_FW_CFG1, + RTS5261_MCU_CLOCK_GATING, RTS5261_MCU_CLOCK_GATING); rtsx_pci_write_register(pcr, RTS5261_FW_CFG1, RTS5261_MCU_BUS_SEL_MASK | RTS5261_MCU_CLOCK_SEL_MASK - | RTS5261_MCU_CLOCK_GATING | RTS5261_DRIVER_ENABLE_FW, - RTS5261_MCU_CLOCK_SEL_16M | RTS5261_MCU_CLOCK_GATING - | RTS5261_DRIVER_ENABLE_FW); + | RTS5261_DRIVER_ENABLE_FW, + RTS5261_MCU_CLOCK_SEL_16M | RTS5261_DRIVER_ENABLE_FW); host->eject = true; return 0; } diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index b47959f48ccd..db249e8707f3 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -658,6 +658,10 @@ #define PM_WAKE_EN 0x01 #define PM_CTRL4 0xFF47 +/* FW config info register */ +#define RTS5261_FW_CFG_INFO0 0xFF50 +#define RTS5261_FW_EXPRESS_TEST_MASK (0x01 << 0) +#define RTS5261_FW_EA_MODE_MASK (0x01 << 5) #define RTS5261_FW_CFG0 0xFF54 #define RTS5261_FW_ENTER_EXPRESS (0x01 << 0) -- cgit From 1672617d512880f31d1d43ca0eb0d13d50b8c680 Mon Sep 17 00:00:00 2001 From: Rui Feng Date: Tue, 3 Nov 2020 17:55:12 +0800 Subject: misc: rtsx: Add hardware auto power off for RTS5261 This patch enables hardware auto power off when the card is removed. Signed-off-by: Rui Feng Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/1604397312-2991-1-git-send-email-rui_feng@realsil.com.cn Signed-off-by: Ulf Hansson --- drivers/misc/cardreader/rts5261.c | 9 +++++++++ include/linux/rtsx_pci.h | 3 +++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/misc/cardreader/rts5261.c b/drivers/misc/cardreader/rts5261.c index 0ef6b3e04a8d..2ada973a0f33 100644 --- a/drivers/misc/cardreader/rts5261.c +++ b/drivers/misc/cardreader/rts5261.c @@ -176,6 +176,8 @@ static int rts5261_card_power_on(struct rtsx_pcr *pcr, int card) if (option->ocp_en) rtsx_pci_enable_ocp(pcr); + rtsx_pci_write_register(pcr, REG_CRC_DUMMY_0, + CFG_SD_POW_AUTO_PD, CFG_SD_POW_AUTO_PD); rtsx_pci_write_register(pcr, RTS5261_LDO1_CFG1, RTS5261_LDO1_TUNE_MASK, RTS5261_LDO1_33); @@ -303,6 +305,8 @@ static int rts5261_card_power_off(struct rtsx_pcr *pcr, int card) err = rtsx_pci_write_register(pcr, RTS5261_LDO1233318_POW_CTL, RTS5261_LDO_POWERON_MASK, 0); + rtsx_pci_write_register(pcr, REG_CRC_DUMMY_0, + CFG_SD_POW_AUTO_PD, 0); if (pcr->option.ocp_en) rtsx_pci_disable_ocp(pcr); @@ -475,6 +479,7 @@ static void rts5261_init_from_cfg(struct rtsx_pcr *pcr) static int rts5261_extra_init_hw(struct rtsx_pcr *pcr) { struct rtsx_cr_option *option = &pcr->option; + u32 val; rtsx_pci_write_register(pcr, RTS5261_AUTOLOAD_CFG1, CD_RESUME_EN_MASK, CD_RESUME_EN_MASK); @@ -489,6 +494,10 @@ static int rts5261_extra_init_hw(struct rtsx_pcr *pcr) AUX_CLK_ACTIVE_SEL_MASK, MAC_CKSW_DONE); rtsx_pci_write_register(pcr, L1SUB_CONFIG3, 0xFF, 0); + if (is_version_higher_than(pcr, PID_5261, IC_VER_B)) { + val = rtsx_pci_readl(pcr, RTSX_DUM_REG); + rtsx_pci_writel(pcr, RTSX_DUM_REG, val | 0x1); + } rtsx_pci_write_register(pcr, RTS5261_AUTOLOAD_CFG4, RTS5261_AUX_CLK_16M_EN, 0); diff --git 
a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index db249e8707f3..fcaadc7c9df1 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -82,6 +82,7 @@ #define MS_OC_INT_EN (1 << 23) #define SD_OC_INT_EN (1 << 22) +#define RTSX_DUM_REG 0x1C /* * macros for easy use @@ -1272,6 +1273,8 @@ struct rtsx_pcr { #define PCI_PID(pcr) ((pcr)->pci->device) #define is_version(pcr, pid, ver) \ (CHK_PCI_PID(pcr, pid) && (pcr)->ic_version == (ver)) +#define is_version_higher_than(pcr, pid, ver) \ + (CHK_PCI_PID(pcr, pid) && (pcr)->ic_version > (ver)) #define pcr_dbg(pcr, fmt, arg...) \ dev_dbg(&(pcr)->pci->dev, fmt, ##arg) -- cgit From 13daf48978280ea8bce38f1e0598b913b09f5395 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 9 Nov 2020 22:53:16 +0200 Subject: gpiolib: Replace unsigned by unsigned int Replace unsigned by unsigned int in GPIO library code. Note, legacy API left untouched. Signed-off-by: Andy Shevchenko Acked-by: Linus Walleij Reviewed-by: Hans de Goede Reviewed-by: Mika Westerberg --- drivers/gpio/gpiolib.c | 16 ++++++++-------- include/linux/gpio/consumer.h | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index c980ddcda833..fe31e7f1fb6e 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -211,7 +211,7 @@ static int gpiochip_find_base(int ngpio) int gpiod_get_direction(struct gpio_desc *desc) { struct gpio_chip *gc; - unsigned offset; + unsigned int offset; int ret; gc = gpiod_to_chip(desc); @@ -1333,7 +1333,7 @@ void gpiochip_irq_domain_deactivate(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(gpiochip_irq_domain_deactivate); -static int gpiochip_to_irq(struct gpio_chip *gc, unsigned offset) +static int gpiochip_to_irq(struct gpio_chip *gc, unsigned int offset) { struct irq_domain *domain = gc->irq.domain; @@ -1635,7 +1635,7 @@ static inline void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gc) * @gc: the gpiochip owning the GPIO * @offset: the offset of the GPIO to request for GPIO function */ -int gpiochip_generic_request(struct gpio_chip *gc, unsigned offset) +int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset) { #ifdef CONFIG_PINCTRL if (list_empty(&gc->gpiodev->pin_ranges)) @@ -1651,7 +1651,7 @@ EXPORT_SYMBOL_GPL(gpiochip_generic_request); * @gc: the gpiochip to request the gpio function for * @offset: the offset of the GPIO to free from GPIO function */ -void gpiochip_generic_free(struct gpio_chip *gc, unsigned offset) +void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset) { pinctrl_gpio_free(gc->gpiodev->base + offset); } @@ -1663,7 +1663,7 @@ EXPORT_SYMBOL_GPL(gpiochip_generic_free); * @offset: the offset of the GPIO to apply the configuration * @config: the configuration to be applied */ -int gpiochip_generic_config(struct gpio_chip *gc, unsigned offset, +int gpiochip_generic_config(struct gpio_chip *gc, unsigned int offset, unsigned long config) { return pinctrl_gpio_set_config(gc->gpiodev->base + offset, config); @@ -1993,7 +1993,7 @@ void gpiod_free(struct gpio_desc *desc) * help with diagnostics, and knowing that the signal is used as a GPIO * can help avoid accidentally multiplexing it to another controller. 
*/ -const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned offset) +const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc; @@ -2097,7 +2097,7 @@ static int gpio_set_config(struct gpio_desc *desc, enum pin_config_param mode) { struct gpio_chip *gc = desc->gdev->chip; unsigned long config; - unsigned arg; + unsigned int arg; switch (mode) { case PIN_CONFIG_BIAS_PULL_DOWN: @@ -2353,7 +2353,7 @@ EXPORT_SYMBOL_GPL(gpiod_set_config); * 0 on success, %-ENOTSUPP if the controller doesn't support setting the * debounce time. */ -int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce) +int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce) { unsigned long config; diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 901aab89d025..ef49307611d2 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -158,7 +158,7 @@ int gpiod_set_raw_array_value_cansleep(unsigned int array_size, unsigned long *value_bitmap); int gpiod_set_config(struct gpio_desc *desc, unsigned long config); -int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce); +int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce); int gpiod_set_transitory(struct gpio_desc *desc, bool transitory); void gpiod_toggle_active_low(struct gpio_desc *desc); @@ -481,7 +481,7 @@ static inline int gpiod_set_config(struct gpio_desc *desc, unsigned long config) return -ENOSYS; } -static inline int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce) +static inline int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce) { /* GPIO can never have been requested */ WARN_ON(desc); -- cgit From cc947f2b9c04113d84eeef67cc7c6326e1982019 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2020 10:53:38 +0100 Subject: timers: Make run_local_timers() static No users outside of the timer code. Move the caller below this function to avoid a pointless forward declaration. Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 1 - kernel/time/timer.c | 48 ++++++++++++++++++++++++------------------------ 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index d10bc7e73b41..fda13c9d1256 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -193,7 +193,6 @@ extern int try_to_del_timer_sync(struct timer_list *timer); #define del_singleshot_timer_sync(t) del_timer_sync(t) extern void init_timers(void); -extern void run_local_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index af9ddfbd447d..ebf3b26d2501 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1705,29 +1705,6 @@ void timer_clear_idle(void) } #endif -/* - * Called from the timer interrupt handler to charge one tick to the current - * process. user_tick is 1 if the tick is user time, 0 for system. - */ -void update_process_times(int user_tick) -{ - struct task_struct *p = current; - - PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); - - /* Note: this timer irq context must be accounted for as well. */ - account_process_tick(p, user_tick); - run_local_timers(); - rcu_sched_clock_irq(user_tick); -#ifdef CONFIG_IRQ_WORK - if (in_irq()) - irq_work_tick(); -#endif - scheduler_tick(); - if (IS_ENABLED(CONFIG_POSIX_TIMERS)) - run_posix_cpu_timers(); -} - /** * __run_timers - run all expired timers (if any) on this CPU. 
* @base: the timer vector to be processed. @@ -1777,7 +1754,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) /* * Called by the local, per-CPU timer interrupt on SMP. */ -void run_local_timers(void) +static void run_local_timers(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); @@ -1794,6 +1771,29 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + + PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + rcu_sched_clock_irq(user_tick); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_tick(); +#endif + scheduler_tick(); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(); +} + /* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. -- cgit From 66981c37b3199d293c58f84cf2366e86a06e1a3d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:18:14 +0100 Subject: hrtimer: Fix kernel-doc markups The kernel-doc markup for hrtimer_get_remaining() actually documents __hrtimer_get_remaining(), as it is placed in the C file. In order to properly document hrtimer_get_remaining(), a kernel-doc markup is needed together with the function prototype. So, add a new one, while preserving the existing one, just fixing the function name. The hrtimer_is_queued() prototype has a typo: it uses '=' instead of '-' to split identifier - description, as required by kernel-doc markup. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/9dc87808c2fd07b7e050bafcd033c5ef05808fea.1605521731.git.mchehab+huawei@kernel.org --- include/linux/hrtimer.h | 6 +++++- kernel/time/hrtimer.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 107cedd7019a..bb5e7b0a4274 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -447,6 +447,10 @@ static inline void hrtimer_restart(struct hrtimer *timer) /* Query timers: */ extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); +/** + * hrtimer_get_remaining - get remaining time for the timer + * @timer: the timer to read + */ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { return __hrtimer_get_remaining(timer, false); @@ -458,7 +462,7 @@ extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); /** - * hrtimer_is_queued = check, whether the timer is on one of the queues + * hrtimer_is_queued - check, whether the timer is on one of the queues * @timer: Timer to check * * Returns: True if the timer is queued, false otherwise diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3624b9b5835d..61c39ff68439 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1289,7 +1289,7 @@ int hrtimer_cancel(struct hrtimer *timer) EXPORT_SYMBOL_GPL(hrtimer_cancel); /** - * hrtimer_get_remaining - get remaining time for the timer + * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -- cgit From e00adcadf3af7a8335026d71ab9f0e0a922191ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:11 +0100 Subject: block: add a new set_read_only method Add a new method to allow for driver-specific processing when setting or clearing the block device read-only state. This allows replacing the cumbersome and error-prone override of the whole ioctl implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 5 +++++ include/linux/blkdev.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/block/ioctl.c b/block/ioctl.c index c6d8863f0409..a6fa16b97705 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -389,6 +389,11 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, return ret; if (get_user(n, (int __user *)arg)) return -EFAULT; + if (bdev->bd_disk->fops->set_read_only) { + ret = bdev->bd_disk->fops->set_read_only(bdev, n); + if (ret) + return ret; + } set_device_ro(bdev, n); return 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 639cae2c158b..5c1ba8a8d2bc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1850,6 +1850,7 @@ struct block_device_operations { void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*set_read_only)(struct block_device *bdev, bool ro); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, -- cgit From 98f49b63e84d4ee1a5c327d0b5f4e8699f6c70fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:17 +0100 Subject: block: remove set_device_ro Fold set_device_ro into its only remaining caller. 
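To illustrate the new ->set_read_only() method added above, here is a hypothetical driver hook that refuses to clear the read-only state while the media is physically write protected; only the callback signature comes from the patch, the "foo" driver and its state are invented.

#include <linux/blkdev.h>
#include <linux/module.h>

struct foo_dev {
	bool media_write_protected;
};

static int foo_set_read_only(struct block_device *bdev, bool ro)
{
	struct foo_dev *foo = bdev->bd_disk->private_data;

	/* Clearing RO is refused while the media itself is protected. */
	if (!ro && foo->media_write_protected)
		return -EROFS;
	return 0;
}

static const struct block_device_operations foo_fops = {
	.owner		= THIS_MODULE,
	.set_read_only	= foo_set_read_only,
};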
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 7 ------- block/ioctl.c | 2 +- include/linux/genhd.h | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 9387f050c248..b85db1f2233c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1846,13 +1846,6 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } -void set_device_ro(struct block_device *bdev, int flag) -{ - bdev->bd_part->policy = flag; -} - -EXPORT_SYMBOL(set_device_ro); - void set_disk_ro(struct gendisk *disk, int flag) { struct disk_part_iter piter; diff --git a/block/ioctl.c b/block/ioctl.c index 96cb45447364..04255dc5f3bf 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -371,7 +371,7 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (ret) return ret; } - set_device_ro(bdev, n); + bdev->bd_part->policy = n; return 0; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03da3f603d30..52eb592c874a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -304,7 +304,6 @@ extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); -extern void set_device_ro(struct block_device *bdev, int flag); extern void set_disk_ro(struct gendisk *disk, int flag); static inline int get_disk_ro(struct gendisk *disk) -- cgit From a7cb3d2f09c8405aed59d97a7d02cebea43cd3c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:18 +0100 Subject: block: remove __blkdev_driver_ioctl Just open code it in the few callers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 25 +++++-------------------- drivers/block/pktcdvd.c | 6 ++++-- drivers/md/bcache/request.c | 5 +++-- drivers/md/dm.c | 5 ++++- include/linux/blkdev.h | 2 -- 5 files changed, 16 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/block/ioctl.c b/block/ioctl.c index 04255dc5f3bf..6b785181344f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -219,23 +219,6 @@ static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) } #endif -int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) -{ - struct gendisk *disk = bdev->bd_disk; - - if (disk->fops->ioctl) - return disk->fops->ioctl(bdev, mode, cmd, arg); - - return -ENOTTY; -} -/* - * For the record: _GPL here is only because somebody decided to slap it - * on the previous export. Sheer idiocy, since it wasn't copyrightable - * at all and could be open-coded without any exports by anybody who cares. 
- */ -EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); - #ifdef CONFIG_COMPAT /* * This is the equivalent of compat_ptr_ioctl(), to be used by block @@ -594,10 +577,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); - if (ret == -ENOIOCTLCMD) - return __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (ret != -ENOIOCTLCMD) + return ret; - return ret; + if (!bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */ diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 467dbd06b7cd..ef1c1f094ea4 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2584,9 +2584,11 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, case CDROM_LAST_WRITTEN: case CDROM_SEND_PACKET: case SCSI_IOCTL_SEND_COMMAND: - ret = __blkdev_driver_ioctl(pd->bdev, mode, cmd, arg); + if (!bdev->bd_disk->fops->ioctl) + ret = -ENOTTY; + else + ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); break; - default: pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); ret = -ENOTTY; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 214326383145..afac8d07c1bd 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1230,8 +1230,9 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, if (dc->io_disable) return -EIO; - - return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); + if (!dc->bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return dc->bdev->bd_disk->fops->ioctl(dc->bdev, mode, cmd, arg); } void bch_cached_dev_request_init(struct cached_dev *dc) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c18fc2548518..6db395c3d28b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -570,7 +570,10 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, } } - r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!bdev->bd_disk->fops->ioctl) + r = -ENOTTY; + else + r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); out: dm_unprepare_ioctl(md, srcu_idx); return r; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c1ba8a8d2bc..05b346a68c2e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1867,8 +1867,6 @@ extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t, #define blkdev_compat_ptr_ioctl NULL #endif -extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, - unsigned long); extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -- cgit From a160c6159d4a0cf82f28bc1658a958e278ec3688 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:28 +0100 Subject: block: add an optional probe callback to major_names Add a callback to the major_names array that allows a driver to override how to probe for a dev_t that doesn't currently have a gendisk registered. This will help separate the lookup of the gendisk by dev_t from the probe action for a dev_t that is not currently registered.
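A minimal sketch of the registration side this enables; the "mydrv" names, MYDRV_MAJOR, and the mydrv_add_disk() helper are illustrative assumptions, not from the patch:

#include <linux/genhd.h>

#define MYDRV_MAJOR	240	/* assumption: a locally assigned major */

void mydrv_add_disk(unsigned int minor);	/* hypothetical helper */

static void mydrv_probe(dev_t devt)
{
	/*
	 * Invoked from the block core when someone looks up a device
	 * under MYDRV_MAJOR for which no gendisk is registered yet.
	 */
	mydrv_add_disk(MINOR(devt));
}

static int __init mydrv_init(void)
{
	return __register_blkdev(MYDRV_MAJOR, "mydrv", mydrv_probe);
}

Drivers with no probe action keep calling register_blkdev(major, name), which the patch turns into a wrapper passing a NULL probe callback.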
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 21 ++++++++++++++++++--- include/linux/genhd.h | 5 ++++- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 81017bd3b333..20521163fd06 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -402,6 +402,7 @@ static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; + void (*probe)(dev_t devt); } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); @@ -444,7 +445,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset) * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. */ -int register_blkdev(unsigned int major, const char *name) +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; @@ -483,6 +485,7 @@ int register_blkdev(unsigned int major, const char *name) } p->major = major; + p->probe = probe; strlcpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); @@ -505,8 +508,7 @@ out: mutex_unlock(&major_names_lock); return ret; } - -EXPORT_SYMBOL(register_blkdev); +EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { @@ -1033,6 +1035,19 @@ static ssize_t disk_badblocks_store(struct device *dev, static void request_gendisk_module(dev_t devt) { + unsigned int major = MAJOR(devt); + struct blk_major_name **n; + + mutex_lock(&major_names_lock); + for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { + if ((*n)->major == major && (*n)->probe) { + (*n)->probe(devt); + mutex_unlock(&major_names_lock); + return; + } + } + mutex_unlock(&major_names_lock); + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) /* Make old-style 2.4 aliases work */ request_module("block-major-%d", MAJOR(devt)); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 52eb592c874a..811d0f83c4cf 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -367,7 +367,10 @@ extern void blk_unregister_region(dev_t devt, unsigned long range); #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) -int register_blkdev(unsigned int major, const char *name); +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)); +#define register_blkdev(major, name) \ + __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); void revalidate_disk_size(struct gendisk *disk, bool verbose); -- cgit From d18e8b1bf9e2ee814a7f886a156bf762d52e178b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:29 +0100 Subject: ide: remove ide_{,un}register_region There is no need to ever register the fake gendisk used for ide-tape. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/ide/ide-probe.c | 32 -------------------------------- drivers/ide/ide-tape.c | 2 -- include/linux/ide.h | 3 --- 3 files changed, 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 1ddc45a04418..076d34b38172 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -929,38 +929,6 @@ static struct kobject *ata_probe(dev_t dev, int *part, void *data) return NULL; } -static struct kobject *exact_match(dev_t dev, int *part, void *data) -{ - struct gendisk *p = data; - *part &= (1 << PARTN_BITS) - 1; - return &disk_to_dev(p)->kobj; -} - -static int exact_lock(dev_t dev, void *data) -{ - struct gendisk *p = data; - - if (!get_disk_and_module(p)) - return -1; - return 0; -} - -void ide_register_region(struct gendisk *disk) -{ - blk_register_region(MKDEV(disk->major, disk->first_minor), - disk->minors, NULL, exact_match, exact_lock, disk); -} - -EXPORT_SYMBOL_GPL(ide_register_region); - -void ide_unregister_region(struct gendisk *disk) -{ - blk_unregister_region(MKDEV(disk->major, disk->first_minor), - disk->minors); -} - -EXPORT_SYMBOL_GPL(ide_unregister_region); - void ide_init_disk(struct gendisk *disk, ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 6f26634b22bb..88b96437b22e 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1822,7 +1822,6 @@ static void ide_tape_remove(ide_drive_t *drive) ide_proc_unregister_driver(drive, tape->driver); device_del(&tape->dev); - ide_unregister_region(tape->disk); mutex_lock(&idetape_ref_mutex); put_device(&tape->dev); @@ -2026,7 +2025,6 @@ static int ide_tape_probe(ide_drive_t *drive) "n%s", tape->name); g->fops = &idetape_block_ops; - ide_register_region(g); return 0; diff --git a/include/linux/ide.h b/include/linux/ide.h index 62653769509f..2c300689a51a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1493,9 +1493,6 @@ static inline void ide_acpi_port_init_devices(ide_hwif_t *hwif) { ; } static inline void ide_acpi_set_state(ide_hwif_t *hwif, int on) {} #endif -void ide_register_region(struct gendisk *); -void ide_unregister_region(struct gendisk *); - void ide_check_nien_quirk_list(ide_drive_t *); void ide_undecoded_slave(ide_drive_t *); -- cgit From e418de3abcda8b102f737919e830024d1455938f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:41 +0100 Subject: block: switch gendisk lookup to a simple xarray Now that bdev_map is only used for finding gendisks, we can use a simple xarray instead of the regions tracking structure for it. 
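The heart of the conversion is the plain xarray insert/load pattern; a condensed sketch (identifiers renamed, locking and cleanup trimmed), not the patch itself:

#include <linux/xarray.h>
#include <linux/genhd.h>

static DEFINE_XARRAY(disk_map_sketch);

static int sketch_register(struct gendisk *disk, dev_t first, int minors)
{
	int i;

	for (i = 0; i < minors; i++) {
		/* xa_insert() returns -EBUSY if the slot is already taken */
		int err = xa_insert(&disk_map_sketch, first + i, disk,
				    GFP_KERNEL);
		if (err)
			return err;
	}
	return 0;
}

static struct gendisk *sketch_lookup(dev_t devt)
{
	return xa_load(&disk_map_sketch, devt);	/* NULL if not registered */
}

Every minor maps directly to its gendisk, so the sorted-by-range hash chains and their probe/lock/data indirection can go away.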
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- block/genhd.c | 208 +++++++++----------------------------------------- include/linux/genhd.h | 7 -- 2 files changed, 37 insertions(+), 178 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 20521163fd06..01d146598fe7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,15 +27,7 @@ static struct kobject *block_depr; -struct bdev_map { - struct bdev_map *next; - dev_t dev; - unsigned long range; - struct module *owner; - struct kobject *(*probe)(dev_t, int *, void *); - int (*lock)(dev_t, void *); - void *data; -} *bdev_map[255]; +static DEFINE_XARRAY(bdev_map); static DEFINE_MUTEX(bdev_map_lock); /* for extended dynamic devt allocation, currently only one major is used */ @@ -649,85 +641,26 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } -/* - * Register device numbers dev..(dev+range-1) - * range must be nonzero - * The hash chain is sorted on range, so that subranges can override. - */ -void blk_register_region(dev_t devt, unsigned long range, struct module *module, - struct kobject *(*probe)(dev_t, int *, void *), - int (*lock)(dev_t, void *), void *data) -{ - unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; - unsigned index = MAJOR(devt); - unsigned i; - struct bdev_map *p; - - n = min(n, 255u); - p = kmalloc_array(n, sizeof(struct bdev_map), GFP_KERNEL); - if (p == NULL) - return; - - for (i = 0; i < n; i++, p++) { - p->owner = module; - p->probe = probe; - p->lock = lock; - p->dev = devt; - p->range = range; - p->data = data; - } +static void blk_register_region(struct gendisk *disk) +{ + int i; mutex_lock(&bdev_map_lock); - for (i = 0, p -= n; i < n; i++, p++, index++) { - struct bdev_map **s = &bdev_map[index % 255]; - while (*s && (*s)->range < range) - s = &(*s)->next; - p->next = *s; - *s = p; + for (i = 0; i < disk->minors; i++) { + if (xa_insert(&bdev_map, disk_devt(disk) + i, disk, GFP_KERNEL)) + WARN_ON_ONCE(1); } mutex_unlock(&bdev_map_lock); } -EXPORT_SYMBOL(blk_register_region); -void blk_unregister_region(dev_t devt, unsigned long range) +static void blk_unregister_region(struct gendisk *disk) { - unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; - unsigned index = MAJOR(devt); - unsigned i; - struct bdev_map *found = NULL; + int i; mutex_lock(&bdev_map_lock); - for (i = 0; i < min(n, 255u); i++, index++) { - struct bdev_map **s; - for (s = &bdev_map[index % 255]; *s; s = &(*s)->next) { - struct bdev_map *p = *s; - if (p->dev == devt && p->range == range) { - *s = p->next; - if (!found) - found = p; - break; - } - } - } + for (i = 0; i < disk->minors; i++) + xa_erase(&bdev_map, disk_devt(disk) + i); mutex_unlock(&bdev_map_lock); - kfree(found); -} -EXPORT_SYMBOL(blk_unregister_region); - -static struct kobject *exact_match(dev_t devt, int *partno, void *data) -{ - struct gendisk *p = data; - - return &disk_to_dev(p)->kobj; -} - -static int exact_lock(dev_t devt, void *data) -{ - struct gendisk *p = data; - - if (!get_disk_and_module(p)) - return -1; - return 0; } static void disk_scan_partitions(struct gendisk *disk) @@ -873,8 +806,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + blk_register_region(disk); } register_disk(parent, disk, groups); if 
(register_queue) @@ -987,7 +919,7 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); if (!(disk->flags & GENHD_FL_HIDDEN)) - blk_unregister_region(disk_devt(disk), disk->minors); + blk_unregister_region(disk); /* * Remove gendisk pointer from idr so that it cannot be looked up * while RCU period before freeing gendisk is running to prevent @@ -1053,54 +985,22 @@ static void request_gendisk_module(dev_t devt) request_module("block-major-%d", MAJOR(devt)); } -static struct gendisk *lookup_gendisk(dev_t dev, int *partno) +static bool get_disk_and_module(struct gendisk *disk) { - struct kobject *kobj; - struct bdev_map *p; - unsigned long best = ~0UL; - -retry: - mutex_lock(&bdev_map_lock); - for (p = bdev_map[MAJOR(dev) % 255]; p; p = p->next) { - struct kobject *(*probe)(dev_t, int *, void *); - struct module *owner; - void *data; - - if (p->dev > dev || p->dev + p->range - 1 < dev) - continue; - if (p->range - 1 >= best) - break; - if (!try_module_get(p->owner)) - continue; - owner = p->owner; - data = p->data; - probe = p->probe; - best = p->range - 1; - *partno = dev - p->dev; - - if (!probe) { - mutex_unlock(&bdev_map_lock); - module_put(owner); - request_gendisk_module(dev); - goto retry; - } + struct module *owner; - if (p->lock && p->lock(dev, data) < 0) { - module_put(owner); - continue; - } - mutex_unlock(&bdev_map_lock); - kobj = probe(dev, partno, data); - /* Currently ->owner protects _only_ ->probe() itself. */ + if (!disk->fops) + return false; + owner = disk->fops->owner; + if (owner && !try_module_get(owner)) + return false; + if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) { module_put(owner); - if (kobj) - return dev_to_disk(kobj_to_dev(kobj)); - goto retry; + return false; } - mutex_unlock(&bdev_map_lock); - return NULL; -} + return true; +} /** * get_gendisk - get partitioning information for a given device @@ -1119,7 +1019,19 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) might_sleep(); if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - disk = lookup_gendisk(devt, partno); + mutex_lock(&bdev_map_lock); + disk = xa_load(&bdev_map, devt); + if (!disk) { + mutex_unlock(&bdev_map_lock); + request_gendisk_module(devt); + mutex_lock(&bdev_map_lock); + disk = xa_load(&bdev_map, devt); + } + if (disk && !get_disk_and_module(disk)) + disk = NULL; + if (disk) + *partno = devt - disk_devt(disk); + mutex_unlock(&bdev_map_lock); } else { struct hd_struct *part; @@ -1323,21 +1235,6 @@ static const struct seq_operations partitions_op = { }; #endif -static void bdev_map_init(void) -{ - struct bdev_map *base; - int i; - - base = kzalloc(sizeof(*base), GFP_KERNEL); - if (!base) - panic("cannot allocate bdev_map"); - - base->dev = 1; - base->range = ~0 ; - for (i = 0; i < 255; i++) - bdev_map[i] = base; -} - static int __init genhd_device_init(void) { int error; @@ -1346,7 +1243,6 @@ static int __init genhd_device_init(void) error = class_register(&block_class); if (unlikely(error)) return error; - bdev_map_init(); blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); @@ -1895,35 +1791,6 @@ out_free_disk: } EXPORT_SYMBOL(__alloc_disk_node); -/** - * get_disk_and_module - increments the gendisk and gendisk fops module refcount - * @disk: the struct gendisk to increment the refcount for - * - * This increments the refcount for the struct gendisk, and the gendisk's - * fops module owner. - * - * Context: Any context. 
- */ -struct kobject *get_disk_and_module(struct gendisk *disk) -{ - struct module *owner; - struct kobject *kobj; - - if (!disk->fops) - return NULL; - owner = disk->fops->owner; - if (owner && !try_module_get(owner)) - return NULL; - kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj); - if (kobj == NULL) { - module_put(owner); - return NULL; - } - return kobj; - -} -EXPORT_SYMBOL(get_disk_and_module); - /** * put_disk - decrements the gendisk refcount * @disk: the struct gendisk to decrement the refcount for @@ -1960,7 +1827,6 @@ void put_disk_and_module(struct gendisk *disk) module_put(owner); } } -EXPORT_SYMBOL(put_disk_and_module); static void set_disk_ro_uevent(struct gendisk *gd, int ro) { diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 811d0f83c4cf..3cbc2781ef34 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -339,15 +339,8 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); int blk_drop_partitions(struct block_device *bdev); extern struct gendisk *__alloc_disk_node(int minors, int node_id); -extern struct kobject *get_disk_and_module(struct gendisk *disk); extern void put_disk(struct gendisk *disk); extern void put_disk_and_module(struct gendisk *disk); -extern void blk_register_region(dev_t devt, unsigned long range, - struct module *module, - struct kobject *(*probe)(dev_t, int *, void *), - int (*lock)(dev_t, void *), - void *data); -extern void blk_unregister_region(dev_t devt, unsigned long range); #define alloc_disk_node(minors, node_id) \ ({ \ -- cgit From 7a089ec7d77fe7d50f6bb7b178fa25eec9fd822b Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 12 Nov 2020 07:04:03 -0500 Subject: console: Delete unused con_font_copy() callback implementations Recently in commit 3c4e0dff2095 ("vt: Disable KD_FONT_OP_COPY") we disabled the KD_FONT_OP_COPY ioctl() option. Delete all the con_font_copy() callbacks, since we no longer use them. Mark KD_FONT_OP_COPY as "obsolete" in include/uapi/linux/kd.h, just like what we have done for PPPIOCDETACH in commit af8d3c7c001a ("ppp: remove the PPPIOCDETACH ioctl"). 
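For context, this is roughly what a userspace caller of the disabled operation looks like; an illustrative sketch only (after commit 3c4e0dff2095 the kernel rejects the op, and callers should not depend on a specific errno; the height field is described here on the assumption that it historically carried the source console number):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kd.h>

int main(void)
{
	int fd = open("/dev/tty0", O_RDWR);
	struct console_font_op op = {
		.op	= KD_FONT_OP_COPY,	/* obsolete, do not use */
		.height	= 1,	/* historically the source console */
	};

	if (fd < 0 || ioctl(fd, KDFONTOP, &op) < 0)
		perror("KDFONTOP(KD_FONT_OP_COPY)");	/* expected to fail */
	return 0;
}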
Signed-off-by: Peilin Ye Reviewed-by: Greg Kroah-Hartman Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/c8d28007edf50de4387e1532eb3eb736db716f73.1605169912.git.yepeilin.cs@gmail.com --- drivers/usb/misc/sisusbvga/sisusb_con.c | 6 ------ drivers/video/console/dummycon.c | 6 ------ drivers/video/fbdev/core/fbcon.c | 11 ----------- include/linux/console.h | 1 - include/uapi/linux/kd.h | 2 +- 5 files changed, 1 insertion(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/misc/sisusbvga/sisusb_con.c b/drivers/usb/misc/sisusbvga/sisusb_con.c index c63e545fb105..fd9954381fbf 100644 --- a/drivers/usb/misc/sisusbvga/sisusb_con.c +++ b/drivers/usb/misc/sisusbvga/sisusb_con.c @@ -1358,11 +1358,6 @@ static int sisusbdummycon_font_default(struct vc_data *vc, return 0; } -static int sisusbdummycon_font_copy(struct vc_data *vc, int con) -{ - return 0; -} - static const struct consw sisusb_dummy_con = { .owner = THIS_MODULE, .con_startup = sisusbdummycon_startup, @@ -1377,7 +1372,6 @@ static const struct consw sisusb_dummy_con = { .con_blank = sisusbdummycon_blank, .con_font_set = sisusbdummycon_font_set, .con_font_default = sisusbdummycon_font_default, - .con_font_copy = sisusbdummycon_font_copy, }; int diff --git a/drivers/video/console/dummycon.c b/drivers/video/console/dummycon.c index 2a0d0bda7faa..ab3df752fb57 100644 --- a/drivers/video/console/dummycon.c +++ b/drivers/video/console/dummycon.c @@ -136,11 +136,6 @@ static int dummycon_font_default(struct vc_data *vc, return 0; } -static int dummycon_font_copy(struct vc_data *vc, int con) -{ - return 0; -} - /* * The console `switch' structure for the dummy console * @@ -161,6 +156,5 @@ const struct consw dummy_con = { .con_blank = dummycon_blank, .con_font_set = dummycon_font_set, .con_font_default = dummycon_font_default, - .con_font_copy = dummycon_font_copy, }; EXPORT_SYMBOL_GPL(dummy_con); diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index cef437817b0d..26d1b0916692 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2451,16 +2451,6 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, return 0; } -static int fbcon_copy_font(struct vc_data *vc, int con) -{ - struct fbcon_display *od = &fb_display[con]; - struct console_font *f = &vc->vc_font; - - if (od->fontdata == f->data) - return 0; /* already the same font... 
*/ - return fbcon_do_set_font(vc, f->width, f->height, od->fontdata, od->userfont); -} - /* * User asked to set font; we are guaranteed that * a) width and height are in range 1..32 @@ -3111,7 +3101,6 @@ static const struct consw fb_con = { .con_font_set = fbcon_set_font, .con_font_get = fbcon_get_font, .con_font_default = fbcon_set_def_font, - .con_font_copy = fbcon_copy_font, .con_set_palette = fbcon_set_palette, .con_invert_region = fbcon_invert_region, .con_screen_pos = fbcon_screen_pos, diff --git a/include/linux/console.h b/include/linux/console.h index 4b1e26c4cb42..20874db50bc8 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -62,7 +62,6 @@ struct consw { int (*con_font_get)(struct vc_data *vc, struct console_font *font); int (*con_font_default)(struct vc_data *vc, struct console_font *font, char *name); - int (*con_font_copy)(struct vc_data *vc, int con); int (*con_resize)(struct vc_data *vc, unsigned int width, unsigned int height, unsigned int user); void (*con_set_palette)(struct vc_data *vc, diff --git a/include/uapi/linux/kd.h b/include/uapi/linux/kd.h index 4616b31f84da..ee929ece4112 100644 --- a/include/uapi/linux/kd.h +++ b/include/uapi/linux/kd.h @@ -173,7 +173,7 @@ struct console_font { #define KD_FONT_OP_SET 0 /* Set font */ #define KD_FONT_OP_GET 1 /* Get font */ #define KD_FONT_OP_SET_DEFAULT 2 /* Set font to default, data points to name / NULL */ -#define KD_FONT_OP_COPY 3 /* Copy from another console */ +#define KD_FONT_OP_COPY 3 /* Obsolete, do not use */ #define KD_FONT_FLAG_DONT_RECALC 1 /* Don't recalculate hw charcell size [compat] */ -- cgit From 4ee573086bd88ff3060dda07873bf755d332e9ba Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 12 Nov 2020 07:13:34 -0500 Subject: Fonts: Add charcount field to font_desc Subsystems are hard-coding the number of characters of our built-in fonts as 256. Include that information in our kernel font descriptor, `struct font_desc`. 
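A sketch of the intended use (a hypothetical consumer, not code from the patch): callers can now size glyph data from the descriptor rather than assuming 256 characters. Assuming, as for the built-in fonts, that each glyph is stored as height rows padded to whole bytes:

#include <linux/font.h>
#include <linux/kernel.h>

static size_t font_data_size(const struct font_desc *font)
{
	size_t bytes_per_row = DIV_ROUND_UP(font->width, 8);

	/* previously a hard-coded 256 in place of font->charcount */
	return (size_t)font->charcount * font->height * bytes_per_row;
}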
Signed-off-by: Peilin Ye Reviewed-by: Daniel Vetter Reviewed-by: Greg Kroah-Hartman Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/65952296d1d9486093bd955d1536f7dcd11112c6.1605169912.git.yepeilin.cs@gmail.com --- include/linux/font.h | 1 + lib/fonts/font_10x18.c | 1 + lib/fonts/font_6x10.c | 1 + lib/fonts/font_6x11.c | 1 + lib/fonts/font_6x8.c | 1 + lib/fonts/font_7x14.c | 1 + lib/fonts/font_8x16.c | 1 + lib/fonts/font_8x8.c | 1 + lib/fonts/font_acorn_8x8.c | 1 + lib/fonts/font_mini_4x6.c | 1 + lib/fonts/font_pearl_8x8.c | 1 + lib/fonts/font_sun12x22.c | 1 + lib/fonts/font_sun8x16.c | 1 + lib/fonts/font_ter16x32.c | 1 + 14 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index 4f50d736ea72..abf1442ce719 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -17,6 +17,7 @@ struct font_desc { int idx; const char *name; unsigned int width, height; + unsigned int charcount; const void *data; int pref; }; diff --git a/lib/fonts/font_10x18.c b/lib/fonts/font_10x18.c index e02f9df24d1e..5d940db626e7 100644 --- a/lib/fonts/font_10x18.c +++ b/lib/fonts/font_10x18.c @@ -5137,6 +5137,7 @@ const struct font_desc font_10x18 = { .name = "10x18", .width = 10, .height = 18, + .charcount = 256, .data = fontdata_10x18.data, #ifdef __sparc__ .pref = 5, diff --git a/lib/fonts/font_6x10.c b/lib/fonts/font_6x10.c index 6e3c4b7691c8..e65df019e0d2 100644 --- a/lib/fonts/font_6x10.c +++ b/lib/fonts/font_6x10.c @@ -3083,6 +3083,7 @@ const struct font_desc font_6x10 = { .name = "6x10", .width = 6, .height = 10, + .charcount = 256, .data = fontdata_6x10.data, .pref = 0, }; diff --git a/lib/fonts/font_6x11.c b/lib/fonts/font_6x11.c index 2d22a24e816f..bd76b3f6b635 100644 --- a/lib/fonts/font_6x11.c +++ b/lib/fonts/font_6x11.c @@ -3346,6 +3346,7 @@ const struct font_desc font_vga_6x11 = { .name = "ProFont6x11", .width = 6, .height = 11, + .charcount = 256, .data = fontdata_6x11.data, /* Try avoiding this font if possible unless on MAC */ .pref = -2000, diff --git a/lib/fonts/font_6x8.c b/lib/fonts/font_6x8.c index e7442a0d183d..06ace7792521 100644 --- a/lib/fonts/font_6x8.c +++ b/lib/fonts/font_6x8.c @@ -2571,6 +2571,7 @@ const struct font_desc font_6x8 = { .name = "6x8", .width = 6, .height = 8, + .charcount = 256, .data = fontdata_6x8.data, .pref = 0, }; diff --git a/lib/fonts/font_7x14.c b/lib/fonts/font_7x14.c index 9cc7ae2e03f7..a2f561c9fa04 100644 --- a/lib/fonts/font_7x14.c +++ b/lib/fonts/font_7x14.c @@ -4113,6 +4113,7 @@ const struct font_desc font_7x14 = { .name = "7x14", .width = 7, .height = 14, + .charcount = 256, .data = fontdata_7x14.data, .pref = 0, }; diff --git a/lib/fonts/font_8x16.c b/lib/fonts/font_8x16.c index bab25dc59e8d..06ae14088514 100644 --- a/lib/fonts/font_8x16.c +++ b/lib/fonts/font_8x16.c @@ -4627,6 +4627,7 @@ const struct font_desc font_vga_8x16 = { .name = "VGA8x16", .width = 8, .height = 16, + .charcount = 256, .data = fontdata_8x16.data, .pref = 0, }; diff --git a/lib/fonts/font_8x8.c b/lib/fonts/font_8x8.c index 109d0572368f..69570b8c31af 100644 --- a/lib/fonts/font_8x8.c +++ b/lib/fonts/font_8x8.c @@ -2578,6 +2578,7 @@ const struct font_desc font_vga_8x8 = { .name = "VGA8x8", .width = 8, .height = 8, + .charcount = 256, .data = fontdata_8x8.data, .pref = 0, }; diff --git a/lib/fonts/font_acorn_8x8.c b/lib/fonts/font_acorn_8x8.c index fb395f0d4031..18755c33d249 100644 --- a/lib/fonts/font_acorn_8x8.c +++ b/lib/fonts/font_acorn_8x8.c @@ -270,6 +270,7 @@ const struct 
font_desc font_acorn_8x8 = { .name = "Acorn8x8", .width = 8, .height = 8, + .charcount = 256, .data = acorndata_8x8.data, #ifdef CONFIG_ARCH_ACORN .pref = 20, diff --git a/lib/fonts/font_mini_4x6.c b/lib/fonts/font_mini_4x6.c index 592774a90917..8d39fd447952 100644 --- a/lib/fonts/font_mini_4x6.c +++ b/lib/fonts/font_mini_4x6.c @@ -2152,6 +2152,7 @@ const struct font_desc font_mini_4x6 = { .name = "MINI4x6", .width = 4, .height = 6, + .charcount = 256, .data = fontdata_mini_4x6.data, .pref = 3, }; diff --git a/lib/fonts/font_pearl_8x8.c b/lib/fonts/font_pearl_8x8.c index a6f95ebce950..b1678ed0017c 100644 --- a/lib/fonts/font_pearl_8x8.c +++ b/lib/fonts/font_pearl_8x8.c @@ -2582,6 +2582,7 @@ const struct font_desc font_pearl_8x8 = { .name = "PEARL8x8", .width = 8, .height = 8, + .charcount = 256, .data = fontdata_pearl8x8.data, .pref = 2, }; diff --git a/lib/fonts/font_sun12x22.c b/lib/fonts/font_sun12x22.c index a5b65bd49604..91daf5ab8b6b 100644 --- a/lib/fonts/font_sun12x22.c +++ b/lib/fonts/font_sun12x22.c @@ -6156,6 +6156,7 @@ const struct font_desc font_sun_12x22 = { .name = "SUN12x22", .width = 12, .height = 22, + .charcount = 256, .data = fontdata_sun12x22.data, #ifdef __sparc__ .pref = 5, diff --git a/lib/fonts/font_sun8x16.c b/lib/fonts/font_sun8x16.c index e577e76a6a7c..81bb4eeae04e 100644 --- a/lib/fonts/font_sun8x16.c +++ b/lib/fonts/font_sun8x16.c @@ -268,6 +268,7 @@ const struct font_desc font_sun_8x16 = { .name = "SUN8x16", .width = 8, .height = 16, + .charcount = 256, .data = fontdata_sun8x16.data, #ifdef __sparc__ .pref = 10, diff --git a/lib/fonts/font_ter16x32.c b/lib/fonts/font_ter16x32.c index f7c3abb6b99e..1955d624177c 100644 --- a/lib/fonts/font_ter16x32.c +++ b/lib/fonts/font_ter16x32.c @@ -2062,6 +2062,7 @@ const struct font_desc font_ter_16x32 = { .name = "TER16x32", .width = 16, .height = 32, + .charcount = 256, .data = fontdata_ter16x32.data, #ifdef __sparc__ .pref = 5, -- cgit From 449f4ec9892ebc2f37a7eae6d97db2cf7c65e09a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:56 +0100 Subject: block: remove the update_bdev parameter to set_capacity_revalidate_and_notify The update_bdev argument is always set to true, so remove it. Also rename the function to the slightly less verbose set_capacity_and_notify, as propagating the disk size to the block device isn't really revalidation. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Petr Vorel Signed-off-by: Jens Axboe --- block/genhd.c | 13 +++++-------- drivers/block/loop.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/scsi/sd.c | 5 ++--- include/linux/genhd.h | 3 +-- 7 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index ec2a24799cd9..4e039524f92b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -47,17 +47,15 @@ static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); /* - * Set disk capacity and notify if the size is not currently - * zero and will not be set to zero + * Set disk capacity and notify if the size is not currently zero and will not + * be set to zero. Returns true if a uevent was sent, otherwise false.
*/ -bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, - bool update_bdev) +bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); set_capacity(disk, size); - if (update_bdev) - revalidate_disk_size(disk, true); + revalidate_disk_size(disk, true); if (capacity != size && capacity != 0 && size != 0) { char *envp[] = { "RESIZE=1", NULL }; @@ -68,8 +66,7 @@ bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, return false; } - -EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); +EXPORT_SYMBOL_GPL(set_capacity_and_notify); /* * Format the device name of the indicated disk into the supplied buffer and diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fcc5e32f0993..9a27d4f1c08a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -251,7 +251,7 @@ loop_validate_block_size(unsigned short bsize) */ static void loop_set_size(struct loop_device *lo, loff_t size) { - if (!set_capacity_revalidate_and_notify(lo->lo_disk, size, true)) + if (!set_capacity_and_notify(lo->lo_disk, size)) kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index a314b9382442..3e812b4c32e6 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -470,7 +470,7 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) cap_str_10, cap_str_2); - set_capacity_revalidate_and_notify(vblk->disk, capacity, true); + set_capacity_and_notify(vblk->disk, capacity); } static void virtblk_config_changed_work(struct work_struct *work) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 48629d3433b4..79521e33d30e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2370,7 +2370,7 @@ static void blkfront_connect(struct blkfront_info *info) return; printk(KERN_INFO "Setting capacity to %Lu\n", sectors); - set_capacity_revalidate_and_notify(info->gd, sectors, true); + set_capacity_and_notify(info->gd, sectors); return; case BLKIF_STATE_SUSPENDED: diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f6c6479da0e9..6c144e748f8c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2053,7 +2053,7 @@ static void nvme_update_disk_info(struct gendisk *disk, capacity = 0; } - set_capacity_revalidate_and_notify(disk, capacity, true); + set_capacity_and_notify(disk, capacity); nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e9b898a1a480..679c2c025047 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3261,8 +3261,7 @@ static int sd_revalidate_disk(struct gendisk *disk) sdkp->first_scan = 0; - set_capacity_revalidate_and_notify(disk, - logical_to_sectors(sdp, sdkp->capacity), true); + set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp); kfree(buffer); @@ -3272,7 +3271,7 @@ static int sd_revalidate_disk(struct gendisk *disk) * capacity to 0. 
*/ if (sd_zbc_revalidate_zones(sdkp)) - set_capacity_revalidate_and_notify(disk, 0, true); + set_capacity_and_notify(disk, 0); out: return 0; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3cbc2781ef34..46553d6d6025 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -314,8 +314,7 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); -bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, - bool update_bdev); +bool set_capacity_and_notify(struct gendisk *disk, sector_t size); /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; -- cgit From cef397038167ac15d085914493d6c86385773709 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Nov 2020 17:52:58 +0100 Subject: arch: pgtable: define MAX_POSSIBLE_PHYSMEM_BITS where needed Stefan Agner reported a bug when using zram on 32-bit Arm machines with RAM above the 4GB address boundary: Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = a27bd01c [00000000] *pgd=236a0003, *pmd=1ffa64003 Internal error: Oops: 207 [#1] SMP ARM Modules linked in: mdio_bcm_unimac(+) brcmfmac cfg80211 brcmutil raspberrypi_hwmon hci_uart crc32_arm_ce bcm2711_thermal phy_generic genet CPU: 0 PID: 123 Comm: mkfs.ext4 Not tainted 5.9.6 #1 Hardware name: BCM2711 PC is at zs_map_object+0x94/0x338 LR is at zram_bvec_rw.constprop.0+0x330/0xa64 pc : [] lr : [] psr: 60000013 sp : e376bbe0 ip : 00000000 fp : c1e2921c r10: 00000002 r9 : c1dda730 r8 : 00000000 r7 : e8ff7a00 r6 : 00000000 r5 : 02f9ffa0 r4 : e3710000 r3 : 000fdffe r2 : c1e0ce80 r1 : ebf979a0 r0 : 00000000 Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user Control: 30c5383d Table: 235c2a80 DAC: fffffffd Process mkfs.ext4 (pid: 123, stack limit = 0x495a22e6) Stack: (0xe376bbe0 to 0xe376c000) As it turns out, zsmalloc needs to know the maximum memory size, which is defined in MAX_PHYSMEM_BITS when CONFIG_SPARSEMEM is set, or in MAX_POSSIBLE_PHYSMEM_BITS on the x86 architecture. The same problem will be hit on all 32-bit architectures that have a physical address space larger than 4GB and happen to not enable sparsemem and include asm/sparsemem.h from asm/pgtable.h. After the initial discussion, I suggested just always defining MAX_POSSIBLE_PHYSMEM_BITS whenever CONFIG_PHYS_ADDR_T_64BIT is set, or provoking a build error otherwise. This addresses all configurations that can currently have this runtime bug, but leaves all other configurations unchanged. I looked up the possible number of bits in source code and datasheets; here is what I found: - on ARC, CONFIG_ARC_HAS_PAE40 controls whether 32 or 40 bits are used - on ARM, CONFIG_LPAE enables 40 bit addressing; without it we never support more than 32 bits, even though supersections in theory allow up to 40 bits as well. - on MIPS, some MIPS32r1 or later chips support 36 bits, and MIPS32r5 XPA supports up to 60 bits in theory, but 40 bits are more than anyone will ever ship - On PowerPC, there are three different implementations of 36 bit addressing, but 32-bit is used without CONFIG_PTE_64BIT - On RISC-V, the normal page table format can support 34 bit addressing. There is no highmem support on RISC-V, so anything above 2GB is unused, but it might be useful to eventually support CONFIG_ZRAM for high pages.
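To see why the constant matters, here is a simplified sketch of the zsmalloc usage that crashed (names paraphrased, not the exact mm/zsmalloc.c code): the object handle packs a PFN and an object index into a single unsigned long, so the PFN field must be wide enough for the largest PFN the machine can have:

#include <linux/pgtable.h>
#include <linux/bits.h>

#define SKETCH_PFN_BITS		(MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
#define SKETCH_INDEX_BITS	(BITS_PER_LONG - SKETCH_PFN_BITS)

static unsigned long sketch_encode(unsigned long pfn, unsigned long idx)
{
	/* if SKETCH_PFN_BITS is too small, high PFNs corrupt the index */
	return (pfn << SKETCH_INDEX_BITS) |
	       (idx & ((1UL << SKETCH_INDEX_BITS) - 1));
}

On a 32-bit Arm LPAE kernel MAX_POSSIBLE_PHYSMEM_BITS must be 40, not 32, for this encoding to round-trip pages above the 4GB boundary.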
Fixes: 61989a80fb3a ("staging: zsmalloc: zsmalloc memory allocation library") Fixes: 02390b87a945 ("mm/zsmalloc: Prepare to variable MAX_PHYSMEM_BITS") Acked-by: Thomas Bogendoerfer Reviewed-by: Stefan Agner Tested-by: Stefan Agner Acked-by: Mike Rapoport Link: https://lore.kernel.org/linux-mm/bdfa44bf1c570b05d6c70898e2bbb0acf234ecdf.1604762181.git.stefan@agner.ch/ Signed-off-by: Arnd Bergmann --- arch/arc/include/asm/pgtable.h | 2 ++ arch/arm/include/asm/pgtable-2level.h | 2 ++ arch/arm/include/asm/pgtable-3level.h | 2 ++ arch/mips/include/asm/pgtable-32.h | 3 +++ arch/powerpc/include/asm/book3s/32/pgtable.h | 2 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 2 ++ arch/riscv/include/asm/pgtable-32.h | 2 ++ include/linux/pgtable.h | 13 +++++++++++++ 8 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index f1ed17edb085..163641726a2b 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -134,8 +134,10 @@ #ifdef CONFIG_ARC_HAS_PAE40 #define PTE_BITS_NON_RWX_IN_PD1 (0xff00000000 | PAGE_MASK | _PAGE_CACHEABLE) +#define MAX_POSSIBLE_PHYSMEM_BITS 40 #else #define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK | _PAGE_CACHEABLE) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif /************************************************************************** diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index 3502c2f746ca..baf7d0204eb5 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -75,6 +75,8 @@ #define PTE_HWTABLE_OFF (PTE_HWTABLE_PTRS * sizeof(pte_t)) #define PTE_HWTABLE_SIZE (PTRS_PER_PTE * sizeof(u32)) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 + /* * PMD_SHIFT determines the size of the area a second-level page table can map * PGDIR_SHIFT determines what a third-level page table entry can map diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index fbb6693c3352..2b85d175e999 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -25,6 +25,8 @@ #define PTE_HWTABLE_OFF (0) #define PTE_HWTABLE_SIZE (PTRS_PER_PTE * sizeof(u64)) +#define MAX_POSSIBLE_PHYSMEM_BITS 40 + /* * PGDIR_SHIFT determines the size a top-level page table entry can map. 
*/ diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index a950fc1ddb4d..6c0532d7b211 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -154,6 +154,7 @@ static inline void pmd_clear(pmd_t *pmdp) #if defined(CONFIG_XPA) +#define MAX_POSSIBLE_PHYSMEM_BITS 40 #define pte_pfn(x) (((unsigned long)((x).pte_high >> _PFN_SHIFT)) | (unsigned long)((x).pte_low << _PAGE_PRESENT_SHIFT)) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) @@ -169,6 +170,7 @@ pfn_pte(unsigned long pfn, pgprot_t prot) #elif defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #define pte_pfn(x) ((unsigned long)((x).pte_high >> 6)) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) @@ -183,6 +185,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) #else +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #ifdef CONFIG_CPU_VR41XX #define pte_pfn(x) ((unsigned long)((x).pte >> (PAGE_SHIFT + 2))) #define pfn_pte(pfn, prot) __pte(((pfn) << (PAGE_SHIFT + 2)) | pgprot_val(prot)) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 36443cda8dcf..1376be95e975 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -36,8 +36,10 @@ static inline bool pte_user(pte_t pte) */ #ifdef CONFIG_PTE_64BIT #define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #else #define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif /* diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index ee2243ba96cf..96522f7f0618 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -153,8 +153,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); */ #if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) #define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #else #define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif /* diff --git a/arch/riscv/include/asm/pgtable-32.h b/arch/riscv/include/asm/pgtable-32.h index b0ab66e5fdb1..5b2e79e5bfa5 100644 --- a/arch/riscv/include/asm/pgtable-32.h +++ b/arch/riscv/include/asm/pgtable-32.h @@ -14,4 +14,6 @@ #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 34 + #endif /* _ASM_RISCV_PGTABLE_32_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 71125a4676c4..e237004d498d 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1427,6 +1427,19 @@ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ +#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) +#ifdef CONFIG_PHYS_ADDR_T_64BIT +/* + * ZSMALLOC needs to know the highest PFN on 32-bit architectures + * with physical address space extension, but falls back to + * BITS_PER_LONG otherwise. 
+ */ +#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition +#else +#define MAX_POSSIBLE_PHYSMEM_BITS 32 +#endif +#endif + #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 -- cgit From 557acb3d2cd9c82de19f944f6cc967a347735385 Mon Sep 17 00:00:00 2001 From: Amjad Ouled-Ameur Date: Fri, 13 Nov 2020 00:00:43 +0100 Subject: reset: make shared pulsed reset controls re-triggerable The current reset framework API does not allow releasing what reset_control_reset() has done, IOW decrementing triggered_count. Add the new reset_control_rearm() call to do so. Once reset_control_reset() has been called, the reset framework's triggered_count counter is incremented, i.e. the resource behind the reset is in use and the reset should not be done again. reset_control_rearm() is the way to state that the resource is no longer used and that, from the caller's perspective, the reset can be fired again if necessary. Signed-off-by: Amjad Ouled-Ameur Reported-by: Jerome Brunet Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/reset.h | 1 + 2 files changed, 74 insertions(+) (limited to 'include/linux') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index a2df88e90011..34e89aa0fb5e 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -208,6 +208,39 @@ static int reset_control_array_reset(struct reset_control_array *resets) return 0; } +static int reset_control_array_rearm(struct reset_control_array *resets) +{ + struct reset_control *rstc; + int i; + + for (i = 0; i < resets->num_rstcs; i++) { + rstc = resets->rstc[i]; + + if (!rstc) + continue; + + if (WARN_ON(IS_ERR(rstc))) + return -EINVAL; + + if (rstc->shared) { + if (WARN_ON(atomic_read(&rstc->deassert_count) != 0)) + return -EINVAL; + } else { + if (!rstc->acquired) + return -EPERM; + } + } + + for (i = 0; i < resets->num_rstcs; i++) { + rstc = resets->rstc[i]; + + if (rstc && rstc->shared) + WARN_ON(atomic_dec_return(&rstc->triggered_count) < 0); + } + + return 0; +} + static int reset_control_array_assert(struct reset_control_array *resets) { int ret, i; @@ -325,6 +358,46 @@ int reset_control_reset(struct reset_control *rstc) } EXPORT_SYMBOL_GPL(reset_control_reset); +/** + * reset_control_rearm - allow shared reset line to be re-triggered + * @rstc: reset controller + * + * On a shared reset line the actual reset pulse is only triggered once for the + * lifetime of the reset_control instance, except if this call is used. + * + * Calls to this function must be balanced with calls to reset_control_reset, + * a warning is thrown in case triggered_count ever dips below 0. + * + * Consumers must not use reset_control_(de)assert on shared reset lines when + * reset_control_reset or reset_control_rearm have been used. + * + * If rstc is NULL the function will just return 0.
+ */ +int reset_control_rearm(struct reset_control *rstc) +{ + if (!rstc) + return 0; + + if (WARN_ON(IS_ERR(rstc))) + return -EINVAL; + + if (reset_control_is_array(rstc)) + return reset_control_array_rearm(rstc_to_array(rstc)); + + if (rstc->shared) { + if (WARN_ON(atomic_read(&rstc->deassert_count) != 0)) + return -EINVAL; + + WARN_ON(atomic_dec_return(&rstc->triggered_count) < 0); + } else { + if (!rstc->acquired) + return -EPERM; + } + + return 0; +} +EXPORT_SYMBOL_GPL(reset_control_rearm); + /** * reset_control_assert - asserts the reset line * @rstc: reset controller diff --git a/include/linux/reset.h b/include/linux/reset.h index 05aa9f440f48..439fec7112a9 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -13,6 +13,7 @@ struct reset_control; #ifdef CONFIG_RESET_CONTROLLER int reset_control_reset(struct reset_control *rstc); +int reset_control_rearm(struct reset_control *rstc); int reset_control_assert(struct reset_control *rstc); int reset_control_deassert(struct reset_control *rstc); int reset_control_status(struct reset_control *rstc); -- cgit From 872f690341948b502c93318f806d821c56772c42 Mon Sep 17 00:00:00 2001 From: Francis Laniel Date: Sun, 15 Nov 2020 18:08:06 +0100 Subject: treewide: rename nla_strlcpy to nla_strscpy. Calls to nla_strlcpy are now replaced by calls to nla_strscpy which is the new name of this function. Signed-off-by: Francis Laniel Reviewed-by: Kees Cook Signed-off-by: Jakub Kicinski --- drivers/infiniband/core/nldev.c | 10 +++++----- drivers/net/can/vxcan.c | 4 ++-- drivers/net/veth.c | 4 ++-- include/linux/genl_magic_struct.h | 2 +- include/net/netlink.h | 4 ++-- include/net/pkt_cls.h | 2 +- kernel/taskstats.c | 2 +- lib/nlattr.c | 6 +++--- net/core/fib_rules.c | 4 ++-- net/core/rtnetlink.c | 12 ++++++------ net/decnet/dn_dev.c | 2 +- net/ieee802154/nl-mac.c | 2 +- net/ipv4/devinet.c | 2 +- net/ipv4/fib_semantics.c | 2 +- net/ipv4/metrics.c | 2 +- net/netfilter/ipset/ip_set_hash_netiface.c | 4 ++-- net/netfilter/nf_tables_api.c | 6 +++--- net/netfilter/nfnetlink_acct.c | 2 +- net/netfilter/nfnetlink_cthelper.c | 4 ++-- net/netfilter/nft_ct.c | 2 +- net/netfilter/nft_log.c | 2 +- net/netlabel/netlabel_mgmt.c | 2 +- net/nfc/netlink.c | 2 +- net/sched/act_api.c | 2 +- net/sched/act_ipt.c | 2 +- net/sched/act_simple.c | 4 ++-- net/sched/cls_api.c | 2 +- net/sched/sch_api.c | 2 +- net/tipc/netlink_compat.c | 2 +- 29 files changed, 49 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 12d29d54a081..08366e254b1d 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -932,7 +932,7 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { char name[IB_DEVICE_NAME_MAX] = {}; - nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], IB_DEVICE_NAME_MAX); if (strlen(name) == 0) { err = -EINVAL; @@ -1529,13 +1529,13 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) return -EINVAL; - nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + nla_strscpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(ibdev_name)); if (strchr(ibdev_name, '%') || strlen(ibdev_name) == 0) return -EINVAL; - nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); - nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + nla_strscpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], 
sizeof(type)); + nla_strscpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], sizeof(ndev_name)); ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); @@ -1602,7 +1602,7 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; - nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + nla_strscpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], sizeof(client_name)); if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index d6ba9426be4d..fa47bab510bb 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -186,7 +186,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, } if (ifmp && tbp[IFLA_IFNAME]) { - nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); @@ -223,7 +223,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, /* register first device */ if (tb[IFLA_IFNAME]) - nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 8c737668008a..359d3ab33c4d 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1329,7 +1329,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, } if (ifmp && tbp[IFLA_IFNAME]) { - nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); @@ -1379,7 +1379,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, eth_hw_addr_random(dev); if (tb[IFLA_IFNAME]) - nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h index eeae59d3ceb7..35d21fddaf2d 100644 --- a/include/linux/genl_magic_struct.h +++ b/include/linux/genl_magic_struct.h @@ -89,7 +89,7 @@ static inline int nla_put_u64_0pad(struct sk_buff *skb, int attrtype, u64 value) nla_get_u64, nla_put_u64_0pad, false) #define __str_field(attr_nr, attr_flag, name, maxlen) \ __array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \ - nla_strlcpy, nla_put, false) + nla_strscpy, nla_put, false) #define __bin_field(attr_nr, attr_flag, name, maxlen) \ __array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \ nla_memcpy, nla_put, false) diff --git a/include/net/netlink.h b/include/net/netlink.h index 446ca182e13d..1ceec518ab49 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -142,7 +142,7 @@ * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area - * nla_strlcpy(dst, nla, size) copy attribute to a sized string + * nla_strscpy(dst, nla, size) copy attribute to a sized string * nla_strcmp(nla, str) compare attribute with string * * Attribute Parsing: @@ -506,7 +506,7 @@ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); -ssize_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize); +ssize_t nla_strscpy(char *dst, 
const struct nlattr *nla, size_t dstsize); char *nla_strdup(const struct nlattr *nla, gfp_t flags); int nla_memcpy(void *dest, const struct nlattr *src, int count); int nla_memcmp(const struct nlattr *nla, const void *data, size_t size); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index db9a828f4f4f..133f9ad4d4f9 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -512,7 +512,7 @@ tcf_change_indev(struct net *net, struct nlattr *indev_tlv, char indev[IFNAMSIZ]; struct net_device *dev; - if (nla_strlcpy(indev, indev_tlv, IFNAMSIZ) < 0) { + if (nla_strscpy(indev, indev_tlv, IFNAMSIZ) < 0) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Interface name too long"); return -EINVAL; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index a2802b6ff4bb..2b4898b4752e 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -346,7 +346,7 @@ static int parse(struct nlattr *na, struct cpumask *mask) data = kmalloc(len, GFP_KERNEL); if (!data) return -ENOMEM; - nla_strlcpy(data, na, len); + nla_strscpy(data, na, len); ret = cpulist_parse(data, mask); kfree(data); return ret; diff --git a/lib/nlattr.c b/lib/nlattr.c index 447182543c03..09aa181569e0 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -709,7 +709,7 @@ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) EXPORT_SYMBOL(nla_find); /** - * nla_strlcpy - Copy string attribute payload into a sized buffer + * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. @@ -722,7 +722,7 @@ EXPORT_SYMBOL(nla_find); * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ -ssize_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) +ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); @@ -749,7 +749,7 @@ ssize_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) return ret; } -EXPORT_SYMBOL(nla_strlcpy); +EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 7bcfb16854cb..cd80ffed6d26 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -563,7 +563,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->iifindex = -1; - nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; @@ -573,7 +573,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->oifindex = -1; - nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7d7223691783..60917ff4a00b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1939,7 +1939,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; - nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } @@ -2953,9 +2953,9 @@ static struct net_device 
*rtnl_dev_get(struct net *net, if (!ifname) { ifname = buffer; if (ifname_attr) - nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + nla_strscpy(ifname, ifname_attr, IFNAMSIZ); else if (altifname_attr) - nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); + nla_strscpy(ifname, altifname_attr, ALTIFNAMSIZ); else return NULL; } @@ -2983,7 +2983,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3264,7 +3264,7 @@ replay: return err; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3296,7 +3296,7 @@ replay: memset(linkinfo, 0, sizeof(linkinfo)); if (linkinfo[IFLA_INFO_KIND]) { - nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } else { kind[0] = '\0'; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 15d42353f1a3..d1c50a48614b 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -658,7 +658,7 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ifa->ifa_dev = dn_db; if (tb[IFA_LABEL]) - nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); + nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 6d091e419d3e..9c640d670ffe 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -149,7 +149,7 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) if (info->attrs[IEEE802154_ATTR_DEV_NAME]) { char name[IFNAMSIZ + 1]; - nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], + nla_strscpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], sizeof(name)); dev = dev_get_by_name(&init_net, name); } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 43e04382c593..75f67994fc85 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -880,7 +880,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) - nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); + nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7612ff6111a7..b5400cec4f69 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -973,7 +973,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) char tmp[TCP_CA_NAME_MAX]; bool ecn_ca = false; - nla_strlcpy(tmp, nla, sizeof(tmp)); + nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 3205d5f7c8c9..25ea6ac44db9 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -31,7 +31,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; - nla_strlcpy(tmp, nla, sizeof(tmp)); + nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) { NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c 
b/net/netfilter/ipset/ip_set_hash_netiface.c index 3d74169b794c..ddd51c2e1cb3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -226,7 +226,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); + nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -443,7 +443,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip, e.cidr); - nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); + nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bd0e12bf5770..65aa98fc5eb6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1282,7 +1282,7 @@ static struct nft_chain *nft_chain_lookup(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - nla_strlcpy(search, nla, sizeof(search)); + nla_strscpy(search, nla, sizeof(search)); WARN_ON(!rcu_read_lock_held() && !lockdep_commit_lock_is_held(net)); @@ -1722,7 +1722,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, goto err_hook_alloc; } - nla_strlcpy(ifname, attr, IFNAMSIZ); + nla_strscpy(ifname, attr, IFNAMSIZ); dev = __dev_get_by_name(net, ifname); if (!dev) { err = -ENOENT; @@ -5735,7 +5735,7 @@ struct nft_object *nft_obj_lookup(const struct net *net, struct rhlist_head *tmp, *list; struct nft_object *obj; - nla_strlcpy(search, nla, sizeof(search)); + nla_strscpy(search, nla, sizeof(search)); k.name = search; WARN_ON_ONCE(!rcu_read_lock_held() && diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5bfec829c12f..5e511df8d709 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -112,7 +112,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, nfacct->flags = flags; } - nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); + nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { atomic64_set(&nfacct->bytes, diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 5b0d0a77379c..0f94fce1d3ed 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -146,7 +146,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; - nla_strlcpy(expect_policy->name, + nla_strscpy(expect_policy->name, tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); @@ -233,7 +233,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], if (ret < 0) goto err1; - nla_strlcpy(helper->name, + nla_strscpy(helper->name, tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size > sizeof_field(struct nf_conn_help, data)) { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 322bd674963e..a8c4d442231c 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -990,7 +990,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, if (!priv->l4proto) return -ENOENT; - nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); + nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], 
sizeof(name)); if (tb[NFTA_CT_HELPER_L3PROTO]) family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 57899454a530..a06a46b039c5 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -152,7 +152,7 @@ static int nft_log_init(const struct nft_ctx *ctx, priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); if (priv->prefix == NULL) return -ENOMEM; - nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); + nla_strscpy(priv->prefix, nla, nla_len(nla) + 1); } else { priv->prefix = (char *)nft_log_null_prefix; } diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index eb1d66d20afb..df1b41ed73fd 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -95,7 +95,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_entry; } - nla_strlcpy(entry->domain, + nla_strscpy(entry->domain, info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size); } diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 8709f3d4e7c4..573b38ad2f8e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1226,7 +1226,7 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) if (!dev) return -ENODEV; - nla_strlcpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], + nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); rc = nfc_fw_download(dev, firmware_name); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index fe540a89b16c..fc23f46a315c 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -939,7 +939,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, NL_SET_ERR_MSG(extack, "TC action kind must be specified"); goto err_out; } - if (nla_strlcpy(act_name, kind, IFNAMSIZ) < 0) { + if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); goto err_out; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8dc3bec0d325..ac7297f42355 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -166,7 +166,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (unlikely(!tname)) goto err1; if (tb[TCA_IPT_TABLE] == NULL || - nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) + nla_strscpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) strcpy(tname, "mangle"); t = kmemdup(td, td->u.target_size, GFP_KERNEL); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index a4f3d0f0daa9..726cc956d06f 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -52,7 +52,7 @@ static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; - nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); return 0; } @@ -71,7 +71,7 @@ static int reset_policy(struct tc_action *a, const struct nlattr *defdata, spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); - nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c2e9661e20d3..ff3e943febaa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -223,7 +223,7 @@ static inline u32 tcf_auto_prio(struct 
tcf_proto *tp) static bool tcf_proto_check_kind(struct nlattr *kind, char *name) { if (kind) - return nla_strlcpy(name, kind, IFNAMSIZ) < 0; + return nla_strscpy(name, kind, IFNAMSIZ) < 0; memset(name, 0, IFNAMSIZ); return false; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 05449286d889..1a2d2471b078 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1170,7 +1170,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, #ifdef CONFIG_MODULES if (ops == NULL && kind != NULL) { char name[IFNAMSIZ]; - if (nla_strlcpy(name, kind, IFNAMSIZ) >= 0) { + if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 5c6206c9d6a8..82f154989418 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -696,7 +696,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]); link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); - nla_strlcpy(link_info.str, link[TIPC_NLA_LINK_NAME], + nla_strscpy(link_info.str, link[TIPC_NLA_LINK_NAME], TIPC_MAX_LINK_NAME); return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO, -- cgit From dd8088d5a8969dc2b42f71d7bc01c25c61a78066 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 10 Nov 2020 17:29:32 +0800 Subject: PM: runtime: Add pm_runtime_resume_and_get to deal with usage counter In many cases we need to check the return value of pm_runtime_get_sync(), but this complicates the handling of the usage counter: many callers forget to decrement the counter when the call fails, which can result in a reference leak. This has been discussed at length[0][1]. So add a helper that also handles the usage counter on failure, for simpler and less error-prone calling code. [0]https://lkml.org/lkml/2020/6/14/88 [1]https://patchwork.ozlabs.org/project/linux-tegra/list/?series=178139 Signed-off-by: Zhang Qilong Acked-by: Rafael J. Wysocki Signed-off-by: Jakub Kicinski --- include/linux/pm_runtime.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 4b708f4e8eed..b492ae00cc90 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -386,6 +386,27 @@ static inline int pm_runtime_get_sync(struct device *dev) return __pm_runtime_resume(dev, RPM_GET_PUT); } +/** + * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. + * @dev: Target device. + * + * Resume @dev synchronously and if that is successful, increment its runtime + * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been + * incremented or a negative error code otherwise. + */ +static inline int pm_runtime_resume_and_get(struct device *dev) +{ + int ret; + + ret = __pm_runtime_resume(dev, RPM_GET_PUT); + if (ret < 0) { + pm_runtime_put_noidle(dev); + return ret; + } + + return 0; +} + /** * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0. * @dev: Target device.
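[ Editor's illustration: a hedged sketch, not part of this patch. foo_start_old()/foo_start_new() are hypothetical driver functions showing the calling pattern the new helper replaces; assumes #include <linux/pm_runtime.h>. ]

/*
 * Error-prone pattern: pm_runtime_get_sync() increments the usage counter
 * even when the resume fails, so the error path must drop the reference.
 */
static int foo_start_old(struct device *dev)
{
	int ret;

	ret = pm_runtime_get_sync(dev);
	if (ret < 0) {
		pm_runtime_put_noidle(dev);	/* the step callers forget */
		return ret;
	}
	/* ... talk to the device ... */
	pm_runtime_put(dev);
	return 0;
}

/*
 * With the new helper the counter is already balanced on failure,
 * so the error path is a plain return.
 */
static int foo_start_new(struct device *dev)
{
	int ret;

	ret = pm_runtime_resume_and_get(dev);
	if (ret < 0)
		return ret;
	/* ... talk to the device ... */
	pm_runtime_put(dev);
	return 0;
}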
-- cgit From 3136b93c3fb2b7c19e853e049203ff8f2b9dd2cd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:41:58 -0500 Subject: entry: Expose helpers to migrate TIF to SYSCALL_WORK flags With the goal to split the syscall work related flags into a separate field that is architecture independent, expose transitional helpers that resolve to either the TIF flags or to the corresponding SYSCALL_WORK flags. This will allow architectures to migrate only when they port to the generic syscall entry code. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-3-krisman@collabora.com --- include/linux/thread_info.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e93e249a4e9b..0e9fb15d6b42 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -97,6 +97,38 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) +#ifdef CONFIG_GENERIC_ENTRY +#define set_syscall_work(fl) \ + set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) +#define test_syscall_work(fl) \ + test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) +#define clear_syscall_work(fl) \ + clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) + +#define set_task_syscall_work(t, fl) \ + set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) +#define test_task_syscall_work(t, fl) \ + test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) +#define clear_task_syscall_work(t, fl) \ + clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) + +#else /* CONFIG_GENERIC_ENTRY */ + +#define set_syscall_work(fl) \ + set_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) +#define test_syscall_work(fl) \ + test_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) +#define clear_syscall_work(fl) \ + clear_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) + +#define set_task_syscall_work(t, fl) \ + set_ti_thread_flag(task_thread_info(t), TIF_##fl) +#define test_task_syscall_work(t, fl) \ + test_ti_thread_flag(task_thread_info(t), TIF_##fl) +#define clear_task_syscall_work(t, fl) \ + clear_ti_thread_flag(task_thread_info(t), TIF_##fl) +#endif /* !CONFIG_GENERIC_ENTRY */ + #define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES -- cgit From b86678cf0f1d76062aa964c5f0c6c89fe5a6dcfd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:41:59 -0500 Subject: entry: Wire up syscall_work in common entry code Prepare the common entry code to use the SYSCALL_WORK flags. They will be defined in subsequent patches for each type of syscall work. SYSCALL_WORK_ENTER/EXIT are defined for the transition, as they will replace the TIF_ equivalent defines.
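[ Editor's note: a hedged sketch, not part of this series' diffs. The *_syscall_work() helpers from the previous patch, and the READ_ONCE(current_thread_info()->syscall_work) reads wired up below, assume that on a CONFIG_GENERIC_ENTRY architecture thread_info carries a syscall_work word next to the TIF flags word; the exact layout is per-architecture. ]

struct thread_info {
	unsigned long	flags;		/* TIF_* bits, arch-defined */
	unsigned long	syscall_work;	/* SYSCALL_WORK_* bits, generic */
	/* ... other arch-specific members ... */
};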
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-4-krisman@collabora.com --- include/linux/entry-common.h | 3 +++ kernel/entry/common.c | 15 +++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index aab549026ab8..3fe8f868f15e 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -64,6 +64,9 @@ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) +#define SYSCALL_WORK_ENTER (0) +#define SYSCALL_WORK_EXIT (0) + /* * TIF flags handled in exit_to_user_mode_loop() */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index fa17baadf63e..e7a11e38daba 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -42,7 +42,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } static long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long ti_work) + unsigned long ti_work, unsigned long work) { long ret = 0; @@ -74,11 +74,12 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, static __always_inline long __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); unsigned long ti_work; ti_work = READ_ONCE(current_thread_info()->flags); - if (ti_work & SYSCALL_ENTER_WORK) - syscall = syscall_trace_enter(regs, syscall, ti_work); + if (work & SYSCALL_WORK_ENTER || ti_work & SYSCALL_ENTER_WORK) + syscall = syscall_trace_enter(regs, syscall, ti_work, work); return syscall; } @@ -225,7 +226,8 @@ static inline bool report_single_step(unsigned long ti_work) } #endif -static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) +static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, + unsigned long work) { bool step; @@ -245,6 +247,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) */ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); u32 cached_flags = READ_ONCE(current_thread_info()->flags); unsigned long nr = syscall_get_nr(current, regs); @@ -262,8 +265,8 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) * enabled, we want to run them exactly once per syscall exit with * interrupts enabled. */ - if (unlikely(cached_flags & SYSCALL_EXIT_WORK)) - syscall_exit_work(regs, cached_flags); + if (unlikely(work & SYSCALL_WORK_EXIT || cached_flags & SYSCALL_EXIT_WORK)) + syscall_exit_work(regs, cached_flags, work); } __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -- cgit From 23d67a54857a768acdb0804cdd6037c324a50ecd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:00 -0500 Subject: seccomp: Migrate to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SECCOMP, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. 
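[ Editor's illustration: how the transitional helpers introduced two patches earlier expand for this flag; both expansions follow directly from the thread_info.h hunks above. ]

/* With CONFIG_GENERIC_ENTRY: */
set_task_syscall_work(p, SECCOMP);
/* expands to:
 *	set_bit(SYSCALL_WORK_BIT_SECCOMP, &task_thread_info(p)->syscall_work);
 */

/* Without CONFIG_GENERIC_ENTRY (TIF fallback): */
set_task_syscall_work(p, SECCOMP);
/* expands to:
 *	set_ti_thread_flag(task_thread_info(p), TIF_SECCOMP);
 */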
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-5-krisman@collabora.com --- include/asm-generic/syscall.h | 2 +- include/linux/entry-common.h | 8 ++------ include/linux/seccomp.h | 2 +- include/linux/thread_info.h | 6 ++++++ kernel/entry/common.c | 2 +- kernel/fork.c | 2 +- kernel/seccomp.c | 6 +++--- 7 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index f3135e734387..524d8e68ff5e 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -135,7 +135,7 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %TIF_SECCOMP. + * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 3fe8f868f15e..fa3cdb102dbf 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -21,10 +21,6 @@ # define _TIF_SYSCALL_TRACEPOINT (0) #endif -#ifndef _TIF_SECCOMP -# define _TIF_SECCOMP (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -49,7 +45,7 @@ #endif #define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) @@ -64,7 +60,7 @@ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) -#define SYSCALL_WORK_ENTER (0) +#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP) #define SYSCALL_WORK_EXIT (0) /* diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 02aef2844c38..47763f3999f7 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -42,7 +42,7 @@ struct seccomp { extern int __secure_computing(const struct seccomp_data *sd); static inline int secure_computing(void) { - if (unlikely(test_thread_flag(TIF_SECCOMP))) + if (unlikely(test_syscall_work(SECCOMP))) return __secure_computing(NULL); return 0; } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0e9fb15d6b42..a308ba4ef07b 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -35,6 +35,12 @@ enum { GOOD_STACK, }; +enum syscall_work_bit { + SYSCALL_WORK_BIT_SECCOMP, +}; + +#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) + #include #ifdef __KERNEL__ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e7a11e38daba..5747a6eb2c48 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -54,7 +54,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, } /* Do seccomp after ptrace, to catch any tracer changes. */ - if (ti_work & _TIF_SECCOMP) { + if (work & SYSCALL_WORK_SECCOMP) { ret = __secure_computing(NULL); if (ret == -1L) return ret; diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..bc5b1090f415 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1625,7 +1625,7 @@ static void copy_seccomp(struct task_struct *p) * to manually enable the seccomp thread flag here. 
*/ if (p->seccomp.mode != SECCOMP_MODE_DISABLED) - set_tsk_thread_flag(p, TIF_SECCOMP); + set_task_syscall_work(p, SECCOMP); #endif } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..f67e92d11ad7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -356,14 +356,14 @@ static inline void seccomp_assign_mode(struct task_struct *task, task->seccomp.mode = seccomp_mode; /* - * Make sure TIF_SECCOMP cannot be set before the mode (and + * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and * filter) is set. */ smp_mb__before_atomic(); /* Assume default seccomp processes want spec flaw mitigation. */ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) arch_seccomp_spec_mitigate(task); - set_tsk_thread_flag(task, TIF_SECCOMP); + set_task_syscall_work(task, SECCOMP); } #ifdef CONFIG_SECCOMP_FILTER @@ -929,7 +929,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* * Make sure that any changes to mode from another thread have - * been seen after TIF_SECCOMP was seen. + * been seen after SYSCALL_WORK_SECCOMP was seen. */ rmb(); -- cgit From 524666cb5de7c38a1925e7401a6e59d68682dd8c Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:01 -0500 Subject: tracepoints: Migrate to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_TRACEPOINT, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. 
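[ Editor's illustration: a hedged userspace sketch, not from this patch. Enabling a syscall tracepoint, here via tracefs (assuming it is mounted at /sys/kernel/tracing), is what drives syscall_regfunc() in the diff below to set SYSCALL_TRACEPOINT work on every thread. ]

#include <fcntl.h>
#include <unistd.h>

/* Turn on the raw sys_enter tracepoint, as e.g. ftrace/perf would. */
static int enable_sys_enter_tracepoint(void)
{
	int fd = open("/sys/kernel/tracing/events/raw_syscalls/sys_enter/enable",
		      O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}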
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-6-krisman@collabora.com --- include/linux/entry-common.h | 13 +++++-------- include/linux/thread_info.h | 2 ++ include/trace/syscall.h | 6 +++--- kernel/entry/common.c | 4 ++-- kernel/trace/trace_events.c | 8 ++++---- kernel/tracepoint.c | 4 ++-- 6 files changed, 18 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index fa3cdb102dbf..2a01eee2dbba 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -17,10 +17,6 @@ # define _TIF_SYSCALL_EMU (0) #endif -#ifndef _TIF_SYSCALL_TRACEPOINT -# define _TIF_SYSCALL_TRACEPOINT (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -46,7 +42,7 @@ #define SYSCALL_ENTER_WORK \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU | \ + _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) /* @@ -58,10 +54,11 @@ #define SYSCALL_EXIT_WORK \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) + ARCH_SYSCALL_EXIT_WORK) -#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP) -#define SYSCALL_WORK_EXIT (0) +#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ + SYSCALL_WORK_SYSCALL_TRACEPOINT) +#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index a308ba4ef07b..c232043c12d3 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -37,9 +37,11 @@ enum { enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, + SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) +#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #include diff --git a/include/trace/syscall.h b/include/trace/syscall.h index dc8ac27d27c1..8e193f3a33b3 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -37,10 +37,10 @@ struct syscall_metadata { #if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS) static inline void syscall_tracepoint_update(struct task_struct *p) { - if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) - set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); + if (test_syscall_work(SYSCALL_TRACEPOINT)) + set_task_syscall_work(p, SYSCALL_TRACEPOINT); else - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); + clear_task_syscall_work(p, SYSCALL_TRACEPOINT); } #else static inline void syscall_tracepoint_update(struct task_struct *p) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 5747a6eb2c48..f651967847ec 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -63,7 +63,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Either of the above might have changed the syscall number */ syscall = syscall_get_nr(current, regs); - if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall); syscall_enter_audit(regs, syscall); @@ -233,7 +233,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, audit_syscall_exit(regs); - if (ti_work & _TIF_SYSCALL_TRACEPOINT) + if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); step = report_single_step(ti_work); diff --git 
a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 47a71f96e5bc..adf65b502453 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3428,10 +3428,10 @@ static __init int event_trace_enable(void) * initialize events and perhaps start any events that are on the * command line. Unfortunately, there are some events that will not * start this early, like the system call tracepoints that need - * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() - * is called before pid 1 starts, and this flag is never set, making - * the syscall tracepoint never get reached, but the event is enabled - * regardless (and not doing anything). + * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But + * event_trace_enable() is called before pid 1 starts, and this flag + * is never set, making the syscall tracepoint never get reached, but + * the event is enabled regardless (and not doing anything). */ static __init int event_trace_enable_again(void) { diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 3f659f855074..7261fa0f5e3c 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -594,7 +594,7 @@ int syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + set_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } @@ -611,7 +611,7 @@ void syscall_unregfunc(void) if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + clear_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } -- cgit From 64c19ba29b66e98af9306b4a7525fb22c895d252 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:02 -0500 Subject: ptrace: Migrate to use SYSCALL_TRACE flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_TRACE, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. 
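[ Editor's illustration: a hedged sketch of the userspace side, not from this patch. PTRACE_SYSCALL is the request that now sets this work flag (see ptrace_resume() in the diff below); a minimal tracer loop looks like: ]

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

/* Stop an already-attached, currently stopped child at every syscall
 * entry and exit. Each PTRACE_SYSCALL resume makes the kernel run
 * set_task_syscall_work(child, SYSCALL_TRACE) on generic-entry
 * architectures.
 */
static int trace_syscalls(pid_t child)
{
	int status;

	for (;;) {
		if (ptrace(PTRACE_SYSCALL, child, 0, 0) == -1)
			return -1;
		if (waitpid(child, &status, 0) == -1)
			return -1;
		if (WIFEXITED(status))
			return 0;
		/* child stopped at syscall entry or exit; inspect registers here */
	}
}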
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-7-krisman@collabora.com --- include/asm-generic/syscall.h | 15 ++++++++------- include/linux/entry-common.h | 10 ++++++---- include/linux/thread_info.h | 2 ++ include/linux/tracehook.h | 17 +++++++++-------- kernel/entry/common.c | 4 ++-- kernel/fork.c | 2 +- kernel/ptrace.c | 6 +++--- 7 files changed, 31 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 524d8e68ff5e..ed94e5658d0c 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -43,7 +43,7 @@ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); * @regs: task_pt_regs() of @task * * It's only valid to call this when @task is stopped for system - * call exit tracing (due to TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), + * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), * after tracehook_report_syscall_entry() returned nonzero to prevent * the system call from taking place. * @@ -63,7 +63,7 @@ void syscall_rollback(struct task_struct *task, struct pt_regs *regs); * Returns 0 if the system call succeeded, or -ERRORCODE if it failed. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); @@ -76,7 +76,7 @@ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); * This value is meaningless if syscall_get_error() returned nonzero. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); @@ -93,7 +93,7 @@ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); * code; the user sees a failed system call with this errno code. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val); @@ -108,7 +108,7 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, * @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); @@ -123,7 +123,7 @@ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, * The first argument gets value @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. 
*/ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args); @@ -135,7 +135,8 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %SYSCALL_WORK_SECCOMP. + * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or + * %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 2a01eee2dbba..ae426ab9c372 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -41,7 +41,7 @@ #endif #define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + (_TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) @@ -53,12 +53,14 @@ #endif #define SYSCALL_EXIT_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + (_TIF_SYSCALL_AUDIT | \ ARCH_SYSCALL_EXIT_WORK) #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ - SYSCALL_WORK_SYSCALL_TRACEPOINT) -#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT) + SYSCALL_WORK_SYSCALL_TRACEPOINT | \ + SYSCALL_WORK_SYSCALL_TRACE) +#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ + SYSCALL_WORK_SYSCALL_TRACE) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index c232043c12d3..761a4590d554 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -38,10 +38,12 @@ enum { enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, + SYSCALL_WORK_BIT_SYSCALL_TRACE, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) +#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #include diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index f7d82e4fafd6..3f20368afe9e 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -83,11 +83,12 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, * tracehook_report_syscall_entry - task is about to attempt a system call * @regs: user register state of current task * - * This will be called if %TIF_SYSCALL_TRACE or %TIF_SYSCALL_EMU have been set, - * when the current task has just entered the kernel for a system call. - * Full user register state is available here. Changing the values - * in @regs can affect the system call number and arguments to be tried. - * It is safe to block here, preventing the system call from beginning. + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or + * %TIF_SYSCALL_EMU have been set, when the current task has just + * entered the kernel for a system call. Full user register state is + * available here. Changing the values in @regs can affect the system + * call number and arguments to be tried. It is safe to block here, + * preventing the system call from beginning. * * Returns zero normally, or nonzero if the calling arch code should abort * the system call. 
That must prevent normal entry so no system call is @@ -109,15 +110,15 @@ static inline __must_check int tracehook_report_syscall_entry( * @regs: user register state of current task * @step: nonzero if simulating single-step or block-step * - * This will be called if %TIF_SYSCALL_TRACE has been set, when the - * current task has just finished an attempted system call. Full + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when + * the current task has just finished an attempted system call. Full * user register state is available here. It is safe to block here, * preventing signals from being processed. * * If @step is nonzero, this report is also in lieu of the normal * trap that would follow the system call instruction because * user_enable_block_step() or user_enable_single_step() was used. - * In this case, %TIF_SYSCALL_TRACE might not be set. + * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set. * * Called without locks, just before checking for pending signals. */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f651967847ec..917328a9edaa 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,7 +47,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, long ret = 0; /* Handle ptrace */ - if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + if (work & SYSCALL_WORK_SYSCALL_TRACE || ti_work & _TIF_SYSCALL_EMU) { ret = arch_syscall_enter_tracehook(regs); if (ret || (ti_work & _TIF_SYSCALL_EMU)) return -1L; @@ -237,7 +237,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, trace_sys_exit(regs, syscall_get_return_value(current, regs)); step = report_single_step(ti_work); - if (step || ti_work & _TIF_SYSCALL_TRACE) + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } diff --git a/kernel/fork.c b/kernel/fork.c index bc5b1090f415..99f68c20f2ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2158,7 +2158,7 @@ static __latent_entropy struct task_struct *copy_process( * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); + clear_task_syscall_work(p, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..55a2bc3186a7 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,7 +117,7 @@ void __ptrace_unlink(struct task_struct *child) const struct cred *old_cred; BUG_ON(!child->ptrace); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif @@ -812,9 +812,9 @@ static int ptrace_resume(struct task_struct *child, long request, return -EIO; if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + set_task_syscall_work(child, SYSCALL_TRACE); else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) -- cgit From 64eb35f701f04b30706e21d1b02636b5d31a37d2 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:03 -0500 Subject: ptrace: Migrate TIF_SYSCALL_EMU to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. 
Define SYSCALL_WORK_SYSCALL_EMU, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-8-krisman@collabora.com --- include/linux/entry-common.h | 8 ++------ include/linux/thread_info.h | 2 ++ include/linux/tracehook.h | 2 +- kernel/entry/common.c | 19 ++++++++++--------- kernel/fork.c | 4 ++-- kernel/ptrace.c | 10 +++++----- 6 files changed, 22 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index ae426ab9c372..b30f82bed92b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -13,10 +13,6 @@ * Define dummy _TIF work flags if not defined by the architecture or for * disabled functionality. */ -#ifndef _TIF_SYSCALL_EMU -# define _TIF_SYSCALL_EMU (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -42,7 +38,6 @@ #define SYSCALL_ENTER_WORK \ (_TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) /* @@ -58,7 +53,8 @@ #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ - SYSCALL_WORK_SYSCALL_TRACE) + SYSCALL_WORK_SYSCALL_TRACE | \ + SYSCALL_WORK_SYSCALL_EMU) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 761a4590d554..85b8a4216168 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -39,11 +39,13 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, + SYSCALL_WORK_BIT_SYSCALL_EMU, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) +#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #include diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3f20368afe9e..54b925224a13 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -84,7 +84,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, * @regs: user register state of current task * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or - * %TIF_SYSCALL_EMU have been set, when the current task has just + * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just * entered the kernel for a system call. Full user register state is * available here. Changing the values in @regs can affect the system * call number and arguments to be tried. 
It is safe to block here, diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 917328a9edaa..90533f34ea99 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,9 +47,9 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, long ret = 0; /* Handle ptrace */ - if (work & SYSCALL_WORK_SYSCALL_TRACE || ti_work & _TIF_SYSCALL_EMU) { + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { ret = arch_syscall_enter_tracehook(regs); - if (ret || (ti_work & _TIF_SYSCALL_EMU)) + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) return -1L; } @@ -208,21 +208,22 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) } #ifndef _TIF_SINGLESTEP -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { return false; } #else /* - * If TIF_SYSCALL_EMU is set, then the only reason to report is when + * If SYSCALL_EMU is set, then the only reason to report is when * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall * instruction has been already reported in syscall_enter_from_user_mode(). */ -#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) - -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { - return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; + if (!(work & SYSCALL_WORK_SYSCALL_EMU)) + return false; + + return !!(current_thread_info()->flags & _TIF_SINGLESTEP); } #endif @@ -236,7 +237,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); - step = report_single_step(ti_work); + step = report_single_step(work); if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } diff --git a/kernel/fork.c b/kernel/fork.c index 99f68c20f2ff..02b689a23457 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2159,8 +2159,8 @@ static __latent_entropy struct task_struct *copy_process( */ user_disable_single_step(p); clear_task_syscall_work(p, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(p, SYSCALL_EMU); #endif clear_tsk_latency_tracing(p); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 55a2bc3186a7..237bcd6d255c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -118,8 +118,8 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(child, SYSCALL_EMU); #endif child->parent = child->real_parent; @@ -816,11 +816,11 @@ static int ptrace_resume(struct task_struct *child, long request, else clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + set_task_syscall_work(child, SYSCALL_EMU); else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_task_syscall_work(child, SYSCALL_EMU); #endif if (is_singleblock(request)) { -- cgit From 785dc4eb7fd74e3b7f4eac468457b633117e1aea Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:04 -0500 Subject: audit: Migrate to use 
SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_AUDIT, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-9-krisman@collabora.com --- include/asm-generic/syscall.h | 23 ++++++++++++++--------- include/linux/entry-common.h | 18 ++++++------------ include/linux/thread_info.h | 2 ++ kernel/auditsc.c | 4 ++-- 4 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index ed94e5658d0c..524218ae3825 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -43,9 +43,9 @@ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); * @regs: task_pt_regs() of @task * * It's only valid to call this when @task is stopped for system - * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), - * after tracehook_report_syscall_entry() returned nonzero to prevent - * the system call from taking place. + * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT), after tracehook_report_syscall_entry() + * returned nonzero to prevent the system call from taking place. * * This rolls back the register state in @regs so it's as if the * system call instruction was a no-op. The registers containing @@ -63,7 +63,8 @@ void syscall_rollback(struct task_struct *task, struct pt_regs *regs); * Returns 0 if the system call succeeded, or -ERRORCODE if it failed. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); @@ -76,7 +77,8 @@ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); * This value is meaningless if syscall_get_error() returned nonzero. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); @@ -93,7 +95,8 @@ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); * code; the user sees a failed system call with this errno code. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val); @@ -108,7 +111,8 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, * @args[0], and so on. 
* * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); @@ -123,7 +127,8 @@ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, * The first argument gets value @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args); @@ -135,7 +140,7 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or + * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %SYSCALL_WORK_SYSCALL_AUDIT, or * %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b30f82bed92b..d7b96f42817f 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -13,10 +13,6 @@ * Define dummy _TIF work flags if not defined by the architecture or for * disabled functionality. */ -#ifndef _TIF_SYSCALL_AUDIT -# define _TIF_SYSCALL_AUDIT (0) -#endif - #ifndef _TIF_PATCH_PENDING # define _TIF_PATCH_PENDING (0) #endif @@ -36,9 +32,7 @@ # define ARCH_SYSCALL_ENTER_WORK (0) #endif -#define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_AUDIT | \ - ARCH_SYSCALL_ENTER_WORK) +#define SYSCALL_ENTER_WORK ARCH_SYSCALL_ENTER_WORK /* * TIF flags handled in syscall_exit_to_user_mode() @@ -47,16 +41,16 @@ # define ARCH_SYSCALL_EXIT_WORK (0) #endif -#define SYSCALL_EXIT_WORK \ - (_TIF_SYSCALL_AUDIT | \ - ARCH_SYSCALL_EXIT_WORK) +#define SYSCALL_EXIT_WORK ARCH_SYSCALL_EXIT_WORK #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ - SYSCALL_WORK_SYSCALL_EMU) + SYSCALL_WORK_SYSCALL_EMU | \ + SYSCALL_WORK_SYSCALL_AUDIT) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ - SYSCALL_WORK_SYSCALL_TRACE) + SYSCALL_WORK_SYSCALL_TRACE | \ + SYSCALL_WORK_SYSCALL_AUDIT) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 85b8a4216168..317363212ae9 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -40,12 +40,14 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, + SYSCALL_WORK_BIT_SYSCALL_AUDIT, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) +#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) #include diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8dba8f0983b5..c00aa5837965 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -952,7 +952,7 @@ int audit_alloc(struct 
task_struct *tsk) state = audit_filter_task(tsk, &key); if (state == AUDIT_DISABLED) { - clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + clear_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } @@ -964,7 +964,7 @@ int audit_alloc(struct task_struct *tsk) context->filterkey = key; audit_set_context(tsk, context); - set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + set_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } -- cgit From 2991552447707d791d9d81a5dc161f9e9e90b163 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:05 -0500 Subject: entry: Drop usage of TIF flags in the generic syscall code Now that the flags migration in the common syscall entry code is complete and the code relies exclusively on thread_info::syscall_work, clean up the accesses to TI flags in that path. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-10-krisman@collabora.com --- include/linux/entry-common.h | 26 ++++++++++++-------------- kernel/entry/common.c | 17 +++++++---------- 2 files changed, 19 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d7b96f42817f..49b26b216e4e 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -26,31 +26,29 @@ #endif /* - * TIF flags handled in syscall_enter_from_user_mode() + * SYSCALL_WORK flags handled in syscall_enter_from_user_mode() */ -#ifndef ARCH_SYSCALL_ENTER_WORK -# define ARCH_SYSCALL_ENTER_WORK (0) +#ifndef ARCH_SYSCALL_WORK_ENTER +# define ARCH_SYSCALL_WORK_ENTER (0) #endif -#define SYSCALL_ENTER_WORK ARCH_SYSCALL_ENTER_WORK - /* - * TIF flags handled in syscall_exit_to_user_mode() + * SYSCALL_WORK flags handled in syscall_exit_to_user_mode() */ -#ifndef ARCH_SYSCALL_EXIT_WORK -# define ARCH_SYSCALL_EXIT_WORK (0) +#ifndef ARCH_SYSCALL_WORK_EXIT +# define ARCH_SYSCALL_WORK_EXIT (0) #endif -#define SYSCALL_EXIT_WORK ARCH_SYSCALL_EXIT_WORK - #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_EMU | \ - SYSCALL_WORK_SYSCALL_AUDIT) + SYSCALL_WORK_SYSCALL_AUDIT | \ + ARCH_SYSCALL_WORK_ENTER) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ - SYSCALL_WORK_SYSCALL_AUDIT) + SYSCALL_WORK_SYSCALL_AUDIT | \ + ARCH_SYSCALL_WORK_EXIT) /* * TIF flags handled in exit_to_user_mode_loop() @@ -136,8 +134,8 @@ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); * * It handles the following work items: * - * 1) TIF flag dependent invocations of arch_syscall_enter_tracehook(), - * __secure_computing(), trace_sys_enter() + * 1) syscall_work flag dependent invocations of + * arch_syscall_enter_tracehook(), __secure_computing(), trace_sys_enter() * 2) Invocation of audit_syscall_entry() */ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 90533f34ea99..91e8fd50adf4 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -42,7 +42,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } static long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long ti_work, unsigned long work) + unsigned long work) { long ret = 0; @@ -75,11 +75,9 @@ static __always_inline long __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { unsigned long work = 
READ_ONCE(current_thread_info()->syscall_work); - unsigned long ti_work; - ti_work = READ_ONCE(current_thread_info()->flags); - if (work & SYSCALL_WORK_ENTER || ti_work & SYSCALL_ENTER_WORK) - syscall = syscall_trace_enter(regs, syscall, ti_work, work); + if (work & SYSCALL_WORK_ENTER) + syscall = syscall_trace_enter(regs, syscall, work); return syscall; } @@ -227,8 +225,8 @@ static inline bool report_single_step(unsigned long work) } #endif -static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, - unsigned long work) + +static void syscall_exit_work(struct pt_regs *regs, unsigned long work) { bool step; @@ -249,7 +247,6 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - u32 cached_flags = READ_ONCE(current_thread_info()->flags); unsigned long nr = syscall_get_nr(current, regs); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -266,8 +263,8 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) * enabled, we want to run them exactly once per syscall exit with * interrupts enabled. */ - if (unlikely(work & SYSCALL_WORK_EXIT || cached_flags & SYSCALL_EXIT_WORK)) - syscall_exit_work(regs, cached_flags, work); + if (unlikely(work & SYSCALL_WORK_EXIT)) + syscall_exit_work(regs, work); } __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -- cgit From e8b7db38449ac5b950a3f00519171c4be3e226ff Mon Sep 17 00:00:00 2001 From: Andres Beltran Date: Mon, 9 Nov 2020 11:04:00 +0100 Subject: Drivers: hv: vmbus: Add vmbus_requestor data structure for VMBus hardening Currently, VMbus drivers use pointers into guest memory as request IDs for interactions with Hyper-V. To be more robust in the face of errors or malicious behavior from a compromised Hyper-V, avoid exposing guest memory addresses to Hyper-V. Also avoid Hyper-V giving back a bad request ID that is then treated as the address of a guest data structure with no validation. Instead, encapsulate these memory addresses and provide small integers as request IDs. Signed-off-by: Andres Beltran Co-developed-by: Andrea Parri (Microsoft) Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Reviewed-by: Wei Liu Link: https://lore.kernel.org/r/20201109100402.8946-2-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel.c | 174 ++++++++++++++++++++++++++++++++++++++++++++-- drivers/hv/hyperv_vmbus.h | 3 +- drivers/hv/ring_buffer.c | 29 +++++++- include/linux/hyperv.h | 22 ++++++ 4 files changed, 219 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index fbdda9938039..6fb0c76bfbf8 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -503,6 +503,70 @@ int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, } EXPORT_SYMBOL_GPL(vmbus_establish_gpadl); +/** + * request_arr_init - Allocates memory for the requestor array. Each slot + * keeps track of the next available slot in the array. Initially, each + * slot points to the next one (as in a Linked List). The last slot + * does not point to anything, so its value is U64_MAX by default. 
+ * @size The size of the array + */ +static u64 *request_arr_init(u32 size) +{ + int i; + u64 *req_arr; + + req_arr = kcalloc(size, sizeof(u64), GFP_KERNEL); + if (!req_arr) + return NULL; + + for (i = 0; i < size - 1; i++) + req_arr[i] = i + 1; + + /* Last slot (no more available slots) */ + req_arr[i] = U64_MAX; + + return req_arr; +} + +/* + * vmbus_alloc_requestor - Initializes @rqstor's fields. + * Index 0 is the first free slot + * @size: Size of the requestor array + */ +static int vmbus_alloc_requestor(struct vmbus_requestor *rqstor, u32 size) +{ + u64 *rqst_arr; + unsigned long *bitmap; + + rqst_arr = request_arr_init(size); + if (!rqst_arr) + return -ENOMEM; + + bitmap = bitmap_zalloc(size, GFP_KERNEL); + if (!bitmap) { + kfree(rqst_arr); + return -ENOMEM; + } + + rqstor->req_arr = rqst_arr; + rqstor->req_bitmap = bitmap; + rqstor->size = size; + rqstor->next_request_id = 0; + spin_lock_init(&rqstor->req_lock); + + return 0; +} + +/* + * vmbus_free_requestor - Frees memory allocated for @rqstor + * @rqstor: Pointer to the requestor struct + */ +static void vmbus_free_requestor(struct vmbus_requestor *rqstor) +{ + kfree(rqstor->req_arr); + bitmap_free(rqstor->req_bitmap); +} + static int __vmbus_open(struct vmbus_channel *newchannel, void *userdata, u32 userdatalen, void (*onchannelcallback)(void *context), void *context) @@ -523,6 +587,12 @@ static int __vmbus_open(struct vmbus_channel *newchannel, if (newchannel->state != CHANNEL_OPEN_STATE) return -EINVAL; + /* Create and init requestor */ + if (newchannel->rqstor_size) { + if (vmbus_alloc_requestor(&newchannel->requestor, newchannel->rqstor_size)) + return -ENOMEM; + } + newchannel->state = CHANNEL_OPENING_STATE; newchannel->onchannel_callback = onchannelcallback; newchannel->channel_callback_context = context; @@ -626,6 +696,7 @@ error_free_gpadl: error_clean_ring: hv_ringbuffer_cleanup(&newchannel->outbound); hv_ringbuffer_cleanup(&newchannel->inbound); + vmbus_free_requestor(&newchannel->requestor); newchannel->state = CHANNEL_OPEN_STATE; return err; } @@ -808,6 +879,9 @@ static int vmbus_close_internal(struct vmbus_channel *channel) channel->ringbuffer_gpadlhandle = 0; } + if (!ret) + vmbus_free_requestor(&channel->requestor); + return ret; } @@ -888,7 +962,7 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, /* in 8-bytes granularity */ desc.offset8 = sizeof(struct vmpacket_descriptor) >> 3; desc.len8 = (u16)(packetlen_aligned >> 3); - desc.trans_id = requestid; + desc.trans_id = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ bufferlist[0].iov_base = &desc; bufferlist[0].iov_len = sizeof(struct vmpacket_descriptor); @@ -897,7 +971,7 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, bufferlist[2].iov_base = &aligned_data; bufferlist[2].iov_len = (packetlen_aligned - packetlen); - return hv_ringbuffer_write(channel, bufferlist, num_vecs); + return hv_ringbuffer_write(channel, bufferlist, num_vecs, requestid); } EXPORT_SYMBOL(vmbus_sendpacket); @@ -939,7 +1013,7 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ desc.length8 = (u16)(packetlen_aligned >> 3); - desc.transactionid = requestid; + desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc.reserved = 0; desc.rangecount = pagecount; @@ -956,7 +1030,7 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, bufferlist[2].iov_base = 
&aligned_data; bufferlist[2].iov_len = (packetlen_aligned - packetlen); - return hv_ringbuffer_write(channel, bufferlist, 3); + return hv_ringbuffer_write(channel, bufferlist, 3, requestid); } EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); @@ -983,7 +1057,7 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, desc->flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; desc->dataoffset8 = desc_size >> 3; /* in 8-bytes granularity */ desc->length8 = (u16)(packetlen_aligned >> 3); - desc->transactionid = requestid; + desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc->reserved = 0; desc->rangecount = 1; @@ -994,7 +1068,7 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, bufferlist[2].iov_base = &aligned_data; bufferlist[2].iov_len = (packetlen_aligned - packetlen); - return hv_ringbuffer_write(channel, bufferlist, 3); + return hv_ringbuffer_write(channel, bufferlist, 3, requestid); } EXPORT_SYMBOL_GPL(vmbus_sendpacket_mpb_desc); @@ -1042,3 +1116,91 @@ int vmbus_recvpacket_raw(struct vmbus_channel *channel, void *buffer, buffer_actual_len, requestid, true); } EXPORT_SYMBOL_GPL(vmbus_recvpacket_raw); + +/* + * vmbus_next_request_id - Returns a new request id. It is also + * the index at which the guest memory address is stored. + * Uses a spin lock to avoid race conditions. + * @rqstor: Pointer to the requestor struct + * @rqst_add: Guest memory address to be stored in the array + */ +u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr) +{ + unsigned long flags; + u64 current_id; + const struct vmbus_channel *channel = + container_of(rqstor, const struct vmbus_channel, requestor); + + /* Check rqstor has been initialized */ + if (!channel->rqstor_size) + return VMBUS_NO_RQSTOR; + + spin_lock_irqsave(&rqstor->req_lock, flags); + current_id = rqstor->next_request_id; + + /* Requestor array is full */ + if (current_id >= rqstor->size) { + spin_unlock_irqrestore(&rqstor->req_lock, flags); + return VMBUS_RQST_ERROR; + } + + rqstor->next_request_id = rqstor->req_arr[current_id]; + rqstor->req_arr[current_id] = rqst_addr; + + /* The already held spin lock provides atomicity */ + bitmap_set(rqstor->req_bitmap, current_id, 1); + + spin_unlock_irqrestore(&rqstor->req_lock, flags); + + /* + * Cannot return an ID of 0, which is reserved for an unsolicited + * message from Hyper-V. + */ + return current_id + 1; +} +EXPORT_SYMBOL_GPL(vmbus_next_request_id); + +/* + * vmbus_request_addr - Returns the memory address stored at @trans_id + * in @rqstor. Uses a spin lock to avoid race conditions. + * @rqstor: Pointer to the requestor struct + * @trans_id: Request id sent back from Hyper-V. Becomes the requestor's + * next request id. 
+ */ +u64 vmbus_request_addr(struct vmbus_requestor *rqstor, u64 trans_id) +{ + unsigned long flags; + u64 req_addr; + const struct vmbus_channel *channel = + container_of(rqstor, const struct vmbus_channel, requestor); + + /* Check rqstor has been initialized */ + if (!channel->rqstor_size) + return VMBUS_NO_RQSTOR; + + /* Hyper-V can send an unsolicited message with ID of 0 */ + if (!trans_id) + return trans_id; + + spin_lock_irqsave(&rqstor->req_lock, flags); + + /* Data corresponding to trans_id is stored at trans_id - 1 */ + trans_id--; + + /* Invalid trans_id */ + if (trans_id >= rqstor->size || !test_bit(trans_id, rqstor->req_bitmap)) { + spin_unlock_irqrestore(&rqstor->req_lock, flags); + return VMBUS_RQST_ERROR; + } + + req_addr = rqstor->req_arr[trans_id]; + rqstor->req_arr[trans_id] = rqstor->next_request_id; + rqstor->next_request_id = trans_id; + + /* The already held spin lock provides atomicity */ + bitmap_clear(rqstor->req_bitmap, trans_id, 1); + + spin_unlock_irqrestore(&rqstor->req_lock, flags); + return req_addr; +} +EXPORT_SYMBOL_GPL(vmbus_request_addr); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 40e2b9f91163..02f3e8988836 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -179,7 +179,8 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); int hv_ringbuffer_write(struct vmbus_channel *channel, - const struct kvec *kv_list, u32 kv_count); + const struct kvec *kv_list, u32 kv_count, + u64 requestid); int hv_ringbuffer_read(struct vmbus_channel *channel, void *buffer, u32 buflen, u32 *buffer_actual_len, diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 356e22159e83..35833d4d1a1d 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -248,7 +248,8 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info) /* Write to the ring buffer. */ int hv_ringbuffer_write(struct vmbus_channel *channel, - const struct kvec *kv_list, u32 kv_count) + const struct kvec *kv_list, u32 kv_count, + u64 requestid) { int i; u32 bytes_avail_towrite; @@ -258,6 +259,8 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, u64 prev_indices; unsigned long flags; struct hv_ring_buffer_info *outring_info = &channel->outbound; + struct vmpacket_descriptor *desc = kv_list[0].iov_base; + u64 rqst_id = VMBUS_NO_RQSTOR; if (channel->rescind) return -ENODEV; @@ -300,6 +303,23 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, kv_list[i].iov_len); } + /* + * Allocate the request ID after the data has been copied into the + * ring buffer. Once this request ID is allocated, the completion + * path could find the data and free it. + */ + + if (desc->flags == VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED) { + rqst_id = vmbus_next_request_id(&channel->requestor, requestid); + if (rqst_id == VMBUS_RQST_ERROR) { + spin_unlock_irqrestore(&outring_info->ring_lock, flags); + pr_err("No request id available\n"); + return -EAGAIN; + } + } + desc = hv_get_ring_buffer(outring_info) + old_write; + desc->trans_id = (rqst_id == VMBUS_NO_RQSTOR) ? 
requestid : rqst_id; + /* Set previous packet start */ prev_indices = hv_get_ring_bufferindices(outring_info); @@ -319,8 +339,13 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, hv_signal_on_write(old_write, channel); - if (channel->rescind) + if (channel->rescind) { + if (rqst_id != VMBUS_NO_RQSTOR) { + /* Reclaim request ID to avoid leak of IDs */ + vmbus_request_addr(&channel->requestor, rqst_id); + } return -ENODEV; + } return 0; } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 1ce131f29f3b..5b6d5c4e3711 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -764,6 +764,22 @@ enum vmbus_device_type { HV_UNKNOWN, }; +/* + * Provides request ids for VMBus. Encapsulates guest memory + * addresses and stores the next available slot in req_arr + * to generate new ids in constant time. + */ +struct vmbus_requestor { + u64 *req_arr; + unsigned long *req_bitmap; /* is a given slot available? */ + u32 size; + u64 next_request_id; + spinlock_t req_lock; /* provides atomicity */ +}; + +#define VMBUS_NO_RQSTOR U64_MAX +#define VMBUS_RQST_ERROR (U64_MAX - 1) + struct vmbus_device { u16 dev_type; guid_t guid; @@ -988,8 +1004,14 @@ struct vmbus_channel { u32 fuzz_testing_interrupt_delay; u32 fuzz_testing_message_delay; + /* request/transaction ids for VMBus */ + struct vmbus_requestor requestor; + u32 rqstor_size; }; +u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr); +u64 vmbus_request_addr(struct vmbus_requestor *rqstor, u64 trans_id); + static inline bool is_hvsock_channel(const struct vmbus_channel *c) { return !!(c->offermsg.offer.chn_flags & -- cgit From 4d18fcc95f50950a99bd940d4e61a983f91d267a Mon Sep 17 00:00:00 2001 From: Andres Beltran Date: Mon, 9 Nov 2020 11:04:02 +0100 Subject: hv_netvsc: Use vmbus_requestor to generate transaction IDs for VMBus hardening Currently, pointers to guest memory are passed to Hyper-V as transaction IDs in netvsc. In the face of errors or malicious behavior in Hyper-V, netvsc should not expose or trust the transaction IDs returned by Hyper-V to be valid guest memory addresses. Instead, use small integers generated by vmbus_requestor as requests (transaction) IDs. Signed-off-by: Andres Beltran Co-developed-by: Andrea Parri (Microsoft) Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Acked-by: Jakub Kicinski Reviewed-by: Wei Liu Cc: "David S. 
Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Link: https://lore.kernel.org/r/20201109100402.8946-4-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/net/hyperv/hyperv_net.h | 13 +++++++++++++ drivers/net/hyperv/netvsc.c | 22 ++++++++++++++++------ drivers/net/hyperv/rndis_filter.c | 1 + include/linux/hyperv.h | 1 + 4 files changed, 31 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index a0f338cf1424..2a87cfa27ac0 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -847,6 +847,19 @@ struct nvsp_message { #define NETVSC_XDP_HDRM 256 +#define NETVSC_MIN_OUT_MSG_SIZE (sizeof(struct vmpacket_descriptor) + \ + sizeof(struct nvsp_message)) +#define NETVSC_MIN_IN_MSG_SIZE sizeof(struct vmpacket_descriptor) + +/* Estimated requestor size: + * out_ring_size/min_out_msg_size + in_ring_size/min_in_msg_size + */ +static inline u32 netvsc_rqstor_size(unsigned long ringbytes) +{ + return ringbytes / NETVSC_MIN_OUT_MSG_SIZE + + ringbytes / NETVSC_MIN_IN_MSG_SIZE; +} + #define NETVSC_XFER_HEADER_SIZE(rng_cnt) \ (offsetof(struct vmtransfer_page_packet_header, ranges) + \ (rng_cnt) * sizeof(struct vmtransfer_page_range)) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 0c3de94b5178..4dbc0055aed0 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -50,7 +50,7 @@ void netvsc_switch_datapath(struct net_device *ndev, bool vf) vmbus_sendpacket(dev->channel, init_pkt, sizeof(struct nvsp_message), - (unsigned long)init_pkt, + VMBUS_RQST_ID_NO_RESPONSE, VM_PKT_DATA_INBAND, 0); } @@ -163,7 +163,7 @@ static void netvsc_revoke_recv_buf(struct hv_device *device, ret = vmbus_sendpacket(device->channel, revoke_packet, sizeof(struct nvsp_message), - (unsigned long)revoke_packet, + VMBUS_RQST_ID_NO_RESPONSE, VM_PKT_DATA_INBAND, 0); /* If the failure is because the channel is rescinded; * ignore the failure since we cannot send on a rescinded @@ -213,7 +213,7 @@ static void netvsc_revoke_send_buf(struct hv_device *device, ret = vmbus_sendpacket(device->channel, revoke_packet, sizeof(struct nvsp_message), - (unsigned long)revoke_packet, + VMBUS_RQST_ID_NO_RESPONSE, VM_PKT_DATA_INBAND, 0); /* If the failure is because the channel is rescinded; @@ -557,7 +557,7 @@ static int negotiate_nvsp_ver(struct hv_device *device, ret = vmbus_sendpacket(device->channel, init_packet, sizeof(struct nvsp_message), - (unsigned long)init_packet, + VMBUS_RQST_ID_NO_RESPONSE, VM_PKT_DATA_INBAND, 0); return ret; @@ -614,7 +614,7 @@ static int netvsc_connect_vsp(struct hv_device *device, /* Send the init request */ ret = vmbus_sendpacket(device->channel, init_packet, sizeof(struct nvsp_message), - (unsigned long)init_packet, + VMBUS_RQST_ID_NO_RESPONSE, VM_PKT_DATA_INBAND, 0); if (ret != 0) goto cleanup; @@ -695,10 +695,19 @@ static void netvsc_send_tx_complete(struct net_device *ndev, const struct vmpacket_descriptor *desc, int budget) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long)desc->trans_id; struct net_device_context *ndev_ctx = netdev_priv(ndev); + struct sk_buff *skb; u16 q_idx = 0; int queue_sends; + u64 cmd_rqst; + + cmd_rqst = vmbus_request_addr(&channel->requestor, (u64)desc->trans_id); + if (cmd_rqst == VMBUS_RQST_ERROR) { + netdev_err(ndev, "Incorrect transaction id\n"); + return; + } + + skb = (struct sk_buff *)(unsigned long)cmd_rqst; /* Notify the layer above us */ if (likely(skb)) { @@ -1520,6 +1529,7 @@ struct netvsc_device 
*netvsc_device_add(struct hv_device *device, netvsc_poll, NAPI_POLL_WEIGHT); /* Open the channel */ + device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); ret = vmbus_open(device->channel, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, net_device->chan_table); diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index b22e47bcfeca..6ae43319ece6 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1172,6 +1172,7 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) /* Set the channel before opening.*/ nvchan->channel = new_sc; + new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); ret = vmbus_open(new_sc, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, nvchan); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 5b6d5c4e3711..5ddb479c4d4c 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -779,6 +779,7 @@ struct vmbus_requestor { #define VMBUS_NO_RQSTOR U64_MAX #define VMBUS_RQST_ERROR (U64_MAX - 1) +#define VMBUS_RQST_ID_NO_RESPONSE (U64_MAX - 2) struct vmbus_device { u16 dev_type; -- cgit From f97bb5272d9e95d400d6c8643ebb146b3e3e7842 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2020 09:08:41 +0100 Subject: sched: Fix data-race in wakeup Mel reported that on some ARM64 platforms loadavg goes bananas and Will tracked it down to the following race:

    CPU0                                    CPU1

    schedule()
      prev->sched_contributes_to_load = X;
      deactivate_task(prev);
                                            try_to_wake_up()
                                              if (p->on_rq &&) // false
                                              if (smp_load_acquire(&p->on_cpu) && // true
                                                ttwu_queue_wakelist())
                                                  p->sched_remote_wakeup = Y;
      smp_store_release(prev->on_cpu, 0);

where both p->sched_contributes_to_load and p->sched_remote_wakeup are in the same word, and thus the stores X and Y race (and can clobber one another's data). Whereas prior to commit c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") the p->on_cpu handoff serialized access to p->sched_remote_wakeup (just as it still does with p->sched_contributes_to_load) that commit broke that by calling ttwu_queue_wakelist() with p->on_cpu != 0. However, due to

    p->XXX = X                              ttwu()
    schedule()                                if (p->on_rq && ...) // false
      smp_mb__after_spinlock()                if (smp_load_acquire(&p->on_cpu) &&
      deactivate_task()                           ttwu_queue_wakelist())
        p->on_rq = 0;                               p->sched_remote_wakeup = Y;

We can be sure any 'current' store is complete and 'current' is guaranteed asleep. Therefore we can move p->sched_remote_wakeup into the current flags word. Note: while the observed failure was loadavg accounting gone wrong due to ttwu() clobbering p->sched_contributes_to_load, the reverse problem is also possible where schedule() clobbers p->sched_remote_wakeup; this could result in enqueue_entity() wrecking ->vruntime and causing scheduling artifacts.
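The mechanism is easy to reproduce outside the kernel. Below is a minimal user-space sketch of the same hazard (illustrative only; the struct and function names are made up, and whether a given run observes the loss depends on timing): two one-bit fields share one machine word, and unserialized writers update them with plain read-modify-write cycles of that word:

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    /* Two one-bit flags packed into the same word, as in task_struct. */
    struct flags {
            unsigned contributes_to_load:1;
            unsigned remote_wakeup:1;
    };

    static struct flags f;

    static void *set_load(void *arg)
    {
            (void)arg;
            f.contributes_to_load = 1;  /* load word, set bit, store word */
            return NULL;
    }

    static void *set_wakeup(void *arg)
    {
            (void)arg;
            f.remote_wakeup = 1;        /* same word: races with set_load() */
            return NULL;
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 100000; i++) {
                    pthread_t a, b;

                    memset(&f, 0, sizeof(f));
                    pthread_create(&a, NULL, set_load, NULL);
                    pthread_create(&b, NULL, set_wakeup, NULL);
                    pthread_join(a, NULL);
                    pthread_join(b, NULL);

                    /* On a bad interleaving one read-modify-write
                     * overwrites the other and a flag is lost. */
                    if (!f.contributes_to_load || !f.remote_wakeup) {
                            printf("lost update in iteration %d\n", i);
                            return 1;
                    }
            }
            return 0;
    }

The patch below removes exactly this pattern: relying on the ordering documented in the added comment, sched_remote_wakeup no longer shares a word with the bits schedule() writes.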
Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Mel Gorman Debugged-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117083016.GK3121392@hirez.programming.kicks-ass.net --- include/linux/sched.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d383cf09e78f..0e91b451d2a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -769,7 +769,6 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; - unsigned sched_remote_wakeup:1; #ifdef CONFIG_PSI unsigned sched_psi_wake_requeue:1; #endif @@ -779,6 +778,21 @@ struct task_struct { /* Unserialized, strictly 'current' */ + /* + * This field must not be in the scheduler word above due to wakelist + * queueing no longer being serialized by p->on_cpu. However: + * + * p->XXX = X; ttwu() + * schedule() if (p->on_rq && ..) // false + * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true + * deactivate_task() ttwu_queue_wakelist()) + * p->on_rq = 0; p->sched_remote_wakeup = Y; + * + * guarantees all stores of 'current' are visible before + * ->sched_remote_wakeup gets used, so it can be in this word. + */ + unsigned sched_remote_wakeup:1; + /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; -- cgit From 2279f540ea7d05f22d2f0c4224319330228586bc Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 17 Nov 2020 07:14:32 +0100 Subject: sched/deadline: Fix priority inheritance with multiple scheduling classes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Glenn reported that "an application [he developed produces] a BUG in deadline.c when a SCHED_DEADLINE task contends with CFS tasks on nested PTHREAD_PRIO_INHERIT mutexes. I believe the bug is triggered when a CFS task that was boosted by a SCHED_DEADLINE task boosts another CFS task (nested priority inheritance). ------------[ cut here ]------------ kernel BUG at kernel/sched/deadline.c:1462! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 12 PID: 19171 Comm: dl_boost_bug Tainted: ... Hardware name: ... RIP: 0010:enqueue_task_dl+0x335/0x910 Code: ... RSP: 0018:ffffc9000c2bbc68 EFLAGS: 00010002 RAX: 0000000000000009 RBX: ffff888c0af94c00 RCX: ffffffff81e12500 RDX: 000000000000002e RSI: ffff888c0af94c00 RDI: ffff888c10b22600 RBP: ffffc9000c2bbd08 R08: 0000000000000009 R09: 0000000000000078 R10: ffffffff81e12440 R11: ffffffff81e1236c R12: ffff888bc8932600 R13: ffff888c0af94eb8 R14: ffff888c10b22600 R15: ffff888bc8932600 FS: 00007fa58ac55700(0000) GS:ffff888c10b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa58b523230 CR3: 0000000bf44ab003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? intel_pstate_update_util_hwp+0x13/0x170 rt_mutex_setprio+0x1cc/0x4b0 task_blocks_on_rt_mutex+0x225/0x260 rt_spin_lock_slowlock_locked+0xab/0x2d0 rt_spin_lock_slowlock+0x50/0x80 hrtimer_grab_expiry_lock+0x20/0x30 hrtimer_cancel+0x13/0x30 do_nanosleep+0xa0/0x150 hrtimer_nanosleep+0xe1/0x230 ? __hrtimer_init_sleeper+0x60/0x60 __x64_sys_nanosleep+0x8d/0xa0 do_syscall_64+0x4a/0x100 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fa58b52330d ... 
---[ end trace 0000000000000002 ]--- He also provided a simple reproducer creating the situation below. So the execution order of the locking steps is the following (N1 and N2 are non-deadline tasks, D1 is a deadline task, and M1 and M2 are mutexes that are enabled with priority inheritance). Time moves forward as this timeline goes down:

    N1              N2              D1
    |               |               |
    |               |               |
    Lock(M1)        |               |
    |               |               |
    |               Lock(M2)        |
    |               |               |
    |               |               Lock(M2)
    |               |               |
    |               Lock(M1)        |
    |               (!!bug triggered!) |

Daniel reported a similar situation as well, by just letting ksoftirqd run with DEADLINE (and eventually block on a mutex). Problem is that boosted entities (Priority Inheritance) use static DEADLINE parameters of the top priority waiter. However, there might be cases where top waiter could be a non-DEADLINE entity that is currently boosted by a DEADLINE entity from a different lock chain (i.e., nested priority chains involving entities of non-DEADLINE classes). In this case, top waiter static DEADLINE parameters could be null (initialized to 0 at fork()) and replenish_dl_entity() would hit a BUG(). Fix this by keeping track of the original donor and using its parameters when a task is boosted. Reported-by: Glenn Elliott Reported-by: Daniel Bristot de Oliveira Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201117061432.517340-1-juri.lelli@redhat.com --- include/linux/sched.h | 10 ++++- kernel/sched/core.c | 11 +++--- kernel/sched/deadline.c | 97 +++++++++++++++++++++++++++---------------------- 3 files changed, 68 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e91b451d2a2..095fdec07b38 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -551,7 +551,6 @@ struct sched_dl_entity { * overruns. */ unsigned int dl_throttled : 1; - unsigned int dl_boosted : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; @@ -570,6 +569,15 @@ struct sched_dl_entity { * time. */ struct hrtimer inactive_timer; + +#ifdef CONFIG_RT_MUTEXES + /* + * Priority Inheritance. When a DEADLINE scheduling entity is boosted + * pi_se points to the donor, otherwise points to the dl_se it belongs + * to (the original one/itself). 
+ */ + struct sched_dl_entity *pi_se; +#endif }; #ifdef CONFIG_UCLAMP_TASK diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9f0ebfb0d17b..e7e453492cff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4912,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (!dl_prio(p->normal_prio) || (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.dl_boosted = 1; + p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; - } else - p->dl.dl_boosted = 0; + } else { + p->dl.pi_se = &p->dl; + } p->sched_class = &dl_sched_class; } else if (rt_prio(prio)) { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; p->sched_class = &fair_sched_class; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6d93f4518734..949bc5c083c1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return false; +} +#endif + #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { @@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(dl_se->dl_boosted); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setattr(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_se->dl_runtime <= 0); + BUG_ON(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. */ if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded && dl_se->runtime > 0) @@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * arbitrary large. 
*/ while (dl_se->runtime <= 0) { - dl_se->deadline += pi_se->dl_period; - dl_se->runtime += pi_se->dl_runtime; + dl_se->deadline += pi_of(dl_se)->dl_period; + dl_se->runtime += pi_of(dl_se)->dl_runtime; } /* @@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded) @@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * task with deadline equal to period this is the same of using * dl_period instead of dl_deadline in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { u64 left, right; @@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * - (pi_se->dl_runtime >> DL_SCALE); + (pi_of(dl_se)->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * Please refer to the comments update_dl_revised_wakeup() function to find * more about the Revised CBS rule. */ -static void update_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_entity_overflow(dl_se, rq_clock(rq))) { if (unlikely(!dl_is_implicit(dl_se) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ + !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); return; } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } } @@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ - if (dl_se->dl_boosted) + if (is_dl_boosted(dl_se)) goto unlock; /* @@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * but do not enqueue -- wait for our wakeup to do that. 
*/ if (!task_on_rq_queued(p)) { - replenish_dl_entity(dl_se, dl_se); + replenish_dl_entity(dl_se); goto unlock; } @@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1287,7 +1306,7 @@ throttle: dl_se->dl_overrun = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { BUG_ON(on_dl_rq(dl_se)); @@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, */ if (flags & ENQUEUE_WAKEUP) { task_contending(dl_se, flags); - update_dl_entity(dl_se, pi_se); + update_dl_entity(dl_se); } else if (flags & ENQUEUE_REPLENISH) { - replenish_dl_entity(dl_se, pi_se); + replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { @@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); - struct sched_dl_entity *pi_se = &p->dl; - - /* - * Use the scheduling parameters of the top pi-waiter task if: - * - we have a top pi-waiter which is a SCHED_DEADLINE task AND - * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is - * smaller than our deadline OR we are a !SCHED_DEADLINE task getting - * boosted due to a SCHED_DEADLINE pi-waiter). - * Otherwise we keep our runtime and deadline. - */ - if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { - pi_se = &pi_task->dl; + if (is_dl_boosted(&p->dl)) { /* * Because of delays in the detection of the overrun of a * thread's runtime, it might be the case that a thread @@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * the throttle. 
*/ p->dl.dl_throttled = 0; - BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH); return; } @@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - enqueue_dl_entity(&p->dl, pi_se, flags); + enqueue_dl_entity(&p->dl, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); @@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; - dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + +#ifdef CONFIG_RT_MUTEXES + dl_se->pi_se = dl_se; +#endif } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) -- cgit From 95bb7c42ac8a94ce3d0eb059ad64430390351ccb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 13 Nov 2020 00:01:21 +0200 Subject: mm: Add 'mprotect' hook to struct vm_operations_struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background ========== 1. SGX enclave pages are populated with data by copying from normal memory via ioctl() (SGX_IOC_ENCLAVE_ADD_PAGES), which will be added later in this series. 2. It is desirable to be able to restrict those normal memory data sources. For instance, to ensure that the source data is executable before copying data to an executable enclave page. 3. Enclave page permissions are dynamic (just like normal permissions) and can be adjusted at runtime with mprotect(). This creates a problem because the original data source may have long since vanished at the time when enclave page permissions are established (mmap() or mprotect()). The solution (elsewhere in this series) is to force enclave creators to declare their paging permission *intent* up front to the ioctl(). This intent can be immediately compared to the source data’s mapping and rejected if necessary. The “intent” is also stashed off for later comparison with enclave PTEs. This ensures that any future mmap()/mprotect() operations performed by the enclave creator or done on behalf of the enclave can be compared with the earlier declared permissions. Problem ======= There is an existing mmap() hook which allows SGX to perform this permission comparison at mmap() time. However, there is no corresponding ->mprotect() hook. Solution ======== Add a vm_ops->mprotect() hook so that mprotect() operations which are inconsistent with any page's stashed intent can be rejected by the driver. Signed-off-by: Sean Christopherson Co-developed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: Borislav Petkov Acked-by: Jethro Beekman Acked-by: Dave Hansen Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: linux-mm@kvack.org Link: https://lkml.kernel.org/r/20201112220135.165028-11-jarkko@kernel.org --- include/linux/mm.h | 7 +++++++ mm/mprotect.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..1813fa86b981 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -559,6 +559,13 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*split)(struct vm_area_struct * area, unsigned long addr); int (*mremap)(struct vm_area_struct * area); + /* + * Called by mprotect() to make driver-specific permission + * checks before mprotect() is finalised. The VMA must not + * be modified. Returns 0 if mprotect() can proceed. 
+ */ + int (*mprotect)(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); diff --git a/mm/mprotect.c b/mm/mprotect.c index 56c02beb6041..ab709023e9aa 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -616,9 +616,16 @@ static int do_mprotect_pkey(unsigned long start, size_t len, tmp = vma->vm_end; if (tmp > end) tmp = end; + + if (vma->vm_ops && vma->vm_ops->mprotect) + error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags); + if (error) + goto out; + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; + nstart = tmp; if (nstart < prev->vm_end) -- cgit From 5bdba520c1b318578caffd325515b35d187f8a0e Mon Sep 17 00:00:00 2001 From: Faiyaz Mohammed Date: Mon, 16 Nov 2020 17:05:37 +0530 Subject: mm: memblock: drop __init from memblock functions to make it inline __init is used together with inline, due to which the memblock wrapper functions are not getting inlined. For example: [ 0.000000] memblock_alloc_try_nid: 1490 bytes align=0x40 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 memblock_alloc+0x20/0x2c [ 0.000000] memblock_reserve: [0x000000023f09a3c0-0x000000023f09a991] memblock_alloc_range_nid+0xc0/0x188 Dropping __init from the memblock wrapper functions makes them inlinable and increases debuggability. After: [ 0.000000] memblock_alloc_try_nid: 1490 bytes align=0x40 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 start_kernel+0xa4/0x568 [ 0.000000] memblock_reserve: [0x000000023f09a3c0-0x000000023f09a991] memblock_alloc_range_nid+0xc0/0x188 Signed-off-by: Faiyaz Mohammed Signed-off-by: Mike Rapoport --- include/linux/memblock.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ef131255cedc..b93c44b9121e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -404,13 +404,13 @@ void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); -static inline void * __init memblock_alloc(phys_addr_t size, phys_addr_t align) +static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) { return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } -static inline void * __init memblock_alloc_raw(phys_addr_t size, +static inline void *memblock_alloc_raw(phys_addr_t size, phys_addr_t align) { return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT, @@ -418,7 +418,7 @@ static inline void * __init memblock_alloc_raw(phys_addr_t size, NUMA_NO_NODE); } -static inline void * __init memblock_alloc_from(phys_addr_t size, +static inline void *memblock_alloc_from(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) { @@ -426,33 +426,33 @@ static inline void * __init memblock_alloc_from(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } -static inline void * __init memblock_alloc_low(phys_addr_t size, +static inline void *memblock_alloc_low(phys_addr_t size, phys_addr_t align) { return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT, ARCH_LOW_ADDRESS_LIMIT, NUMA_NO_NODE); } -static inline void * __init memblock_alloc_node(phys_addr_t size, +static inline void *memblock_alloc_node(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ACCESSIBLE, nid); 
} -static inline void __init memblock_free_early(phys_addr_t base, +static inline void memblock_free_early(phys_addr_t base, phys_addr_t size) { memblock_free(base, size); } -static inline void __init memblock_free_early_nid(phys_addr_t base, +static inline void memblock_free_early_nid(phys_addr_t base, phys_addr_t size, int nid) { memblock_free(base, size); } -static inline void __init memblock_free_late(phys_addr_t base, phys_addr_t size) +static inline void memblock_free_late(phys_addr_t base, phys_addr_t size) { __memblock_free_late(base, size); } @@ -460,7 +460,7 @@ static inline void __init memblock_free_late(phys_addr_t base, phys_addr_t size) /* * Set the allocation direction to bottom-up or top-down. */ -static inline void __init memblock_set_bottom_up(bool enable) +static inline void memblock_set_bottom_up(bool enable) { memblock.bottom_up = enable; } -- cgit From 1f90f6a835514cb69bfede0b2752b0cb7a351bbd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Nov 2020 22:45:05 +0200 Subject: resource: Group resource_overlaps() with other inline helpers For better maintenance group resource_overlaps() with other inline helpers. While at it, drop extra parentheses. Signed-off-by: Andy Shevchenko Reviewed-by: Hanjun Guo Tested-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- include/linux/ioport.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5135d4b86cd6..df4581107536 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -229,6 +229,11 @@ static inline bool resource_contains(struct resource *r1, struct resource *r2) return r1->start <= r2->start && r1->end >= r2->end; } +/* True if any part of r1 overlaps r2 */ +static inline bool resource_overlaps(struct resource *r1, struct resource *r2) +{ + return r1->start <= r2->end && r1->end >= r2->start; +} /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) @@ -296,12 +301,6 @@ extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); -/* True if any part of r1 overlaps r2 */ -static inline bool resource_overlaps(struct resource *r1, struct resource *r2) -{ - return (r1->start <= r2->end && r1->end >= r2->start); -} - struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size); struct resource *request_free_mem_region(struct resource *base, -- cgit From 5562f35d7feabfd68cd58a1ee28b451f90e82417 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Nov 2020 22:45:06 +0200 Subject: resource: Introduce resource_union() for overlapping resources Some already present users may utilize resource_union() helper. Provide it for them and for wider use in the future. Signed-off-by: Andy Shevchenko Reviewed-by: Hanjun Guo Tested-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- include/linux/ioport.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index df4581107536..40320eb5bc0e 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -10,9 +10,10 @@ #define _LINUX_IOPORT_H #ifndef __ASSEMBLY__ +#include #include +#include #include -#include /* * Resources are tree-like, allowing * nesting etc.. 
@@ -235,6 +236,16 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) return r1->start <= r2->end && r1->end >= r2->start; } +static inline bool +resource_union(struct resource *r1, struct resource *r2, struct resource *r) +{ + if (!resource_overlaps(r1, r2)) + return false; + r->start = min(r1->start, r2->start); + r->end = max(r1->end, r2->end); + return true; +} + /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) #define request_muxed_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), IORESOURCE_MUXED) -- cgit From f65674df1b23cdcb6f656a14f659ffea83e7acaa Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Nov 2020 22:45:07 +0200 Subject: resource: Introduce resource_intersection() for overlapping resources There will be at least one user that can utilize new helper. Provide the helper for future user and for wider use. Signed-off-by: Andy Shevchenko Reviewed-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- include/linux/ioport.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 40320eb5bc0e..ece1a8db309c 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -236,6 +236,16 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) return r1->start <= r2->end && r1->end >= r2->start; } +static inline bool +resource_intersection(struct resource *r1, struct resource *r2, struct resource *r) +{ + if (!resource_overlaps(r1, r2)) + return false; + r->start = max(r1->start, r2->start); + r->end = min(r1->end, r2->end); + return true; +} + static inline bool resource_union(struct resource *r1, struct resource *r2, struct resource *r) { -- cgit From 97f53a08cba128a724ebbbf34778d3553d559816 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Nov 2020 13:21:08 -0800 Subject: net: linux/skbuff.h: combine SKB_EXTENSIONS + KCOV handling The previous Kconfig patch led to some other build errors as reported by the 0day bot and my own overnight build testing. These are all in when KCOV is enabled but SKB_EXTENSIONS is not enabled, so fix those by combining those conditions in the header file. 
Fixes: 6370cc3bbd8a ("net: add kcov handle to skb extensions") Fixes: 85ce50d337d1 ("net: kcov: don't select SKB_EXTENSIONS when there is no NET") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Aleksandr Nogikh Cc: Willem de Bruijn Acked-by: Florian Westphal Link: https://lore.kernel.org/r/20201116212108.32465-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2d01b2bbb746..0a1239819fd2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4608,7 +4608,7 @@ static inline void skb_reset_redirect(struct sk_buff *skb) #endif } -#ifdef CONFIG_KCOV +#if IS_ENABLED(CONFIG_KCOV) && IS_ENABLED(CONFIG_SKB_EXTENSIONS) static inline void skb_set_kcov_handle(struct sk_buff *skb, const u64 kcov_handle) { @@ -4636,7 +4636,7 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb) static inline void skb_set_kcov_handle(struct sk_buff *skb, const u64 kcov_handle) { } static inline u64 skb_get_kcov_handle(struct sk_buff *skb) { return 0; } -#endif /* CONFIG_KCOV */ +#endif /* CONFIG_KCOV && CONFIG_SKB_EXTENSIONS */ #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ -- cgit From 172292be01dbd6c26aba23f62e8ec090f31cdb71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Nov 2020 19:19:41 +0100 Subject: dma-mapping: remove dma_virt_ops Now that the RDMA core deals with devices that only do DMA mapping in lower layers properly, there is no user for dma_virt_ops and it can be removed. Link: https://lore.kernel.org/r/20201106181941.1878556-11-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- include/linux/dma-mapping.h | 2 -- kernel/dma/Kconfig | 5 ---- kernel/dma/Makefile | 1 - kernel/dma/virt.c | 61 --------------------------------------------- 4 files changed, 69 deletions(-) delete mode 100644 kernel/dma/virt.c (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 956151052d45..2aaed35b556d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -565,6 +565,4 @@ static inline int dma_mmap_wc(struct device *dev, int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, dma_addr_t dma_start, u64 size); -extern const struct dma_map_ops dma_virt_ops; - #endif /* _LINUX_DMA_MAPPING_H */ diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c99de4a21458..fd2db2665fc6 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -75,11 +75,6 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool -config DMA_VIRT_OPS - bool - depends on HAS_DMA - select DMA_OPS - config SWIOTLB bool select NEED_DMA_MAP_STATE diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index dc755ab68aab..cd1d86358a7a 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_DMA_OPS) += ops_helpers.o obj-$(CONFIG_DMA_OPS) += dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o -obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c deleted file mode 100644 index 59d32317dd57..000000000000 --- a/kernel/dma/virt.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DMA operations that map to virtual addresses without flushing memory. 
- */ -#include -#include -#include -#include - -static void *dma_virt_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, - unsigned long attrs) -{ - void *ret; - - ret = (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size)); - if (ret) - *dma_handle = (uintptr_t)ret; - return ret; -} - -static void dma_virt_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr, - unsigned long attrs) -{ - free_pages((unsigned long)cpu_addr, get_order(size)); -} - -static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - return (uintptr_t)(page_address(page) + offset); -} - -static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sgl, sg, nents, i) { - BUG_ON(!sg_page(sg)); - sg_dma_address(sg) = (uintptr_t)sg_virt(sg); - sg_dma_len(sg) = sg->length; - } - - return nents; -} - -const struct dma_map_ops dma_virt_ops = { - .alloc = dma_virt_alloc, - .free = dma_virt_free, - .map_page = dma_virt_map_page, - .map_sg = dma_virt_map_sg, - .alloc_pages = dma_common_alloc_pages, - .free_pages = dma_common_free_pages, -}; -EXPORT_SYMBOL(dma_virt_ops); -- cgit From f73659192b0bdf7bad826587b3530cef43cc048d Mon Sep 17 00:00:00 2001 From: Xie He Date: Sat, 14 Nov 2020 07:09:21 -0800 Subject: net: wan: Delete the DLCI / SDLA drivers The DLCI driver (dlci.c) implements the Frame Relay protocol. However, we already have another newer and better implementation of Frame Relay provided by the HDLC_FR driver (hdlc_fr.c). The DLCI driver's implementation of Frame Relay is used by only one hardware driver in the kernel - the SDLA driver (sdla.c). The SDLA driver provides Frame Relay support for the Sangoma S50x devices. However, the vendor provides their own driver (along with their own multi-WAN-protocol implementations including Frame Relay), called WANPIPE. I believe most users of the hardware would use the vendor-provided WANPIPE driver instead. (The WANPIPE driver was even once in the kernel, but was deleted in commit 8db60bcf3021 ("[WAN]: Remove broken and unmaintained Sangoma drivers.") because the vendor no longer updated the in-kernel WANPIPE driver.) 
Cc: Mike McLagan Signed-off-by: Xie He Link: https://lore.kernel.org/r/20201114150921.685594-1-xie.he.0141@gmail.com Signed-off-by: Jakub Kicinski --- CREDITS | 9 - Documentation/networking/framerelay.rst | 44 - MAINTAINERS | 6 - arch/arm/configs/ixp4xx_defconfig | 1 - arch/mips/configs/gpr_defconfig | 1 - arch/mips/configs/mtx1_defconfig | 1 - drivers/net/wan/Kconfig | 45 - drivers/net/wan/Makefile | 2 - drivers/net/wan/dlci.c | 541 ---------- drivers/net/wan/sdla.c | 1655 ------------------------------- include/linux/if_frad.h | 92 -- include/linux/sdla.h | 240 ----- include/uapi/linux/if_frad.h | 123 --- include/uapi/linux/sdla.h | 117 --- net/socket.c | 25 - 15 files changed, 2902 deletions(-) delete mode 100644 Documentation/networking/framerelay.rst delete mode 100644 drivers/net/wan/dlci.c delete mode 100644 drivers/net/wan/sdla.c delete mode 100644 include/linux/if_frad.h delete mode 100644 include/linux/sdla.h delete mode 100644 include/uapi/linux/if_frad.h delete mode 100644 include/uapi/linux/sdla.h (limited to 'include/linux') diff --git a/CREDITS b/CREDITS index 8592e45e3932..67421adb747c 100644 --- a/CREDITS +++ b/CREDITS @@ -2499,15 +2499,6 @@ W: http://www.rdrop.com/users/paulmck/ D: RCU and variants D: rcutorture module -N: Mike McLagan -E: mike.mclagan@linux.org -W: http://www.invlogic.com/~mmclagan -D: DLCI/FRAD drivers for Sangoma SDLAs -S: Innovative Logic Corp -S: Post Office Box 1068 -S: Laurel, Maryland 20732 -S: USA - N: Bradley McLean E: brad@bradpc.gaylord.com D: Device driver hacker diff --git a/Documentation/networking/framerelay.rst b/Documentation/networking/framerelay.rst deleted file mode 100644 index 6d904399ec6d..000000000000 --- a/Documentation/networking/framerelay.rst +++ /dev/null @@ -1,44 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -================ -Frame Relay (FR) -================ - -Frame Relay (FR) support for linux is built into a two tiered system of device -drivers. The upper layer implements RFC1490 FR specification, and uses the -Data Link Connection Identifier (DLCI) as its hardware address. Usually these -are assigned by your network supplier, they give you the number/numbers of -the Virtual Connections (VC) assigned to you. - -Each DLCI is a point-to-point link between your machine and a remote one. -As such, a separate device is needed to accommodate the routing. Within the -net-tools archives is 'dlcicfg'. This program will communicate with the -base "DLCI" device, and create new net devices named 'dlci00', 'dlci01'... -The configuration script will ask you how many DLCIs you need, as well as -how many DLCIs you want to assign to each Frame Relay Access Device (FRAD). - -The DLCI uses a number of function calls to communicate with the FRAD, all -of which are stored in the FRAD's private data area. assoc/deassoc, -activate/deactivate and dlci_config. The DLCI supplies a receive function -to the FRAD to accept incoming packets. - -With this initial offering, only 1 FRAD driver is available. With many thanks -to Sangoma Technologies, David Mandelstam & Gene Kozin, the S502A, S502E & -S508 are supported. This driver is currently set up for only FR, but as -Sangoma makes more firmware modules available, it can be updated to provide -them as well. - -Configuration of the FRAD makes use of another net-tools program, 'fradcfg'. -This program makes use of a configuration file (which dlcicfg can also read) -to specify the types of boards to be configured as FRADs, as well as perform -any board specific configuration. 
The Sangoma module of fradcfg loads the -FR firmware into the card, sets the irq/port/memory information, and provides -an initial configuration. - -Additional FRAD device drivers can be added as hardware is available. - -At this time, the dlcicfg and fradcfg programs have not been incorporated into -the net-tools distribution. They can be found at ftp.invlogic.com, in -/pub/linux. Note that with OS/2 FTPD, you end up in /pub by default, so just -use 'cd linux'. v0.10 is for use on pre-2.0.3 and earlier, v0.15 is for -pre-2.0.4 and later. diff --git a/MAINTAINERS b/MAINTAINERS index af9f6a3ab100..3341959af0c7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6905,12 +6905,6 @@ S: Maintained W: http://floatingpoint.sourceforge.net/emulator/index.html F: arch/x86/math-emu/ -FRAME RELAY DLCI/FRAD (Sangoma drivers too) -L: netdev@vger.kernel.org -S: Orphan -F: drivers/net/wan/dlci.c -F: drivers/net/wan/sdla.c - FRAMEBUFFER LAYER M: Bartlomiej Zolnierkiewicz L: dri-devel@lists.freedesktop.org diff --git a/arch/arm/configs/ixp4xx_defconfig b/arch/arm/configs/ixp4xx_defconfig index 27e7c0714b96..0d6edeb27659 100644 --- a/arch/arm/configs/ixp4xx_defconfig +++ b/arch/arm/configs/ixp4xx_defconfig @@ -141,7 +141,6 @@ CONFIG_HDLC_CISCO=m CONFIG_HDLC_FR=m CONFIG_HDLC_PPP=m CONFIG_HDLC_X25=m -CONFIG_DLCI=m CONFIG_WAN_ROUTER_DRIVERS=m CONFIG_ATM_TCP=m # CONFIG_INPUT_KEYBOARD is not set diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig index 599d5604aabe..8a921c8ac233 100644 --- a/arch/mips/configs/gpr_defconfig +++ b/arch/mips/configs/gpr_defconfig @@ -228,7 +228,6 @@ CONFIG_FARSYNC=m CONFIG_DSCC4=m CONFIG_DSCC4_PCISYNC=y CONFIG_DSCC4_PCI_RST=y -CONFIG_DLCI=m CONFIG_LAPBETHER=m # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index dc69b054181c..30dacce94198 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -378,7 +378,6 @@ CONFIG_FARSYNC=m CONFIG_DSCC4=m CONFIG_DSCC4_PCISYNC=y CONFIG_DSCC4_PCI_RST=y -CONFIG_DLCI=m CONFIG_LAPBETHER=m # CONFIG_KEYBOARD_ATKBD is not set CONFIG_KEYBOARD_GPIO=y diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig index 2cf98a732a26..4029fde71a9e 100644 --- a/drivers/net/wan/Kconfig +++ b/drivers/net/wan/Kconfig @@ -321,51 +321,6 @@ config IXP4XX_HSS Say Y here if you want to use built-in HSS ports on IXP4xx processor. -config DLCI - tristate "Frame Relay DLCI support" - help - Support for the Frame Relay protocol. - - Frame Relay is a fast low-cost way to connect to a remote Internet - access provider or to form a private wide area network. The one - physical line from your box to the local "switch" (i.e. the entry - point to the Frame Relay network, usually at the phone company) can - carry several logical point-to-point connections to other computers - connected to the Frame Relay network. For a general explanation of - the protocol, check out . - - To use frame relay, you need supporting hardware (called FRAD) and - certain programs from the net-tools package as explained in - . - - To compile this driver as a module, choose M here: the - module will be called dlci. - -config DLCI_MAX - int "Max DLCI per device" - depends on DLCI - default "8" - help - How many logical point-to-point frame relay connections (the - identifiers of which are called DCLIs) should be handled by each - of your hardware frame relay access devices. - - Go with the default. 
- -config SDLA - tristate "SDLA (Sangoma S502/S508) support" - depends on DLCI && ISA - help - Driver for the Sangoma S502A, S502E, and S508 Frame Relay Access - Devices. - - These are multi-protocol cards, but only Frame Relay is supported - by the driver at this time. Please read - . - - To compile this driver as a module, choose M here: the - module will be called sdla. - # X.25 network drivers config LAPBETHER tristate "LAPB over Ethernet driver" diff --git a/drivers/net/wan/Makefile b/drivers/net/wan/Makefile index 5b9dc85eae34..081666c36ca2 100644 --- a/drivers/net/wan/Makefile +++ b/drivers/net/wan/Makefile @@ -21,8 +21,6 @@ obj-$(CONFIG_FARSYNC) += farsync.o obj-$(CONFIG_LANMEDIA) += lmc/ -obj-$(CONFIG_DLCI) += dlci.o -obj-$(CONFIG_SDLA) += sdla.o obj-$(CONFIG_LAPBETHER) += lapbether.o obj-$(CONFIG_SBNI) += sbni.o obj-$(CONFIG_N2) += n2.o diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c deleted file mode 100644 index 3ca4daf63389..000000000000 --- a/drivers/net/wan/dlci.c +++ /dev/null @@ -1,541 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DLCI Implementation of Frame Relay protocol for Linux, according to - * RFC 1490. This generic device provides en/decapsulation for an - * underlying hardware driver. Routes & IPs are assigned to these - * interfaces. Requires 'dlcicfg' program to create usable - * interfaces, the initial one, 'dlci' is for IOCTL use only. - * - * Version: @(#)dlci.c 0.35 4 Jan 1997 - * - * Author: Mike McLagan - * - * Changes: - * - * 0.15 Mike Mclagan Packet freeing, bug in kmalloc call - * DLCI_RET handling - * 0.20 Mike McLagan More conservative on which packets - * are returned for retry and which are - * are dropped. If DLCI_RET_DROP is - * returned from the FRAD, the packet is - * sent back to Linux for re-transmission - * 0.25 Mike McLagan Converted to use SIOC IOCTL calls - * 0.30 Jim Freeman Fixed to allow IPX traffic - * 0.35 Michael Elizabeth Fixed incorrect memcpy_fromfs - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -static const char version[] = "DLCI driver v0.35, 4 Jan 1997, mike.mclagan@linux.org"; - -static LIST_HEAD(dlci_devs); - -static void dlci_setup(struct net_device *); - -/* - * these encapsulate the RFC 1490 requirements as well as - * deal with packet transmission and reception, working with - * the upper network layers - */ - -static int dlci_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *daddr, - const void *saddr, unsigned len) -{ - struct frhdr hdr; - unsigned int hlen; - char *dest; - - hdr.control = FRAD_I_UI; - switch (type) - { - case ETH_P_IP: - hdr.IP_NLPID = FRAD_P_IP; - hlen = sizeof(hdr.control) + sizeof(hdr.IP_NLPID); - break; - - /* feel free to add other types, if necessary */ - - default: - hdr.pad = FRAD_P_PADDING; - hdr.NLPID = FRAD_P_SNAP; - memset(hdr.OUI, 0, sizeof(hdr.OUI)); - hdr.PID = htons(type); - hlen = sizeof(hdr); - break; - } - - dest = skb_push(skb, hlen); - if (!dest) - return 0; - - memcpy(dest, &hdr, hlen); - - return hlen; -} - -static void dlci_receive(struct sk_buff *skb, struct net_device *dev) -{ - struct frhdr *hdr; - int process, header; - - if (!pskb_may_pull(skb, sizeof(*hdr))) { - netdev_notice(dev, "invalid data no header\n"); - dev->stats.rx_errors++; - kfree_skb(skb); - return; - } - - hdr = (struct 
frhdr *) skb->data; - process = 0; - header = 0; - skb->dev = dev; - - if (hdr->control != FRAD_I_UI) - { - netdev_notice(dev, "Invalid header flag 0x%02X\n", - hdr->control); - dev->stats.rx_errors++; - } - else - switch (hdr->IP_NLPID) - { - case FRAD_P_PADDING: - if (hdr->NLPID != FRAD_P_SNAP) - { - netdev_notice(dev, "Unsupported NLPID 0x%02X\n", - hdr->NLPID); - dev->stats.rx_errors++; - break; - } - - if (hdr->OUI[0] + hdr->OUI[1] + hdr->OUI[2] != 0) - { - netdev_notice(dev, "Unsupported organizationally unique identifier 0x%02X-%02X-%02X\n", - hdr->OUI[0], - hdr->OUI[1], - hdr->OUI[2]); - dev->stats.rx_errors++; - break; - } - - /* at this point, it's an EtherType frame */ - header = sizeof(struct frhdr); - /* Already in network order ! */ - skb->protocol = hdr->PID; - process = 1; - break; - - case FRAD_P_IP: - header = sizeof(hdr->control) + sizeof(hdr->IP_NLPID); - skb->protocol = htons(ETH_P_IP); - process = 1; - break; - - case FRAD_P_SNAP: - case FRAD_P_Q933: - case FRAD_P_CLNP: - netdev_notice(dev, "Unsupported NLPID 0x%02X\n", - hdr->pad); - dev->stats.rx_errors++; - break; - - default: - netdev_notice(dev, "Invalid pad byte 0x%02X\n", - hdr->pad); - dev->stats.rx_errors++; - break; - } - - if (process) - { - /* we've set up the protocol, so discard the header */ - skb_reset_mac_header(skb); - skb_pull(skb, header); - dev->stats.rx_bytes += skb->len; - netif_rx(skb); - dev->stats.rx_packets++; - } - else - dev_kfree_skb(skb); -} - -static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) -{ - struct dlci_local *dlp = netdev_priv(dev); - - if (skb) { - struct netdev_queue *txq = skb_get_tx_queue(dev, skb); - netdev_start_xmit(skb, dlp->slave, txq, false); - } - return NETDEV_TX_OK; -} - -static int dlci_config(struct net_device *dev, struct dlci_conf __user *conf, int get) -{ - struct dlci_conf config; - struct dlci_local *dlp; - struct frad_local *flp; - int err; - - dlp = netdev_priv(dev); - - flp = netdev_priv(dlp->slave); - - if (!get) - { - if (copy_from_user(&config, conf, sizeof(struct dlci_conf))) - return -EFAULT; - if (config.flags & ~DLCI_VALID_FLAGS) - return -EINVAL; - memcpy(&dlp->config, &config, sizeof(struct dlci_conf)); - dlp->configured = 1; - } - - err = (*flp->dlci_conf)(dlp->slave, dev, get); - if (err) - return err; - - if (get) - { - if (copy_to_user(conf, &dlp->config, sizeof(struct dlci_conf))) - return -EFAULT; - } - - return 0; -} - -static int dlci_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - struct dlci_local *dlp; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - dlp = netdev_priv(dev); - - switch (cmd) - { - case DLCI_GET_SLAVE: - if (!*(short *)(dev->dev_addr)) - return -EINVAL; - - strncpy(ifr->ifr_slave, dlp->slave->name, sizeof(ifr->ifr_slave)); - break; - - case DLCI_GET_CONF: - case DLCI_SET_CONF: - if (!*(short *)(dev->dev_addr)) - return -EINVAL; - - return dlci_config(dev, ifr->ifr_data, cmd == DLCI_GET_CONF); - - default: - return -EOPNOTSUPP; - } - return 0; -} - -static int dlci_change_mtu(struct net_device *dev, int new_mtu) -{ - struct dlci_local *dlp = netdev_priv(dev); - - return dev_set_mtu(dlp->slave, new_mtu); -} - -static int dlci_open(struct net_device *dev) -{ - struct dlci_local *dlp; - struct frad_local *flp; - int err; - - dlp = netdev_priv(dev); - - if (!*(short *)(dev->dev_addr)) - return -EINVAL; - - if (!netif_running(dlp->slave)) - return -ENOTCONN; - - flp = netdev_priv(dlp->slave); - err = (*flp->activate)(dlp->slave, dev); - if (err) - return err; - - 
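/* Activation accepted: the FRAD will now feed frames for this DLCI to the
 * receive hook installed in dlci_setup() (dlci_receive), and outgoing skbs
 * are handed to the slave device by dlci_transmit(). */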
netif_start_queue(dev); - - return 0; -} - -static int dlci_close(struct net_device *dev) -{ - struct dlci_local *dlp; - struct frad_local *flp; - - netif_stop_queue(dev); - - dlp = netdev_priv(dev); - - flp = netdev_priv(dlp->slave); - (*flp->deactivate)(dlp->slave, dev); - - return 0; -} - -static int dlci_add(struct dlci_add *dlci) -{ - struct net_device *master, *slave; - struct dlci_local *dlp; - struct frad_local *flp; - int err = -EINVAL; - - - /* validate slave device */ - slave = dev_get_by_name(&init_net, dlci->devname); - if (!slave) - return -ENODEV; - - if (slave->type != ARPHRD_FRAD || netdev_priv(slave) == NULL) - goto err1; - - /* create device name */ - master = alloc_netdev(sizeof(struct dlci_local), "dlci%d", - NET_NAME_UNKNOWN, dlci_setup); - if (!master) { - err = -ENOMEM; - goto err1; - } - - /* make sure same slave not already registered */ - rtnl_lock(); - list_for_each_entry(dlp, &dlci_devs, list) { - if (dlp->slave == slave) { - err = -EBUSY; - goto err2; - } - } - - *(short *)(master->dev_addr) = dlci->dlci; - - dlp = netdev_priv(master); - dlp->slave = slave; - dlp->master = master; - - flp = netdev_priv(slave); - err = (*flp->assoc)(slave, master); - if (err < 0) - goto err2; - - err = register_netdevice(master); - if (err < 0) - goto err2; - - strcpy(dlci->devname, master->name); - - list_add(&dlp->list, &dlci_devs); - rtnl_unlock(); - - return 0; - - err2: - rtnl_unlock(); - free_netdev(master); - err1: - dev_put(slave); - return err; -} - -static int dlci_del(struct dlci_add *dlci) -{ - struct dlci_local *dlp; - struct frad_local *flp; - struct net_device *master, *slave; - int err; - bool found = false; - - rtnl_lock(); - - /* validate slave device */ - master = __dev_get_by_name(&init_net, dlci->devname); - if (!master) { - err = -ENODEV; - goto out; - } - - list_for_each_entry(dlp, &dlci_devs, list) { - if (dlp->master == master) { - found = true; - break; - } - } - if (!found) { - err = -ENODEV; - goto out; - } - - if (netif_running(master)) { - err = -EBUSY; - goto out; - } - - dlp = netdev_priv(master); - slave = dlp->slave; - flp = netdev_priv(slave); - - err = (*flp->deassoc)(slave, master); - if (!err) { - list_del(&dlp->list); - - unregister_netdevice(master); - - dev_put(slave); - } -out: - rtnl_unlock(); - return err; -} - -static int dlci_ioctl(unsigned int cmd, void __user *arg) -{ - struct dlci_add add; - int err; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (copy_from_user(&add, arg, sizeof(struct dlci_add))) - return -EFAULT; - - switch (cmd) - { - case SIOCADDDLCI: - err = dlci_add(&add); - - if (!err) - if (copy_to_user(arg, &add, sizeof(struct dlci_add))) - return -EFAULT; - break; - - case SIOCDELDLCI: - err = dlci_del(&add); - break; - - default: - err = -EINVAL; - } - - return err; -} - -static const struct header_ops dlci_header_ops = { - .create = dlci_header, -}; - -static const struct net_device_ops dlci_netdev_ops = { - .ndo_open = dlci_open, - .ndo_stop = dlci_close, - .ndo_do_ioctl = dlci_dev_ioctl, - .ndo_start_xmit = dlci_transmit, - .ndo_change_mtu = dlci_change_mtu, -}; - -static void dlci_setup(struct net_device *dev) -{ - struct dlci_local *dlp = netdev_priv(dev); - - dev->flags = 0; - dev->header_ops = &dlci_header_ops; - dev->netdev_ops = &dlci_netdev_ops; - dev->needs_free_netdev = true; - - dlp->receive = dlci_receive; - - dev->type = ARPHRD_DLCI; - dev->hard_header_len = sizeof(struct frhdr); - dev->addr_len = sizeof(short); - -} - -/* if slave is unregistering, then cleanup master */ -static int 
dlci_dev_event(struct notifier_block *unused, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - if (dev_net(dev) != &init_net) - return NOTIFY_DONE; - - if (event == NETDEV_UNREGISTER) { - struct dlci_local *dlp; - - list_for_each_entry(dlp, &dlci_devs, list) { - if (dlp->slave == dev) { - list_del(&dlp->list); - unregister_netdevice(dlp->master); - dev_put(dlp->slave); - break; - } - } - } - return NOTIFY_DONE; -} - -static struct notifier_block dlci_notifier = { - .notifier_call = dlci_dev_event, -}; - -static int __init init_dlci(void) -{ - dlci_ioctl_set(dlci_ioctl); - register_netdevice_notifier(&dlci_notifier); - - printk("%s.\n", version); - - return 0; -} - -static void __exit dlci_exit(void) -{ - struct dlci_local *dlp, *nxt; - - dlci_ioctl_set(NULL); - unregister_netdevice_notifier(&dlci_notifier); - - rtnl_lock(); - list_for_each_entry_safe(dlp, nxt, &dlci_devs, list) { - unregister_netdevice(dlp->master); - dev_put(dlp->slave); - } - rtnl_unlock(); -} - -module_init(init_dlci); -module_exit(dlci_exit); - -MODULE_AUTHOR("Mike McLagan"); -MODULE_DESCRIPTION("Frame Relay DLCI layer"); -MODULE_LICENSE("GPL"); diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c deleted file mode 100644 index bc2c1c7fb1a4..000000000000 --- a/drivers/net/wan/sdla.c +++ /dev/null @@ -1,1655 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SDLA An implementation of a driver for the Sangoma S502/S508 series - * multi-protocol PC interface card. Initial offering is with - * the DLCI driver, providing Frame Relay support for linux. - * - * Global definitions for the Frame relay interface. - * - * Version: @(#)sdla.c 0.30 12 Sep 1996 - * - * Credits: Sangoma Technologies, for the use of 2 cards for an extended - * period of time. - * David Mandelstam for getting me started on - * this project, and incentive to complete it. - * Gene Kozen <74604.152@compuserve.com> for providing me with - * important information about the cards. - * - * Author: Mike McLagan - * - * Changes: - * 0.15 Mike McLagan Improved error handling, packet dropping - * 0.20 Mike McLagan New transmit/receive flags for config - * If in FR mode, don't accept packets from - * non DLCI devices. - * 0.25 Mike McLagan Fixed problem with rejecting packets - * from non DLCI devices. 
- * 0.30 Mike McLagan Fixed kernel panic when used with modified - * ifconfig - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -static const char* version = "SDLA driver v0.30, 12 Sep 1996, mike.mclagan@linux.org"; - -static unsigned int valid_port[] = { 0x250, 0x270, 0x280, 0x300, 0x350, 0x360, 0x380, 0x390}; - -static unsigned int valid_mem[] = { - 0xA0000, 0xA2000, 0xA4000, 0xA6000, 0xA8000, 0xAA000, 0xAC000, 0xAE000, - 0xB0000, 0xB2000, 0xB4000, 0xB6000, 0xB8000, 0xBA000, 0xBC000, 0xBE000, - 0xC0000, 0xC2000, 0xC4000, 0xC6000, 0xC8000, 0xCA000, 0xCC000, 0xCE000, - 0xD0000, 0xD2000, 0xD4000, 0xD6000, 0xD8000, 0xDA000, 0xDC000, 0xDE000, - 0xE0000, 0xE2000, 0xE4000, 0xE6000, 0xE8000, 0xEA000, 0xEC000, 0xEE000}; - -static DEFINE_SPINLOCK(sdla_lock); - -/********************************************************* - * - * these are the core routines that access the card itself - * - *********************************************************/ - -#define SDLA_WINDOW(dev,addr) outb((((addr) >> 13) & 0x1F), (dev)->base_addr + SDLA_REG_Z80_WINDOW) - -static void __sdla_read(struct net_device *dev, int addr, void *buf, short len) -{ - char *temp; - const void *base; - int offset, bytes; - - temp = buf; - while(len) - { - offset = addr & SDLA_ADDR_MASK; - bytes = offset + len > SDLA_WINDOW_SIZE ? SDLA_WINDOW_SIZE - offset : len; - base = (const void *) (dev->mem_start + offset); - - SDLA_WINDOW(dev, addr); - memcpy(temp, base, bytes); - - addr += bytes; - temp += bytes; - len -= bytes; - } -} - -static void sdla_read(struct net_device *dev, int addr, void *buf, short len) -{ - unsigned long flags; - spin_lock_irqsave(&sdla_lock, flags); - __sdla_read(dev, addr, buf, len); - spin_unlock_irqrestore(&sdla_lock, flags); -} - -static void __sdla_write(struct net_device *dev, int addr, - const void *buf, short len) -{ - const char *temp; - void *base; - int offset, bytes; - - temp = buf; - while(len) - { - offset = addr & SDLA_ADDR_MASK; - bytes = offset + len > SDLA_WINDOW_SIZE ? 
SDLA_WINDOW_SIZE - offset : len; - base = (void *) (dev->mem_start + offset); - - SDLA_WINDOW(dev, addr); - memcpy(base, temp, bytes); - - addr += bytes; - temp += bytes; - len -= bytes; - } -} - -static void sdla_write(struct net_device *dev, int addr, - const void *buf, short len) -{ - unsigned long flags; - - spin_lock_irqsave(&sdla_lock, flags); - __sdla_write(dev, addr, buf, len); - spin_unlock_irqrestore(&sdla_lock, flags); -} - - -static void sdla_clear(struct net_device *dev) -{ - unsigned long flags; - char *base; - int len, addr, bytes; - - len = 65536; - addr = 0; - bytes = SDLA_WINDOW_SIZE; - base = (void *) dev->mem_start; - - spin_lock_irqsave(&sdla_lock, flags); - while(len) - { - SDLA_WINDOW(dev, addr); - memset(base, 0, bytes); - - addr += bytes; - len -= bytes; - } - spin_unlock_irqrestore(&sdla_lock, flags); - -} - -static char sdla_byte(struct net_device *dev, int addr) -{ - unsigned long flags; - char byte, *temp; - - temp = (void *) (dev->mem_start + (addr & SDLA_ADDR_MASK)); - - spin_lock_irqsave(&sdla_lock, flags); - SDLA_WINDOW(dev, addr); - byte = *temp; - spin_unlock_irqrestore(&sdla_lock, flags); - - return byte; -} - -static void sdla_stop(struct net_device *dev) -{ - struct frad_local *flp; - - flp = netdev_priv(dev); - switch(flp->type) - { - case SDLA_S502A: - outb(SDLA_S502A_HALT, dev->base_addr + SDLA_REG_CONTROL); - flp->state = SDLA_HALT; - break; - case SDLA_S502E: - outb(SDLA_HALT, dev->base_addr + SDLA_REG_Z80_CONTROL); - outb(SDLA_S502E_ENABLE, dev->base_addr + SDLA_REG_CONTROL); - flp->state = SDLA_S502E_ENABLE; - break; - case SDLA_S507: - flp->state &= ~SDLA_CPUEN; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - break; - case SDLA_S508: - flp->state &= ~SDLA_CPUEN; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - break; - } -} - -static void sdla_start(struct net_device *dev) -{ - struct frad_local *flp; - - flp = netdev_priv(dev); - switch(flp->type) - { - case SDLA_S502A: - outb(SDLA_S502A_NMI, dev->base_addr + SDLA_REG_CONTROL); - outb(SDLA_S502A_START, dev->base_addr + SDLA_REG_CONTROL); - flp->state = SDLA_S502A_START; - break; - case SDLA_S502E: - outb(SDLA_S502E_CPUEN, dev->base_addr + SDLA_REG_Z80_CONTROL); - outb(0x00, dev->base_addr + SDLA_REG_CONTROL); - flp->state = 0; - break; - case SDLA_S507: - flp->state |= SDLA_CPUEN; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - break; - case SDLA_S508: - flp->state |= SDLA_CPUEN; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - break; - } -} - -/**************************************************** - * - * this is used for the S502A/E cards to determine - * the speed of the onboard CPU. Calibration is - * necessary for the Frame Relay code uploaded - * later. Incorrect results cause timing problems - * with link checks & status messages - * - ***************************************************/ - -static int sdla_z80_poll(struct net_device *dev, int z80_addr, int jiffs, char resp1, char resp2) -{ - unsigned long start, done, now; - char resp, *temp; - - start = now = jiffies; - done = jiffies + jiffs; - - temp = (void *)dev->mem_start; - temp += z80_addr & SDLA_ADDR_MASK; - - resp = ~resp1; - while (time_before(jiffies, done) && (resp != resp1) && (!resp2 || (resp != resp2))) - { - if (jiffies != now) - { - SDLA_WINDOW(dev, z80_addr); - now = jiffies; - resp = *temp; - } - } - return time_before(jiffies, done) ? 
jiffies - start : -1; -} - -/* constants for Z80 CPU speed */ -#define Z80_READY '1' /* Z80 is ready to begin */ -#define LOADER_READY '2' /* driver is ready to begin */ -#define Z80_SCC_OK '3' /* SCC is on board */ -#define Z80_SCC_BAD '4' /* SCC was not found */ - -static int sdla_cpuspeed(struct net_device *dev, struct ifreq *ifr) -{ - int jiffs; - char data; - - sdla_start(dev); - if (sdla_z80_poll(dev, 0, 3*HZ, Z80_READY, 0) < 0) - return -EIO; - - data = LOADER_READY; - sdla_write(dev, 0, &data, 1); - - if ((jiffs = sdla_z80_poll(dev, 0, 8*HZ, Z80_SCC_OK, Z80_SCC_BAD)) < 0) - return -EIO; - - sdla_stop(dev); - sdla_read(dev, 0, &data, 1); - - if (data == Z80_SCC_BAD) - { - printk("%s: SCC bad\n", dev->name); - return -EIO; - } - - if (data != Z80_SCC_OK) - return -EINVAL; - - if (jiffs < 165) - ifr->ifr_mtu = SDLA_CPU_16M; - else if (jiffs < 220) - ifr->ifr_mtu = SDLA_CPU_10M; - else if (jiffs < 258) - ifr->ifr_mtu = SDLA_CPU_8M; - else if (jiffs < 357) - ifr->ifr_mtu = SDLA_CPU_7M; - else if (jiffs < 467) - ifr->ifr_mtu = SDLA_CPU_5M; - else - ifr->ifr_mtu = SDLA_CPU_3M; - - return 0; -} - -/************************************************ - * - * Direct interaction with the Frame Relay code - * starts here. - * - ************************************************/ - -struct _dlci_stat -{ - short dlci; - char flags; -} __packed; - -struct _frad_stat -{ - char flags; - struct _dlci_stat dlcis[SDLA_MAX_DLCI]; -}; - -static void sdla_errors(struct net_device *dev, int cmd, int dlci, int ret, int len, void *data) -{ - struct _dlci_stat *pstatus; - short *pdlci; - int i; - char *state, line[30]; - - switch (ret) - { - case SDLA_RET_MODEM: - state = data; - if (*state & SDLA_MODEM_DCD_LOW) - netdev_info(dev, "Modem DCD unexpectedly low!\n"); - if (*state & SDLA_MODEM_CTS_LOW) - netdev_info(dev, "Modem CTS unexpectedly low!\n"); - /* I should probably do something about this! 
*/ - break; - - case SDLA_RET_CHANNEL_OFF: - netdev_info(dev, "Channel became inoperative!\n"); - /* same here */ - break; - - case SDLA_RET_CHANNEL_ON: - netdev_info(dev, "Channel became operative!\n"); - /* same here */ - break; - - case SDLA_RET_DLCI_STATUS: - netdev_info(dev, "Status change reported by Access Node\n"); - len /= sizeof(struct _dlci_stat); - for(pstatus = data, i=0;i < len;i++,pstatus++) - { - if (pstatus->flags & SDLA_DLCI_NEW) - state = "new"; - else if (pstatus->flags & SDLA_DLCI_DELETED) - state = "deleted"; - else if (pstatus->flags & SDLA_DLCI_ACTIVE) - state = "active"; - else - { - sprintf(line, "unknown status: %02X", pstatus->flags); - state = line; - } - netdev_info(dev, "DLCI %i: %s\n", - pstatus->dlci, state); - /* same here */ - } - break; - - case SDLA_RET_DLCI_UNKNOWN: - netdev_info(dev, "Received unknown DLCIs:"); - len /= sizeof(short); - for(pdlci = data,i=0;i < len;i++,pdlci++) - pr_cont(" %i", *pdlci); - pr_cont("\n"); - break; - - case SDLA_RET_TIMEOUT: - netdev_err(dev, "Command timed out!\n"); - break; - - case SDLA_RET_BUF_OVERSIZE: - netdev_info(dev, "Bc/CIR overflow, acceptable size is %i\n", - len); - break; - - case SDLA_RET_BUF_TOO_BIG: - netdev_info(dev, "Buffer size over specified max of %i\n", - len); - break; - - case SDLA_RET_CHANNEL_INACTIVE: - case SDLA_RET_DLCI_INACTIVE: - case SDLA_RET_CIR_OVERFLOW: - case SDLA_RET_NO_BUFS: - if (cmd == SDLA_INFORMATION_WRITE) - break; - fallthrough; - - default: - netdev_dbg(dev, "Cmd 0x%02X generated return code 0x%02X\n", - cmd, ret); - /* Further processing could be done here */ - break; - } -} - -static int sdla_cmd(struct net_device *dev, int cmd, short dlci, short flags, - void *inbuf, short inlen, void *outbuf, short *outlen) -{ - static struct _frad_stat status; - struct frad_local *flp; - struct sdla_cmd *cmd_buf; - unsigned long pflags; - unsigned long jiffs; - int ret, waiting, len; - long window; - - flp = netdev_priv(dev); - window = flp->type == SDLA_S508 ? SDLA_508_CMD_BUF : SDLA_502_CMD_BUF; - cmd_buf = (struct sdla_cmd *)(dev->mem_start + (window & SDLA_ADDR_MASK)); - ret = 0; - len = 0; - jiffs = jiffies + HZ; /* 1 second is plenty */ - - spin_lock_irqsave(&sdla_lock, pflags); - SDLA_WINDOW(dev, window); - cmd_buf->cmd = cmd; - cmd_buf->dlci = dlci; - cmd_buf->flags = flags; - - if (inbuf) - memcpy(cmd_buf->data, inbuf, inlen); - - cmd_buf->length = inlen; - - cmd_buf->opp_flag = 1; - spin_unlock_irqrestore(&sdla_lock, pflags); - - waiting = 1; - len = 0; - while (waiting && time_before_eq(jiffies, jiffs)) - { - if (waiting++ % 3) - { - spin_lock_irqsave(&sdla_lock, pflags); - SDLA_WINDOW(dev, window); - waiting = ((volatile int)(cmd_buf->opp_flag)); - spin_unlock_irqrestore(&sdla_lock, pflags); - } - } - - if (!waiting) - { - - spin_lock_irqsave(&sdla_lock, pflags); - SDLA_WINDOW(dev, window); - ret = cmd_buf->retval; - len = cmd_buf->length; - if (outbuf && outlen) - { - *outlen = *outlen >= len ? len : *outlen; - - if (*outlen) - memcpy(outbuf, cmd_buf->data, *outlen); - } - - /* This is a local copy that's used for error handling */ - if (ret) - memcpy(&status, cmd_buf->data, len > sizeof(status) ? 
sizeof(status) : len); - - spin_unlock_irqrestore(&sdla_lock, pflags); - } - else - ret = SDLA_RET_TIMEOUT; - - if (ret != SDLA_RET_OK) - sdla_errors(dev, cmd, dlci, ret, len, &status); - - return ret; -} - -/*********************************************** - * - * these functions are called by the DLCI driver - * - ***********************************************/ - -static int sdla_reconfig(struct net_device *dev); - -static int sdla_activate(struct net_device *slave, struct net_device *master) -{ - struct frad_local *flp; - int i; - - flp = netdev_priv(slave); - - for(i=0;imaster[i] == master) - break; - - if (i == CONFIG_DLCI_MAX) - return -ENODEV; - - flp->dlci[i] = abs(flp->dlci[i]); - - if (netif_running(slave) && (flp->config.station == FRAD_STATION_NODE)) - sdla_cmd(slave, SDLA_ACTIVATE_DLCI, 0, 0, &flp->dlci[i], sizeof(short), NULL, NULL); - - return 0; -} - -static int sdla_deactivate(struct net_device *slave, struct net_device *master) -{ - struct frad_local *flp; - int i; - - flp = netdev_priv(slave); - - for(i=0;imaster[i] == master) - break; - - if (i == CONFIG_DLCI_MAX) - return -ENODEV; - - flp->dlci[i] = -abs(flp->dlci[i]); - - if (netif_running(slave) && (flp->config.station == FRAD_STATION_NODE)) - sdla_cmd(slave, SDLA_DEACTIVATE_DLCI, 0, 0, &flp->dlci[i], sizeof(short), NULL, NULL); - - return 0; -} - -static int sdla_assoc(struct net_device *slave, struct net_device *master) -{ - struct frad_local *flp; - int i; - - if (master->type != ARPHRD_DLCI) - return -EINVAL; - - flp = netdev_priv(slave); - - for(i=0;imaster[i]) - break; - if (abs(flp->dlci[i]) == *(short *)(master->dev_addr)) - return -EADDRINUSE; - } - - if (i == CONFIG_DLCI_MAX) - return -EMLINK; /* #### Alan: Comments on this ?? */ - - - flp->master[i] = master; - flp->dlci[i] = -*(short *)(master->dev_addr); - master->mtu = slave->mtu; - - if (netif_running(slave)) { - if (flp->config.station == FRAD_STATION_CPE) - sdla_reconfig(slave); - else - sdla_cmd(slave, SDLA_ADD_DLCI, 0, 0, master->dev_addr, sizeof(short), NULL, NULL); - } - - return 0; -} - -static int sdla_deassoc(struct net_device *slave, struct net_device *master) -{ - struct frad_local *flp; - int i; - - flp = netdev_priv(slave); - - for(i=0;imaster[i] == master) - break; - - if (i == CONFIG_DLCI_MAX) - return -ENODEV; - - flp->master[i] = NULL; - flp->dlci[i] = 0; - - - if (netif_running(slave)) { - if (flp->config.station == FRAD_STATION_CPE) - sdla_reconfig(slave); - else - sdla_cmd(slave, SDLA_DELETE_DLCI, 0, 0, master->dev_addr, sizeof(short), NULL, NULL); - } - - return 0; -} - -static int sdla_dlci_conf(struct net_device *slave, struct net_device *master, int get) -{ - struct frad_local *flp; - struct dlci_local *dlp; - int i; - short len, ret; - - flp = netdev_priv(slave); - - for(i=0;imaster[i] == master) - break; - - if (i == CONFIG_DLCI_MAX) - return -ENODEV; - - dlp = netdev_priv(master); - - ret = SDLA_RET_OK; - len = sizeof(struct dlci_conf); - if (netif_running(slave)) { - if (get) - ret = sdla_cmd(slave, SDLA_READ_DLCI_CONFIGURATION, abs(flp->dlci[i]), 0, - NULL, 0, &dlp->config, &len); - else - ret = sdla_cmd(slave, SDLA_SET_DLCI_CONFIGURATION, abs(flp->dlci[i]), 0, - &dlp->config, sizeof(struct dlci_conf) - 4 * sizeof(short), NULL, NULL); - } - - return ret == SDLA_RET_OK ? 0 : -EIO; -} - -/************************** - * - * now for the Linux driver - * - **************************/ - -/* NOTE: the DLCI driver deals with freeing the SKB!! 
*/ -static netdev_tx_t sdla_transmit(struct sk_buff *skb, - struct net_device *dev) -{ - struct frad_local *flp; - int ret, addr, accept, i; - short size; - unsigned long flags; - struct buf_entry *pbuf; - - flp = netdev_priv(dev); - ret = 0; - accept = 1; - - netif_stop_queue(dev); - - /* - * stupid GateD insists on setting up the multicast router thru us - * and we're ill equipped to handle a non Frame Relay packet at this - * time! - */ - - accept = 1; - switch (dev->type) - { - case ARPHRD_FRAD: - if (skb->dev->type != ARPHRD_DLCI) - { - netdev_warn(dev, "Non DLCI device, type %i, tried to send on FRAD module\n", - skb->dev->type); - accept = 0; - } - break; - default: - netdev_warn(dev, "unknown firmware type 0x%04X\n", - dev->type); - accept = 0; - break; - } - if (accept) - { - /* this is frame specific, but till there's a PPP module, it's the default */ - switch (flp->type) - { - case SDLA_S502A: - case SDLA_S502E: - ret = sdla_cmd(dev, SDLA_INFORMATION_WRITE, *(short *)(skb->dev->dev_addr), 0, skb->data, skb->len, NULL, NULL); - break; - case SDLA_S508: - size = sizeof(addr); - ret = sdla_cmd(dev, SDLA_INFORMATION_WRITE, *(short *)(skb->dev->dev_addr), 0, NULL, skb->len, &addr, &size); - if (ret == SDLA_RET_OK) - { - - spin_lock_irqsave(&sdla_lock, flags); - SDLA_WINDOW(dev, addr); - pbuf = (void *)(dev->mem_start + (addr & SDLA_ADDR_MASK)); - __sdla_write(dev, pbuf->buf_addr, skb->data, skb->len); - SDLA_WINDOW(dev, addr); - pbuf->opp_flag = 1; - spin_unlock_irqrestore(&sdla_lock, flags); - } - break; - } - - switch (ret) - { - case SDLA_RET_OK: - dev->stats.tx_packets++; - break; - - case SDLA_RET_CIR_OVERFLOW: - case SDLA_RET_BUF_OVERSIZE: - case SDLA_RET_NO_BUFS: - dev->stats.tx_dropped++; - break; - - default: - dev->stats.tx_errors++; - break; - } - } - netif_wake_queue(dev); - for(i=0;imaster[i]!=NULL) - netif_wake_queue(flp->master[i]); - } - - dev_kfree_skb(skb); - return NETDEV_TX_OK; -} - -static void sdla_receive(struct net_device *dev) -{ - struct net_device *master; - struct frad_local *flp; - struct dlci_local *dlp; - struct sk_buff *skb; - - struct sdla_cmd *cmd; - struct buf_info *pbufi; - struct buf_entry *pbuf; - - unsigned long flags; - int i=0, received, success, addr, buf_base, buf_top; - short dlci, len, len2, split; - - flp = netdev_priv(dev); - success = 1; - received = addr = buf_top = buf_base = 0; - len = dlci = 0; - skb = NULL; - master = NULL; - cmd = NULL; - pbufi = NULL; - pbuf = NULL; - - spin_lock_irqsave(&sdla_lock, flags); - - switch (flp->type) - { - case SDLA_S502A: - case SDLA_S502E: - cmd = (void *) (dev->mem_start + (SDLA_502_RCV_BUF & SDLA_ADDR_MASK)); - SDLA_WINDOW(dev, SDLA_502_RCV_BUF); - success = cmd->opp_flag; - if (!success) - break; - - dlci = cmd->dlci; - len = cmd->length; - break; - - case SDLA_S508: - pbufi = (void *) (dev->mem_start + (SDLA_508_RXBUF_INFO & SDLA_ADDR_MASK)); - SDLA_WINDOW(dev, SDLA_508_RXBUF_INFO); - pbuf = (void *) (dev->mem_start + ((pbufi->rse_base + flp->buffer * sizeof(struct buf_entry)) & SDLA_ADDR_MASK)); - success = pbuf->opp_flag; - if (!success) - break; - - buf_top = pbufi->buf_top; - buf_base = pbufi->buf_base; - dlci = pbuf->dlci; - len = pbuf->length; - addr = pbuf->buf_addr; - break; - } - - /* common code, find the DLCI and get the SKB */ - if (success) - { - for (i=0;idlci[i] == dlci) - break; - - if (i == CONFIG_DLCI_MAX) - { - netdev_notice(dev, "Received packet from invalid DLCI %i, ignoring\n", - dlci); - dev->stats.rx_errors++; - success = 0; - } - } - - if (success) - { - master = 
flp->master[i]; - skb = dev_alloc_skb(len + sizeof(struct frhdr)); - if (skb == NULL) - { - netdev_notice(dev, "Memory squeeze, dropping packet\n"); - dev->stats.rx_dropped++; - success = 0; - } - else - skb_reserve(skb, sizeof(struct frhdr)); - } - - /* pick up the data */ - switch (flp->type) - { - case SDLA_S502A: - case SDLA_S502E: - if (success) - __sdla_read(dev, SDLA_502_RCV_BUF + SDLA_502_DATA_OFS, skb_put(skb,len), len); - - SDLA_WINDOW(dev, SDLA_502_RCV_BUF); - cmd->opp_flag = 0; - break; - - case SDLA_S508: - if (success) - { - /* is this buffer split off the end of the internal ring buffer */ - split = addr + len > buf_top + 1 ? len - (buf_top - addr + 1) : 0; - len2 = len - split; - - __sdla_read(dev, addr, skb_put(skb, len2), len2); - if (split) - __sdla_read(dev, buf_base, skb_put(skb, split), split); - } - - /* increment the buffer we're looking at */ - SDLA_WINDOW(dev, SDLA_508_RXBUF_INFO); - flp->buffer = (flp->buffer + 1) % pbufi->rse_num; - pbuf->opp_flag = 0; - break; - } - - if (success) - { - dev->stats.rx_packets++; - dlp = netdev_priv(master); - (*dlp->receive)(skb, master); - } - - spin_unlock_irqrestore(&sdla_lock, flags); -} - -static irqreturn_t sdla_isr(int dummy, void *dev_id) -{ - struct net_device *dev; - struct frad_local *flp; - char byte; - - dev = dev_id; - - flp = netdev_priv(dev); - - if (!flp->initialized) - { - netdev_warn(dev, "irq %d for uninitialized device\n", dev->irq); - return IRQ_NONE; - } - - byte = sdla_byte(dev, flp->type == SDLA_S508 ? SDLA_508_IRQ_INTERFACE : SDLA_502_IRQ_INTERFACE); - switch (byte) - { - case SDLA_INTR_RX: - sdla_receive(dev); - break; - - /* the command will get an error return, which is processed above */ - case SDLA_INTR_MODEM: - case SDLA_INTR_STATUS: - sdla_cmd(dev, SDLA_READ_DLC_STATUS, 0, 0, NULL, 0, NULL, NULL); - break; - - case SDLA_INTR_TX: - case SDLA_INTR_COMPLETE: - case SDLA_INTR_TIMER: - netdev_warn(dev, "invalid irq flag 0x%02X\n", byte); - break; - } - - /* the S502E requires a manual acknowledgement of the interrupt */ - if (flp->type == SDLA_S502E) - { - flp->state &= ~SDLA_S502E_INTACK; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - flp->state |= SDLA_S502E_INTACK; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - } - - /* this clears the byte, informing the Z80 we're done */ - byte = 0; - sdla_write(dev, flp->type == SDLA_S508 ? 
SDLA_508_IRQ_INTERFACE : SDLA_502_IRQ_INTERFACE, &byte, sizeof(byte)); - return IRQ_HANDLED; -} - -static void sdla_poll(struct timer_list *t) -{ - struct frad_local *flp = from_timer(flp, t, timer); - struct net_device *dev = flp->dev; - - if (sdla_byte(dev, SDLA_502_RCV_BUF)) - sdla_receive(dev); - - flp->timer.expires = 1; - add_timer(&flp->timer); -} - -static int sdla_close(struct net_device *dev) -{ - struct frad_local *flp; - struct intr_info intr; - int len, i; - short dlcis[CONFIG_DLCI_MAX]; - - flp = netdev_priv(dev); - - len = 0; - for(i=0;idlci[i]) - dlcis[len++] = abs(flp->dlci[i]); - len *= 2; - - if (flp->config.station == FRAD_STATION_NODE) - { - for(i=0;idlci[i] > 0) - sdla_cmd(dev, SDLA_DEACTIVATE_DLCI, 0, 0, dlcis, len, NULL, NULL); - sdla_cmd(dev, SDLA_DELETE_DLCI, 0, 0, &flp->dlci[i], sizeof(flp->dlci[i]), NULL, NULL); - } - - memset(&intr, 0, sizeof(intr)); - /* let's start up the reception */ - switch(flp->type) - { - case SDLA_S502A: - del_timer(&flp->timer); - break; - - case SDLA_S502E: - sdla_cmd(dev, SDLA_SET_IRQ_TRIGGER, 0, 0, &intr, sizeof(char) + sizeof(short), NULL, NULL); - flp->state &= ~SDLA_S502E_INTACK; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - break; - - case SDLA_S507: - break; - - case SDLA_S508: - sdla_cmd(dev, SDLA_SET_IRQ_TRIGGER, 0, 0, &intr, sizeof(struct intr_info), NULL, NULL); - flp->state &= ~SDLA_S508_INTEN; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - break; - } - - sdla_cmd(dev, SDLA_DISABLE_COMMUNICATIONS, 0, 0, NULL, 0, NULL, NULL); - - netif_stop_queue(dev); - - return 0; -} - -struct conf_data { - struct frad_conf config; - short dlci[CONFIG_DLCI_MAX]; -}; - -static int sdla_open(struct net_device *dev) -{ - struct frad_local *flp; - struct dlci_local *dlp; - struct conf_data data; - struct intr_info intr; - int len, i; - char byte; - - flp = netdev_priv(dev); - - if (!flp->initialized) - return -EPERM; - - if (!flp->configured) - return -EPERM; - - /* time to send in the configuration */ - len = 0; - for(i=0;idlci[i]) - data.dlci[len++] = abs(flp->dlci[i]); - len *= 2; - - memcpy(&data.config, &flp->config, sizeof(struct frad_conf)); - len += sizeof(struct frad_conf); - - sdla_cmd(dev, SDLA_DISABLE_COMMUNICATIONS, 0, 0, NULL, 0, NULL, NULL); - sdla_cmd(dev, SDLA_SET_DLCI_CONFIGURATION, 0, 0, &data, len, NULL, NULL); - - if (flp->type == SDLA_S508) - flp->buffer = 0; - - sdla_cmd(dev, SDLA_ENABLE_COMMUNICATIONS, 0, 0, NULL, 0, NULL, NULL); - - /* let's start up the reception */ - memset(&intr, 0, sizeof(intr)); - switch(flp->type) - { - case SDLA_S502A: - flp->timer.expires = 1; - add_timer(&flp->timer); - break; - - case SDLA_S502E: - flp->state |= SDLA_S502E_ENABLE; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - flp->state |= SDLA_S502E_INTACK; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - byte = 0; - sdla_write(dev, SDLA_502_IRQ_INTERFACE, &byte, sizeof(byte)); - intr.flags = SDLA_INTR_RX | SDLA_INTR_STATUS | SDLA_INTR_MODEM; - sdla_cmd(dev, SDLA_SET_IRQ_TRIGGER, 0, 0, &intr, sizeof(char) + sizeof(short), NULL, NULL); - break; - - case SDLA_S507: - break; - - case SDLA_S508: - flp->state |= SDLA_S508_INTEN; - outb(flp->state, dev->base_addr + SDLA_REG_CONTROL); - byte = 0; - sdla_write(dev, SDLA_508_IRQ_INTERFACE, &byte, sizeof(byte)); - intr.flags = SDLA_INTR_RX | SDLA_INTR_STATUS | SDLA_INTR_MODEM; - intr.irq = dev->irq; - sdla_cmd(dev, SDLA_SET_IRQ_TRIGGER, 0, 0, &intr, sizeof(struct intr_info), NULL, NULL); - break; - } - - if (flp->config.station == FRAD_STATION_CPE) - { - byte = 
SDLA_ICS_STATUS_ENQ; - sdla_cmd(dev, SDLA_ISSUE_IN_CHANNEL_SIGNAL, 0, 0, &byte, sizeof(byte), NULL, NULL); - } - else - { - sdla_cmd(dev, SDLA_ADD_DLCI, 0, 0, data.dlci, len - sizeof(struct frad_conf), NULL, NULL); - for(i=0;idlci[i] > 0) - sdla_cmd(dev, SDLA_ACTIVATE_DLCI, 0, 0, &flp->dlci[i], 2*sizeof(flp->dlci[i]), NULL, NULL); - } - - /* configure any specific DLCI settings */ - for(i=0;idlci[i]) - { - dlp = netdev_priv(flp->master[i]); - if (dlp->configured) - sdla_cmd(dev, SDLA_SET_DLCI_CONFIGURATION, abs(flp->dlci[i]), 0, &dlp->config, sizeof(struct dlci_conf), NULL, NULL); - } - - netif_start_queue(dev); - - return 0; -} - -static int sdla_config(struct net_device *dev, struct frad_conf __user *conf, int get) -{ - struct frad_local *flp; - struct conf_data data; - int i; - short size; - - if (dev->type == 0xFFFF) - return -EUNATCH; - - flp = netdev_priv(dev); - - if (!get) - { - if (netif_running(dev)) - return -EBUSY; - - if(copy_from_user(&data.config, conf, sizeof(struct frad_conf))) - return -EFAULT; - - if (data.config.station & ~FRAD_STATION_NODE) - return -EINVAL; - - if (data.config.flags & ~FRAD_VALID_FLAGS) - return -EINVAL; - - if ((data.config.kbaud < 0) || - ((data.config.kbaud > 128) && (flp->type != SDLA_S508))) - return -EINVAL; - - if (data.config.clocking & ~(FRAD_CLOCK_INT | SDLA_S508_PORT_RS232)) - return -EINVAL; - - if ((data.config.mtu < 0) || (data.config.mtu > SDLA_MAX_MTU)) - return -EINVAL; - - if ((data.config.T391 < 5) || (data.config.T391 > 30)) - return -EINVAL; - - if ((data.config.T392 < 5) || (data.config.T392 > 30)) - return -EINVAL; - - if ((data.config.N391 < 1) || (data.config.N391 > 255)) - return -EINVAL; - - if ((data.config.N392 < 1) || (data.config.N392 > 10)) - return -EINVAL; - - if ((data.config.N393 < 1) || (data.config.N393 > 10)) - return -EINVAL; - - memcpy(&flp->config, &data.config, sizeof(struct frad_conf)); - flp->config.flags |= SDLA_DIRECT_RECV; - - if (flp->type == SDLA_S508) - flp->config.flags |= SDLA_TX70_RX30; - - if (dev->mtu != flp->config.mtu) - { - /* this is required to change the MTU */ - dev->mtu = flp->config.mtu; - for(i=0;imaster[i]) - flp->master[i]->mtu = flp->config.mtu; - } - - flp->config.mtu += sizeof(struct frhdr); - - /* off to the races! */ - if (!flp->configured) - sdla_start(dev); - - flp->configured = 1; - } - else - { - /* no sense reading if the CPU isn't started */ - if (netif_running(dev)) - { - size = sizeof(data); - if (sdla_cmd(dev, SDLA_READ_DLCI_CONFIGURATION, 0, 0, NULL, 0, &data, &size) != SDLA_RET_OK) - return -EIO; - } - else - if (flp->configured) - memcpy(&data.config, &flp->config, sizeof(struct frad_conf)); - else - memset(&data.config, 0, sizeof(struct frad_conf)); - - memcpy(&flp->config, &data.config, sizeof(struct frad_conf)); - data.config.flags &= FRAD_VALID_FLAGS; - data.config.mtu -= data.config.mtu > sizeof(struct frhdr) ? 
sizeof(struct frhdr) : data.config.mtu; - return copy_to_user(conf, &data.config, sizeof(struct frad_conf))?-EFAULT:0; - } - - return 0; -} - -static int sdla_xfer(struct net_device *dev, struct sdla_mem __user *info, int read) -{ - struct sdla_mem mem; - char *temp; - - if(copy_from_user(&mem, info, sizeof(mem))) - return -EFAULT; - - if (read) - { - temp = kzalloc(mem.len, GFP_KERNEL); - if (!temp) - return -ENOMEM; - sdla_read(dev, mem.addr, temp, mem.len); - if(copy_to_user(mem.data, temp, mem.len)) - { - kfree(temp); - return -EFAULT; - } - kfree(temp); - } - else - { - temp = memdup_user(mem.data, mem.len); - if (IS_ERR(temp)) - return PTR_ERR(temp); - sdla_write(dev, mem.addr, temp, mem.len); - kfree(temp); - } - return 0; -} - -static int sdla_reconfig(struct net_device *dev) -{ - struct frad_local *flp; - struct conf_data data; - int i, len; - - flp = netdev_priv(dev); - - len = 0; - for(i=0;idlci[i]) - data.dlci[len++] = flp->dlci[i]; - len *= 2; - - memcpy(&data, &flp->config, sizeof(struct frad_conf)); - len += sizeof(struct frad_conf); - - sdla_cmd(dev, SDLA_DISABLE_COMMUNICATIONS, 0, 0, NULL, 0, NULL, NULL); - sdla_cmd(dev, SDLA_SET_DLCI_CONFIGURATION, 0, 0, &data, len, NULL, NULL); - sdla_cmd(dev, SDLA_ENABLE_COMMUNICATIONS, 0, 0, NULL, 0, NULL, NULL); - - return 0; -} - -static int sdla_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - struct frad_local *flp; - - if(!capable(CAP_NET_ADMIN)) - return -EPERM; - - flp = netdev_priv(dev); - - if (!flp->initialized) - return -EINVAL; - - switch (cmd) - { - case FRAD_GET_CONF: - case FRAD_SET_CONF: - return sdla_config(dev, ifr->ifr_data, cmd == FRAD_GET_CONF); - - case SDLA_IDENTIFY: - ifr->ifr_flags = flp->type; - break; - - case SDLA_CPUSPEED: - return sdla_cpuspeed(dev, ifr); - -/* ========================================================== -NOTE: This is rather a useless action right now, as the - current driver does not support protocols other than - FR. However, Sangoma has modules for a number of - other protocols in the works. -============================================================*/ - case SDLA_PROTOCOL: - if (flp->configured) - return -EALREADY; - - switch (ifr->ifr_flags) - { - case ARPHRD_FRAD: - dev->type = ifr->ifr_flags; - break; - default: - return -ENOPROTOOPT; - } - break; - - case SDLA_CLEARMEM: - sdla_clear(dev); - break; - - case SDLA_WRITEMEM: - case SDLA_READMEM: - if(!capable(CAP_SYS_RAWIO)) - return -EPERM; - return sdla_xfer(dev, ifr->ifr_data, cmd == SDLA_READMEM); - - case SDLA_START: - sdla_start(dev); - break; - - case SDLA_STOP: - sdla_stop(dev); - break; - - default: - return -EOPNOTSUPP; - } - return 0; -} - -static int sdla_change_mtu(struct net_device *dev, int new_mtu) -{ - if (netif_running(dev)) - return -EBUSY; - - /* for now, you can't change the MTU! 
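 * (the supported path is sdla_config(), which updates dev->mtu and
 * propagates the new value to every attached DLCI master device)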
*/ - return -EOPNOTSUPP; -} - -static int sdla_set_config(struct net_device *dev, struct ifmap *map) -{ - struct frad_local *flp; - int i; - char byte; - unsigned base; - int err = -EINVAL; - - flp = netdev_priv(dev); - - if (flp->initialized) - return -EINVAL; - - for(i=0; i < ARRAY_SIZE(valid_port); i++) - if (valid_port[i] == map->base_addr) - break; - - if (i == ARRAY_SIZE(valid_port)) - return -EINVAL; - - if (!request_region(map->base_addr, SDLA_IO_EXTENTS, dev->name)){ - pr_warn("io-port 0x%04lx in use\n", dev->base_addr); - return -EINVAL; - } - base = map->base_addr; - - /* test for card types, S502A, S502E, S507, S508 */ - /* these tests shut down the card completely, so clear the state */ - flp->type = SDLA_UNKNOWN; - flp->state = 0; - - for(i=1;itype = SDLA_S502E; - goto got_type; - } - } - } - - for(byte=inb(base),i=0;itype = SDLA_S507; - goto got_type; - } - } - } - - outb(SDLA_HALT, base + SDLA_REG_CONTROL); - if ((inb(base + SDLA_S508_STS) & 0x3F) == 0x00) { - outb(SDLA_S508_INTEN, base + SDLA_REG_CONTROL); - if ((inb(base + SDLA_S508_STS) & 0x3F) == 0x10) { - outb(SDLA_HALT, base + SDLA_REG_CONTROL); - flp->type = SDLA_S508; - goto got_type; - } - } - - outb(SDLA_S502A_HALT, base + SDLA_REG_CONTROL); - if (inb(base + SDLA_S502_STS) == 0x40) { - outb(SDLA_S502A_START, base + SDLA_REG_CONTROL); - if (inb(base + SDLA_S502_STS) == 0x40) { - outb(SDLA_S502A_INTEN, base + SDLA_REG_CONTROL); - if (inb(base + SDLA_S502_STS) == 0x44) { - outb(SDLA_S502A_START, base + SDLA_REG_CONTROL); - flp->type = SDLA_S502A; - goto got_type; - } - } - } - - netdev_notice(dev, "Unknown card type\n"); - err = -ENODEV; - goto fail; - -got_type: - switch(base) { - case 0x270: - case 0x280: - case 0x380: - case 0x390: - if (flp->type != SDLA_S508 && flp->type != SDLA_S507) - goto fail; - } - - switch (map->irq) { - case 2: - if (flp->type != SDLA_S502E) - goto fail; - break; - - case 10: - case 11: - case 12: - case 15: - case 4: - if (flp->type != SDLA_S508 && flp->type != SDLA_S507) - goto fail; - break; - case 3: - case 5: - case 7: - if (flp->type == SDLA_S502A) - goto fail; - break; - - default: - goto fail; - } - - err = -EAGAIN; - if (request_irq(dev->irq, sdla_isr, 0, dev->name, dev)) - goto fail; - - if (flp->type == SDLA_S507) { - switch(dev->irq) { - case 3: - flp->state = SDLA_S507_IRQ3; - break; - case 4: - flp->state = SDLA_S507_IRQ4; - break; - case 5: - flp->state = SDLA_S507_IRQ5; - break; - case 7: - flp->state = SDLA_S507_IRQ7; - break; - case 10: - flp->state = SDLA_S507_IRQ10; - break; - case 11: - flp->state = SDLA_S507_IRQ11; - break; - case 12: - flp->state = SDLA_S507_IRQ12; - break; - case 15: - flp->state = SDLA_S507_IRQ15; - break; - } - } - - for(i=0; i < ARRAY_SIZE(valid_mem); i++) - if (valid_mem[i] == map->mem_start) - break; - - err = -EINVAL; - if (i == ARRAY_SIZE(valid_mem)) - goto fail2; - - if (flp->type == SDLA_S502A && (map->mem_start & 0xF000) >> 12 == 0x0E) - goto fail2; - - if (flp->type != SDLA_S507 && map->mem_start >> 16 == 0x0B) - goto fail2; - - if (flp->type == SDLA_S507 && map->mem_start >> 16 == 0x0D) - goto fail2; - - byte = flp->type != SDLA_S508 ? SDLA_8K_WINDOW : 0; - byte |= (map->mem_start & 0xF000) >> (12 + (flp->type == SDLA_S508 ? 
1 : 0)); - switch(flp->type) { - case SDLA_S502A: - case SDLA_S502E: - switch (map->mem_start >> 16) { - case 0x0A: - byte |= SDLA_S502_SEG_A; - break; - case 0x0C: - byte |= SDLA_S502_SEG_C; - break; - case 0x0D: - byte |= SDLA_S502_SEG_D; - break; - case 0x0E: - byte |= SDLA_S502_SEG_E; - break; - } - break; - case SDLA_S507: - switch (map->mem_start >> 16) { - case 0x0A: - byte |= SDLA_S507_SEG_A; - break; - case 0x0B: - byte |= SDLA_S507_SEG_B; - break; - case 0x0C: - byte |= SDLA_S507_SEG_C; - break; - case 0x0E: - byte |= SDLA_S507_SEG_E; - break; - } - break; - case SDLA_S508: - switch (map->mem_start >> 16) { - case 0x0A: - byte |= SDLA_S508_SEG_A; - break; - case 0x0C: - byte |= SDLA_S508_SEG_C; - break; - case 0x0D: - byte |= SDLA_S508_SEG_D; - break; - case 0x0E: - byte |= SDLA_S508_SEG_E; - break; - } - break; - } - - /* set the memory bits, and enable access */ - outb(byte, base + SDLA_REG_PC_WINDOW); - - switch(flp->type) - { - case SDLA_S502E: - flp->state = SDLA_S502E_ENABLE; - break; - case SDLA_S507: - flp->state |= SDLA_MEMEN; - break; - case SDLA_S508: - flp->state = SDLA_MEMEN; - break; - } - outb(flp->state, base + SDLA_REG_CONTROL); - - dev->irq = map->irq; - dev->base_addr = base; - dev->mem_start = map->mem_start; - dev->mem_end = dev->mem_start + 0x2000; - flp->initialized = 1; - return 0; - -fail2: - free_irq(map->irq, dev); -fail: - release_region(base, SDLA_IO_EXTENTS); - return err; -} - -static const struct net_device_ops sdla_netdev_ops = { - .ndo_open = sdla_open, - .ndo_stop = sdla_close, - .ndo_do_ioctl = sdla_ioctl, - .ndo_set_config = sdla_set_config, - .ndo_start_xmit = sdla_transmit, - .ndo_change_mtu = sdla_change_mtu, -}; - -static void setup_sdla(struct net_device *dev) -{ - struct frad_local *flp = netdev_priv(dev); - - netdev_boot_setup_check(dev); - - dev->netdev_ops = &sdla_netdev_ops; - dev->flags = 0; - dev->type = 0xFFFF; - dev->hard_header_len = 0; - dev->addr_len = 0; - dev->mtu = SDLA_MAX_MTU; - - flp->activate = sdla_activate; - flp->deactivate = sdla_deactivate; - flp->assoc = sdla_assoc; - flp->deassoc = sdla_deassoc; - flp->dlci_conf = sdla_dlci_conf; - flp->dev = dev; - - timer_setup(&flp->timer, sdla_poll, 0); - flp->timer.expires = 1; -} - -static struct net_device *sdla; - -static int __init init_sdla(void) -{ - int err; - - printk("%s.\n", version); - - sdla = alloc_netdev(sizeof(struct frad_local), "sdla0", - NET_NAME_UNKNOWN, setup_sdla); - if (!sdla) - return -ENOMEM; - - err = register_netdev(sdla); - if (err) - free_netdev(sdla); - - return err; -} - -static void __exit exit_sdla(void) -{ - struct frad_local *flp = netdev_priv(sdla); - - unregister_netdev(sdla); - if (flp->initialized) { - free_irq(sdla->irq, sdla); - release_region(sdla->base_addr, SDLA_IO_EXTENTS); - } - del_timer_sync(&flp->timer); - free_netdev(sdla); -} - -MODULE_LICENSE("GPL"); - -module_init(init_sdla); -module_exit(exit_sdla); diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h deleted file mode 100644 index 52224de798aa..000000000000 --- a/include/linux/if_frad.h +++ /dev/null @@ -1,92 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * DLCI/FRAD Definitions for Frame Relay Access Devices. DLCI devices are - * created for each DLCI associated with a FRAD. The FRAD driver - * is not truly a network device, but the lower level device - * handler. This allows other FRAD manufacturers to use the DLCI - * code, including its RFC1490 encapsulation alongside the current - * implementation for the Sangoma cards. 
- * - * Version: @(#)if_ifrad.h 0.15 31 Mar 96 - * - * Author: Mike McLagan - * - * Changes: - * 0.15 Mike McLagan changed structure defs (packed) - * re-arranged flags - * added DLCI_RET vars - */ -#ifndef _FRAD_H_ -#define _FRAD_H_ - -#include - - -#if defined(CONFIG_DLCI) || defined(CONFIG_DLCI_MODULE) - -/* these are the fields of an RFC 1490 header */ -struct frhdr -{ - unsigned char control; - - /* for IP packets, this can be the NLPID */ - unsigned char pad; - - unsigned char NLPID; - unsigned char OUI[3]; - __be16 PID; - -#define IP_NLPID pad -} __packed; - -/* see RFC 1490 for the definition of the following */ -#define FRAD_I_UI 0x03 - -#define FRAD_P_PADDING 0x00 -#define FRAD_P_Q933 0x08 -#define FRAD_P_SNAP 0x80 -#define FRAD_P_CLNP 0x81 -#define FRAD_P_IP 0xCC - -struct dlci_local -{ - struct net_device *master; - struct net_device *slave; - struct dlci_conf config; - int configured; - struct list_head list; - - /* callback function */ - void (*receive)(struct sk_buff *skb, struct net_device *); -}; - -struct frad_local -{ - /* devices which this FRAD is slaved to */ - struct net_device *master[CONFIG_DLCI_MAX]; - short dlci[CONFIG_DLCI_MAX]; - - struct frad_conf config; - int configured; /* has this device been configured */ - int initialized; /* mem_start, port, irq set ? */ - - /* callback functions */ - int (*activate)(struct net_device *, struct net_device *); - int (*deactivate)(struct net_device *, struct net_device *); - int (*assoc)(struct net_device *, struct net_device *); - int (*deassoc)(struct net_device *, struct net_device *); - int (*dlci_conf)(struct net_device *, struct net_device *, int get); - - /* fields that are used by the Sangoma SDLA cards */ - struct timer_list timer; - struct net_device *dev; - int type; /* adapter type */ - int state; /* state of the S502/8 control latch */ - int buffer; /* current buffer for S508 firmware */ -}; - -#endif /* CONFIG_DLCI || CONFIG_DLCI_MODULE */ - -extern void dlci_ioctl_set(int (*hook)(unsigned int, void __user *)); - -#endif diff --git a/include/linux/sdla.h b/include/linux/sdla.h deleted file mode 100644 index 00e8b3b614f0..000000000000 --- a/include/linux/sdla.h +++ /dev/null @@ -1,240 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * Global definitions for the Frame relay interface. 
- * - * Version: @(#)if_ifrad.h 0.20 13 Apr 96 - * - * Author: Mike McLagan - * - * Changes: - * 0.15 Mike McLagan Structure packing - * - * 0.20 Mike McLagan New flags for S508 buffer handling - */ -#ifndef SDLA_H -#define SDLA_H - -#include - - -/* important Z80 window addresses */ -#define SDLA_CONTROL_WND 0xE000 - -#define SDLA_502_CMD_BUF 0xEF60 -#define SDLA_502_RCV_BUF 0xA900 -#define SDLA_502_TXN_AVAIL 0xFFF1 -#define SDLA_502_RCV_AVAIL 0xFFF2 -#define SDLA_502_EVENT_FLAGS 0xFFF3 -#define SDLA_502_MDM_STATUS 0xFFF4 -#define SDLA_502_IRQ_INTERFACE 0xFFFD -#define SDLA_502_IRQ_PERMISSION 0xFFFE -#define SDLA_502_DATA_OFS 0x0010 - -#define SDLA_508_CMD_BUF 0xE000 -#define SDLA_508_TXBUF_INFO 0xF100 -#define SDLA_508_RXBUF_INFO 0xF120 -#define SDLA_508_EVENT_FLAGS 0xF003 -#define SDLA_508_MDM_STATUS 0xF004 -#define SDLA_508_IRQ_INTERFACE 0xF010 -#define SDLA_508_IRQ_PERMISSION 0xF011 -#define SDLA_508_TSE_OFFSET 0xF012 - -/* Event flags */ -#define SDLA_EVENT_STATUS 0x01 -#define SDLA_EVENT_DLCI_STATUS 0x02 -#define SDLA_EVENT_BAD_DLCI 0x04 -#define SDLA_EVENT_LINK_DOWN 0x40 - -/* IRQ Trigger flags */ -#define SDLA_INTR_RX 0x01 -#define SDLA_INTR_TX 0x02 -#define SDLA_INTR_MODEM 0x04 -#define SDLA_INTR_COMPLETE 0x08 -#define SDLA_INTR_STATUS 0x10 -#define SDLA_INTR_TIMER 0x20 - -/* DLCI status bits */ -#define SDLA_DLCI_DELETED 0x01 -#define SDLA_DLCI_ACTIVE 0x02 -#define SDLA_DLCI_WAITING 0x04 -#define SDLA_DLCI_NEW 0x08 -#define SDLA_DLCI_INCLUDED 0x40 - -/* valid command codes */ -#define SDLA_INFORMATION_WRITE 0x01 -#define SDLA_INFORMATION_READ 0x02 -#define SDLA_ISSUE_IN_CHANNEL_SIGNAL 0x03 -#define SDLA_SET_DLCI_CONFIGURATION 0x10 -#define SDLA_READ_DLCI_CONFIGURATION 0x11 -#define SDLA_DISABLE_COMMUNICATIONS 0x12 -#define SDLA_ENABLE_COMMUNICATIONS 0x13 -#define SDLA_READ_DLC_STATUS 0x14 -#define SDLA_READ_DLC_STATISTICS 0x15 -#define SDLA_FLUSH_DLC_STATISTICS 0x16 -#define SDLA_LIST_ACTIVE_DLCI 0x17 -#define SDLA_FLUSH_INFORMATION_BUFFERS 0x18 -#define SDLA_ADD_DLCI 0x20 -#define SDLA_DELETE_DLCI 0x21 -#define SDLA_ACTIVATE_DLCI 0x22 -#define SDLA_DEACTIVATE_DLCI 0x23 -#define SDLA_READ_MODEM_STATUS 0x30 -#define SDLA_SET_MODEM_STATUS 0x31 -#define SDLA_READ_COMMS_ERR_STATS 0x32 -#define SDLA_FLUSH_COMMS_ERR_STATS 0x33 -#define SDLA_READ_CODE_VERSION 0x40 -#define SDLA_SET_IRQ_TRIGGER 0x50 -#define SDLA_GET_IRQ_TRIGGER 0x51 - -/* In channel signal types */ -#define SDLA_ICS_LINK_VERIFY 0x02 -#define SDLA_ICS_STATUS_ENQ 0x03 - -/* modem status flags */ -#define SDLA_MODEM_DTR_HIGH 0x01 -#define SDLA_MODEM_RTS_HIGH 0x02 -#define SDLA_MODEM_DCD_HIGH 0x08 -#define SDLA_MODEM_CTS_HIGH 0x20 - -/* used for RET_MODEM interpretation */ -#define SDLA_MODEM_DCD_LOW 0x01 -#define SDLA_MODEM_CTS_LOW 0x02 - -/* return codes */ -#define SDLA_RET_OK 0x00 -#define SDLA_RET_COMMUNICATIONS 0x01 -#define SDLA_RET_CHANNEL_INACTIVE 0x02 -#define SDLA_RET_DLCI_INACTIVE 0x03 -#define SDLA_RET_DLCI_CONFIG 0x04 -#define SDLA_RET_BUF_TOO_BIG 0x05 -#define SDLA_RET_NO_DATA 0x05 -#define SDLA_RET_BUF_OVERSIZE 0x06 -#define SDLA_RET_CIR_OVERFLOW 0x07 -#define SDLA_RET_NO_BUFS 0x08 -#define SDLA_RET_TIMEOUT 0x0A -#define SDLA_RET_MODEM 0x10 -#define SDLA_RET_CHANNEL_OFF 0x11 -#define SDLA_RET_CHANNEL_ON 0x12 -#define SDLA_RET_DLCI_STATUS 0x13 -#define SDLA_RET_DLCI_UNKNOWN 0x14 -#define SDLA_RET_COMMAND_INVALID 0x1F - -/* Configuration flags */ -#define SDLA_DIRECT_RECV 0x0080 -#define SDLA_TX_NO_EXCEPT 0x0020 -#define SDLA_NO_ICF_MSGS 0x1000 -#define SDLA_TX50_RX50 0x0000 -#define SDLA_TX70_RX30 0x2000 
-#define SDLA_TX30_RX70 0x4000 - -/* IRQ selection flags */ -#define SDLA_IRQ_RECEIVE 0x01 -#define SDLA_IRQ_TRANSMIT 0x02 -#define SDLA_IRQ_MODEM_STAT 0x04 -#define SDLA_IRQ_COMMAND 0x08 -#define SDLA_IRQ_CHANNEL 0x10 -#define SDLA_IRQ_TIMER 0x20 - -/* definitions for PC memory mapping */ -#define SDLA_8K_WINDOW 0x01 -#define SDLA_S502_SEG_A 0x10 -#define SDLA_S502_SEG_C 0x20 -#define SDLA_S502_SEG_D 0x00 -#define SDLA_S502_SEG_E 0x30 -#define SDLA_S507_SEG_A 0x00 -#define SDLA_S507_SEG_B 0x40 -#define SDLA_S507_SEG_C 0x80 -#define SDLA_S507_SEG_E 0xC0 -#define SDLA_S508_SEG_A 0x00 -#define SDLA_S508_SEG_C 0x10 -#define SDLA_S508_SEG_D 0x08 -#define SDLA_S508_SEG_E 0x18 - -/* SDLA adapter port constants */ -#define SDLA_IO_EXTENTS 0x04 - -#define SDLA_REG_CONTROL 0x00 -#define SDLA_REG_PC_WINDOW 0x01 /* offset for PC window select latch */ -#define SDLA_REG_Z80_WINDOW 0x02 /* offset for Z80 window select latch */ -#define SDLA_REG_Z80_CONTROL 0x03 /* offset for Z80 control latch */ - -#define SDLA_S502_STS 0x00 /* status reg for 502, 502E, 507 */ -#define SDLA_S508_GNRL 0x00 /* general purp. reg for 508 */ -#define SDLA_S508_STS 0x01 /* status reg for 508 */ -#define SDLA_S508_IDR 0x02 /* ID reg for 508 */ - -/* control register flags */ -#define SDLA_S502A_START 0x00 /* start the CPU */ -#define SDLA_S502A_INTREQ 0x02 -#define SDLA_S502A_INTEN 0x04 -#define SDLA_S502A_HALT 0x08 /* halt the CPU */ -#define SDLA_S502A_NMI 0x10 /* issue an NMI to the CPU */ - -#define SDLA_S502E_CPUEN 0x01 -#define SDLA_S502E_ENABLE 0x02 -#define SDLA_S502E_INTACK 0x04 - -#define SDLA_S507_ENABLE 0x01 -#define SDLA_S507_IRQ3 0x00 -#define SDLA_S507_IRQ4 0x20 -#define SDLA_S507_IRQ5 0x40 -#define SDLA_S507_IRQ7 0x60 -#define SDLA_S507_IRQ10 0x80 -#define SDLA_S507_IRQ11 0xA0 -#define SDLA_S507_IRQ12 0xC0 -#define SDLA_S507_IRQ15 0xE0 - -#define SDLA_HALT 0x00 -#define SDLA_CPUEN 0x02 -#define SDLA_MEMEN 0x04 -#define SDLA_S507_EPROMWR 0x08 -#define SDLA_S507_EPROMCLK 0x10 -#define SDLA_S508_INTRQ 0x08 -#define SDLA_S508_INTEN 0x10 - -struct sdla_cmd { - char opp_flag; - char cmd; - short length; - char retval; - short dlci; - char flags; - short rxlost_int; - long rxlost_app; - char reserve[2]; - char data[SDLA_MAX_DATA]; /* transfer data buffer */ -} __attribute__((packed)); - -struct intr_info { - char flags; - short txlen; - char irq; - char flags2; - short timeout; -} __attribute__((packed)); - -/* found in the 508's control window at RXBUF_INFO */ -struct buf_info { - unsigned short rse_num; - unsigned long rse_base; - unsigned long rse_next; - unsigned long buf_base; - unsigned short reserved; - unsigned long buf_top; -} __attribute__((packed)); - -/* structure pointed to by rse_base in RXBUF_INFO struct */ -struct buf_entry { - char opp_flag; - short length; - short dlci; - char flags; - short timestamp; - short reserved[2]; - long buf_addr; -} __attribute__((packed)); - -#endif diff --git a/include/uapi/linux/if_frad.h b/include/uapi/linux/if_frad.h deleted file mode 100644 index 3c6ee85f6262..000000000000 --- a/include/uapi/linux/if_frad.h +++ /dev/null @@ -1,123 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -/* - * DLCI/FRAD Definitions for Frame Relay Access Devices. DLCI devices are - * created for each DLCI associated with a FRAD. The FRAD driver - * is not truly a network device, but the lower level device - * handler. 
This allows other FRAD manufacturers to use the DLCI - * code, including its RFC1490 encapsulation alongside the current - * implementation for the Sangoma cards. - * - * Version: @(#)if_ifrad.h 0.15 31 Mar 96 - * - * Author: Mike McLagan - * - * Changes: - * 0.15 Mike McLagan changed structure defs (packed) - * re-arranged flags - * added DLCI_RET vars - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _UAPI_FRAD_H_ -#define _UAPI_FRAD_H_ - -#include - -/* Structures and constants associated with the DLCI device driver */ - -struct dlci_add -{ - char devname[IFNAMSIZ]; - short dlci; -}; - -#define DLCI_GET_CONF (SIOCDEVPRIVATE + 2) -#define DLCI_SET_CONF (SIOCDEVPRIVATE + 3) - -/* - * These are related to the Sangoma SDLA and should remain in order. - * Code within the SDLA module is based on the specifics of this - * structure. Change at your own peril. - */ -struct dlci_conf { - short flags; - short CIR_fwd; - short Bc_fwd; - short Be_fwd; - short CIR_bwd; - short Bc_bwd; - short Be_bwd; - -/* these are part of the status read */ - short Tc_fwd; - short Tc_bwd; - short Tf_max; - short Tb_max; - -/* add any new fields here above is a mirror of sdla_dlci_conf */ -}; - -#define DLCI_GET_SLAVE (SIOCDEVPRIVATE + 4) - -/* configuration flags for DLCI */ -#define DLCI_IGNORE_CIR_OUT 0x0001 -#define DLCI_ACCOUNT_CIR_IN 0x0002 -#define DLCI_BUFFER_IF 0x0008 - -#define DLCI_VALID_FLAGS 0x000B - -/* defines for the actual Frame Relay hardware */ -#define FRAD_GET_CONF (SIOCDEVPRIVATE) -#define FRAD_SET_CONF (SIOCDEVPRIVATE + 1) - -#define FRAD_LAST_IOCTL FRAD_SET_CONF - -/* - * Based on the setup for the Sangoma SDLA. If changes are - * necessary to this structure, a routine will need to be - * added to that module to copy fields. - */ -struct frad_conf -{ - short station; - short flags; - short kbaud; - short clocking; - short mtu; - short T391; - short T392; - short N391; - short N392; - short N393; - short CIR_fwd; - short Bc_fwd; - short Be_fwd; - short CIR_bwd; - short Bc_bwd; - short Be_bwd; - -/* Add new fields here, above is a mirror of the sdla_conf */ - -}; - -#define FRAD_STATION_CPE 0x0000 -#define FRAD_STATION_NODE 0x0001 - -#define FRAD_TX_IGNORE_CIR 0x0001 -#define FRAD_RX_ACCOUNT_CIR 0x0002 -#define FRAD_DROP_ABORTED 0x0004 -#define FRAD_BUFFERIF 0x0008 -#define FRAD_STATS 0x0010 -#define FRAD_MCI 0x0100 -#define FRAD_AUTODLCI 0x8000 -#define FRAD_VALID_FLAGS 0x811F - -#define FRAD_CLOCK_INT 0x0001 -#define FRAD_CLOCK_EXT 0x0000 - - -#endif /* _UAPI_FRAD_H_ */ diff --git a/include/uapi/linux/sdla.h b/include/uapi/linux/sdla.h deleted file mode 100644 index 1e3735be6511..000000000000 --- a/include/uapi/linux/sdla.h +++ /dev/null @@ -1,117 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * Global definitions for the Frame relay interface. 
- * - * Version: @(#)if_ifrad.h 0.20 13 Apr 96 - * - * Author: Mike McLagan - * - * Changes: - * 0.15 Mike McLagan Structure packing - * - * 0.20 Mike McLagan New flags for S508 buffer handling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _UAPISDLA_H -#define _UAPISDLA_H - -/* adapter type */ -#define SDLA_TYPES -#define SDLA_S502A 5020 -#define SDLA_S502E 5021 -#define SDLA_S503 5030 -#define SDLA_S507 5070 -#define SDLA_S508 5080 -#define SDLA_S509 5090 -#define SDLA_UNKNOWN -1 - -/* port selection flags for the S508 */ -#define SDLA_S508_PORT_V35 0x00 -#define SDLA_S508_PORT_RS232 0x02 - -/* Z80 CPU speeds */ -#define SDLA_CPU_3M 0x00 -#define SDLA_CPU_5M 0x01 -#define SDLA_CPU_7M 0x02 -#define SDLA_CPU_8M 0x03 -#define SDLA_CPU_10M 0x04 -#define SDLA_CPU_16M 0x05 -#define SDLA_CPU_12M 0x06 - -/* some private IOCTLs */ -#define SDLA_IDENTIFY (FRAD_LAST_IOCTL + 1) -#define SDLA_CPUSPEED (FRAD_LAST_IOCTL + 2) -#define SDLA_PROTOCOL (FRAD_LAST_IOCTL + 3) - -#define SDLA_CLEARMEM (FRAD_LAST_IOCTL + 4) -#define SDLA_WRITEMEM (FRAD_LAST_IOCTL + 5) -#define SDLA_READMEM (FRAD_LAST_IOCTL + 6) - -struct sdla_mem { - int addr; - int len; - void __user *data; -}; - -#define SDLA_START (FRAD_LAST_IOCTL + 7) -#define SDLA_STOP (FRAD_LAST_IOCTL + 8) - -/* some offsets in the Z80's memory space */ -#define SDLA_NMIADDR 0x0000 -#define SDLA_CONF_ADDR 0x0010 -#define SDLA_S502A_NMIADDR 0x0066 -#define SDLA_CODE_BASEADDR 0x0100 -#define SDLA_WINDOW_SIZE 0x2000 -#define SDLA_ADDR_MASK 0x1FFF - -/* largest handleable block of data */ -#define SDLA_MAX_DATA 4080 -#define SDLA_MAX_MTU 4072 /* MAX_DATA - sizeof(fradhdr) */ -#define SDLA_MAX_DLCI 24 - -/* this should be the same as frad_conf */ -struct sdla_conf { - short station; - short config; - short kbaud; - short clocking; - short max_frm; - short T391; - short T392; - short N391; - short N392; - short N393; - short CIR_fwd; - short Bc_fwd; - short Be_fwd; - short CIR_bwd; - short Bc_bwd; - short Be_bwd; -}; - -/* this should be the same as dlci_conf */ -struct sdla_dlci_conf { - short config; - short CIR_fwd; - short Bc_fwd; - short Be_fwd; - short CIR_bwd; - short Bc_bwd; - short Be_bwd; - short Tc_fwd; - short Tc_bwd; - short Tf_max; - short Tb_max; -}; - - -#endif /* _UAPISDLA_H */ diff --git a/net/socket.c b/net/socket.c index 6e6cccc2104f..152b1dcf93c6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include #include @@ -1027,17 +1026,6 @@ void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) } EXPORT_SYMBOL(vlan_ioctl_set); -static DEFINE_MUTEX(dlci_ioctl_mutex); -static int (*dlci_ioctl_hook) (unsigned int, void __user *); - -void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) -{ - mutex_lock(&dlci_ioctl_mutex); - dlci_ioctl_hook = hook; - mutex_unlock(&dlci_ioctl_mutex); -} -EXPORT_SYMBOL(dlci_ioctl_set); - static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { @@ -1156,17 +1144,6 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = vlan_ioctl_hook(net, argp); mutex_unlock(&vlan_ioctl_mutex); break; - case SIOCADDDLCI: - case SIOCDELDLCI: - err = -ENOPKG; - if (!dlci_ioctl_hook) - request_module("dlci"); - - mutex_lock(&dlci_ioctl_mutex); - if (dlci_ioctl_hook) 
- err = dlci_ioctl_hook(cmd, argp); - mutex_unlock(&dlci_ioctl_mutex); - break; case SIOCGSKNS: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -3427,8 +3404,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCBRDELBR: case SIOCGIFVLAN: case SIOCSIFVLAN: - case SIOCADDDLCI: - case SIOCDELDLCI: case SIOCGSKNS: case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: -- cgit From 270f3385cddf5db3799b393459476bd9abff89f1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:17:59 +0100 Subject: net: core: fix some kernel-doc markups Some identifiers have different names between their prototypes and the kernel-doc markup. In the specific case of netif_subqueue_stopped(), keep the current markup for __netif_subqueue_stopped(), adding a new one for netif_subqueue_stopped(). Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ce648a564f7..03433a4c929e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1490,7 +1490,7 @@ struct net_device_ops { }; /** - * enum net_device_priv_flags - &struct net_device priv_flags + * enum netdev_priv_flags - &struct net_device priv_flags * * These are the &struct net_device, they are only set internally * by drivers and used in the kernel. These flags are invisible to @@ -3602,7 +3602,7 @@ static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index) } /** - * netif_subqueue_stopped - test status of subqueue + * __netif_subqueue_stopped - test status of subqueue * @dev: network device * @queue_index: sub queue index * @@ -3616,6 +3616,13 @@ static inline bool __netif_subqueue_stopped(const struct net_device *dev, return netif_tx_queue_stopped(txq); } +/** + * netif_subqueue_stopped - test status of subqueue + * @dev: network device + * @skb: sub queue buffer pointer + * + * Check individual transmit queue of a device with multiple transmit queues. + */ static inline bool netif_subqueue_stopped(const struct net_device *dev, struct sk_buff *skb) { -- cgit From ed5298c7d500abaf34ed7783969e953a1f028e5b Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Mon, 28 Sep 2020 09:39:50 +0530 Subject: bus: mhi: Remove auto-start option There is really no point in having an auto-start option for channels. It is confusing for device drivers: some have to enable their channels while others don't, and auto-started channels waste resources (e.g. pre-allocated buffers) that may never be used. It is really up to the MHI device (channel) driver to manage the state of its channels. While at it, also remove the auto-start option from the ath11k mhi controller.
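With auto-start gone, a client driver is expected to start its own channels explicitly, typically from its probe callback. A minimal sketch of that pattern, assuming a hypothetical client driver (the function name and everything around the mhi_prepare_for_transfer() call are illustrative, not part of this patch):

static int my_mhi_client_probe(struct mhi_device *mhi_dev,
			       const struct mhi_device_id *id)
{
	int ret;

	/* Explicitly move this device's UL/DL channels to the started state */
	ret = mhi_prepare_for_transfer(mhi_dev);
	if (ret)
		return ret;

	/* ... allocate driver state, queue initial RX buffers, etc. ... */
	return 0;
}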
Signed-off-by: Loic Poulain Acked-by: Kalle Valo Reviewed-by: Manivannan Sadhasivam [mani: clubbed ath11k change] Signed-off-by: Manivannan Sadhasivam --- drivers/bus/mhi/core/init.c | 9 --------- drivers/bus/mhi/core/internal.h | 1 - drivers/net/wireless/ath/ath11k/mhi.c | 4 ---- include/linux/mhi.h | 2 -- 4 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c index 0ffdebde8265..381fdea2eb9f 100644 --- a/drivers/bus/mhi/core/init.c +++ b/drivers/bus/mhi/core/init.c @@ -758,7 +758,6 @@ static int parse_ch_cfg(struct mhi_controller *mhi_cntrl, mhi_chan->offload_ch = ch_cfg->offload_channel; mhi_chan->db_cfg.reset_req = ch_cfg->doorbell_mode_switch; mhi_chan->pre_alloc = ch_cfg->auto_queue; - mhi_chan->auto_start = ch_cfg->auto_start; /* * If MHI host allocates buffers, then the channel direction @@ -1160,11 +1159,6 @@ static int mhi_driver_probe(struct device *dev) goto exit_probe; ul_chan->xfer_cb = mhi_drv->ul_xfer_cb; - if (ul_chan->auto_start) { - ret = mhi_prepare_channel(mhi_cntrl, ul_chan); - if (ret) - goto exit_probe; - } } ret = -EINVAL; @@ -1198,9 +1192,6 @@ static int mhi_driver_probe(struct device *dev) if (ret) goto exit_probe; - if (dl_chan && dl_chan->auto_start) - mhi_prepare_channel(mhi_cntrl, dl_chan); - mhi_device_put(mhi_dev); return ret; diff --git a/drivers/bus/mhi/core/internal.h b/drivers/bus/mhi/core/internal.h index 7989269ddd96..33c23203c531 100644 --- a/drivers/bus/mhi/core/internal.h +++ b/drivers/bus/mhi/core/internal.h @@ -563,7 +563,6 @@ struct mhi_chan { bool configured; bool offload_ch; bool pre_alloc; - bool auto_start; bool wake_capable; }; diff --git a/drivers/net/wireless/ath/ath11k/mhi.c b/drivers/net/wireless/ath/ath11k/mhi.c index aded9a719d51..47a1ce1bee4f 100644 --- a/drivers/net/wireless/ath/ath11k/mhi.c +++ b/drivers/net/wireless/ath/ath11k/mhi.c @@ -24,7 +24,6 @@ static struct mhi_channel_config ath11k_mhi_channels[] = { .offload_channel = false, .doorbell_mode_switch = false, .auto_queue = false, - .auto_start = false, }, { .num = 1, @@ -39,7 +38,6 @@ static struct mhi_channel_config ath11k_mhi_channels[] = { .offload_channel = false, .doorbell_mode_switch = false, .auto_queue = false, - .auto_start = false, }, { .num = 20, @@ -54,7 +52,6 @@ static struct mhi_channel_config ath11k_mhi_channels[] = { .offload_channel = false, .doorbell_mode_switch = false, .auto_queue = false, - .auto_start = true, }, { .num = 21, @@ -69,7 +66,6 @@ static struct mhi_channel_config ath11k_mhi_channels[] = { .offload_channel = false, .doorbell_mode_switch = false, .auto_queue = true, - .auto_start = true, }, }; diff --git a/include/linux/mhi.h b/include/linux/mhi.h index d4841e5a5f45..6522a4adc794 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -214,7 +214,6 @@ enum mhi_db_brst_mode { * @offload_channel: The client manages the channel completely * @doorbell_mode_switch: Channel switches to doorbell mode on M0 transition * @auto_queue: Framework will automatically queue buffers for DL traffic - * @auto_start: Automatically start (open) this channel * @wake-capable: Channel capable of waking up the system */ struct mhi_channel_config { @@ -232,7 +231,6 @@ struct mhi_channel_config { bool offload_channel; bool doorbell_mode_switch; bool auto_queue; - bool auto_start; bool wake_capable; }; -- cgit From 16fee29b07358293f135759d9fdbf1267da57ebd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Nov 2020 17:02:17 +0100 Subject: dma-mapping: remove the 
dma_direct_set_offset export Drop the dma_direct_set_offset export and move the declaration to dma-map-ops.h now that the Allwinner drivers have stopped calling it. Signed-off-by: Christoph Hellwig Signed-off-by: Maxime Ripard --- arch/arm/mach-keystone/keystone.c | 2 +- arch/arm/mach-omap1/usb.c | 2 +- arch/sh/drivers/pci/pcie-sh7786.c | 2 +- arch/x86/pci/sta2x11-fixup.c | 3 ++- include/linux/dma-map-ops.h | 3 +++ include/linux/dma-mapping.h | 7 ------- kernel/dma/direct.c | 1 - 7 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-keystone/keystone.c b/arch/arm/mach-keystone/keystone.c index 09a65c2dfd73..cd711bfc591f 100644 --- a/arch/arm/mach-keystone/keystone.c +++ b/arch/arm/mach-keystone/keystone.c @@ -8,7 +8,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-omap1/usb.c b/arch/arm/mach-omap1/usb.c index ba8566204ea9..86d3b3c157af 100644 --- a/arch/arm/mach-omap1/usb.c +++ b/arch/arm/mach-omap1/usb.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/drivers/pci/pcie-sh7786.c b/arch/sh/drivers/pci/pcie-sh7786.c index 4468289ab2ca..4d499476c33a 100644 --- a/arch/sh/drivers/pci/pcie-sh7786.c +++ b/arch/sh/drivers/pci/pcie-sh7786.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 5701d5ba3df4..7d2525691854 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -11,7 +11,8 @@ #include #include #include -#include +#include +#include #include #define STA2X11_SWIOTLB_SIZE (4*1024*1024) diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index a5f89fc4d6df..03925e438ec3 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -226,6 +226,9 @@ struct page *dma_alloc_from_pool(struct device *dev, size_t size, bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)); bool dma_free_from_pool(struct device *dev, void *start, size_t size); +int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, + dma_addr_t dma_start, u64 size); + #ifdef CONFIG_ARCH_HAS_DMA_COHERENCE_H #include #elif defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 956151052d45..199d85285246 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -558,13 +558,6 @@ static inline int dma_mmap_wc(struct device *dev, #define dma_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) #endif -/* - * Legacy interface to set up the dma offset map. Drivers really should not - * actually use it, but we have a few legacy cases left. 
- */ -int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, - dma_addr_t dma_start, u64 size); - extern const struct dma_map_ops dma_virt_ops; #endif /* _LINUX_DMA_MAPPING_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 06c111544f61..002268262c9a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -547,4 +547,3 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, dev->dma_range_map = map; return 0; } -EXPORT_SYMBOL_GPL(dma_direct_set_offset); -- cgit From 78e1d22687ff1fd320abac12e8a607ea79782c48 Mon Sep 17 00:00:00 2001 From: Bhaumik Bhatt Date: Fri, 6 Nov 2020 09:44:47 -0800 Subject: bus: mhi: core: Expose mhi_get_exec_env() API for controllers The mhi_get_exec_env() API can be used by controller drivers to query the execution environment of the MHI device. Expose it so it can be used in scenarios where a controller's behavior depends on the execution environment. Signed-off-by: Bhaumik Bhatt Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam --- drivers/bus/mhi/core/internal.h | 1 - drivers/bus/mhi/core/main.c | 1 + include/linux/mhi.h | 6 ++++++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/bus/mhi/core/internal.h b/drivers/bus/mhi/core/internal.h index 78e4e84d6743..d8af8a702493 100644 --- a/drivers/bus/mhi/core/internal.h +++ b/drivers/bus/mhi/core/internal.h @@ -609,7 +609,6 @@ enum mhi_pm_state __must_check mhi_tryset_pm_state( struct mhi_controller *mhi_cntrl, enum mhi_pm_state state); const char *to_mhi_pm_state_str(enum mhi_pm_state state); -enum mhi_ee_type mhi_get_exec_env(struct mhi_controller *mhi_cntrl); int mhi_queue_state_transition(struct mhi_controller *mhi_cntrl, enum dev_st_transition state); void mhi_pm_st_worker(struct work_struct *work); diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c index 6ecaacaa8b54..f953e2a6d58a 100644 --- a/drivers/bus/mhi/core/main.c +++ b/drivers/bus/mhi/core/main.c @@ -123,6 +123,7 @@ enum mhi_ee_type mhi_get_exec_env(struct mhi_controller *mhi_cntrl) return (ret) ? MHI_EE_MAX : exec; } +EXPORT_SYMBOL_GPL(mhi_get_exec_env); enum mhi_state mhi_get_mhi_state(struct mhi_controller *mhi_cntrl) { diff --git a/include/linux/mhi.h b/include/linux/mhi.h index d4841e5a5f45..9225d5551d69 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -658,6 +658,12 @@ int mhi_download_rddm_img(struct mhi_controller *mhi_cntrl, bool in_panic); */ int mhi_force_rddm_mode(struct mhi_controller *mhi_cntrl); +/** + * mhi_get_exec_env - Get BHI execution environment of the device + * @mhi_cntrl: MHI controller + */ +enum mhi_ee_type mhi_get_exec_env(struct mhi_controller *mhi_cntrl); + /** * mhi_get_mhi_state - Get MHI state of the device * @mhi_cntrl: MHI controller -- cgit From 9e1660e5c396ee081907386d0b95b8e0804a6c86 Mon Sep 17 00:00:00 2001 From: Bhaumik Bhatt Date: Fri, 6 Nov 2020 09:44:49 -0800 Subject: bus: mhi: core: Rename RDDM download function to use proper words mhi_download_rddm_img() uses a shortened version of the word "image". Expand it and rename the function to mhi_download_rddm_image().
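Taken together, the two changes above let a controller driver key its ramdump handling off the device's execution environment. A sketch of one plausible use, assuming an mhi_cntrl pointer is in scope and with the error handling purely illustrative:

	/* Collect a ramdump only if the device has entered RDDM */
	if (mhi_get_exec_env(mhi_cntrl) == MHI_EE_RDDM) {
		ret = mhi_download_rddm_image(mhi_cntrl, false);
		if (ret)
			dev_err(mhi_cntrl->cntrl_dev,
				"RDDM download failed: %d\n", ret);
	}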
Signed-off-by: Bhaumik Bhatt Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam --- drivers/bus/mhi/core/boot.c | 4 ++-- include/linux/mhi.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/mhi/core/boot.c b/drivers/bus/mhi/core/boot.c index 6b6fd9668c3b..16244cc8fbe7 100644 --- a/drivers/bus/mhi/core/boot.c +++ b/drivers/bus/mhi/core/boot.c @@ -147,7 +147,7 @@ static int __mhi_download_rddm_in_panic(struct mhi_controller *mhi_cntrl) } /* Download RDDM image from device */ -int mhi_download_rddm_img(struct mhi_controller *mhi_cntrl, bool in_panic) +int mhi_download_rddm_image(struct mhi_controller *mhi_cntrl, bool in_panic) { void __iomem *base = mhi_cntrl->bhie; struct device *dev = &mhi_cntrl->mhi_dev->dev; @@ -169,7 +169,7 @@ int mhi_download_rddm_img(struct mhi_controller *mhi_cntrl, bool in_panic) return (rx_status == BHIE_RXVECSTATUS_STATUS_XFER_COMPL) ? 0 : -EIO; } -EXPORT_SYMBOL_GPL(mhi_download_rddm_img); +EXPORT_SYMBOL_GPL(mhi_download_rddm_image); static int mhi_fw_load_amss(struct mhi_controller *mhi_cntrl, const struct mhi_buf *mhi_buf) diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 9225d5551d69..52b3c60bf9bb 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -645,12 +645,12 @@ int mhi_pm_suspend(struct mhi_controller *mhi_cntrl); int mhi_pm_resume(struct mhi_controller *mhi_cntrl); /** - * mhi_download_rddm_img - Download ramdump image from device for - * debugging purpose. + * mhi_download_rddm_image - Download ramdump image from device for + * debugging purpose. * @mhi_cntrl: MHI controller * @in_panic: Download rddm image during kernel panic */ -int mhi_download_rddm_img(struct mhi_controller *mhi_cntrl, bool in_panic); +int mhi_download_rddm_image(struct mhi_controller *mhi_cntrl, bool in_panic); /** * mhi_force_rddm_mode - Force device into rddm mode -- cgit From 8f70397876872789b2a5deba804eb6216fb5deb7 Mon Sep 17 00:00:00 2001 From: Bhaumik Bhatt Date: Mon, 9 Nov 2020 12:47:21 -0800 Subject: bus: mhi: core: Move to using high priority workqueue MHI work is currently scheduled on the global/system workqueue and can encounter delays on a stressed system. To avoid those unforeseen delays which can hamper bootup or shutdown times, use a dedicated high priority workqueue instead of the global/system workqueue. 
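In isolation, the pattern being adopted here is the standard one for work that must not be starved: allocate a dedicated workqueue and queue onto it instead of the system workqueue. A sketch under assumed names (wq and my_work are illustrative):

	struct workqueue_struct *wq;

	/*
	 * Ordered: one item at a time, in submission order (state
	 * transitions must not be reordered); WQ_HIGHPRI: serviced by
	 * high-priority workers; WQ_MEM_RECLAIM: guaranteed forward
	 * progress even under memory pressure.
	 */
	wq = alloc_ordered_workqueue("my_hiprio_wq",
				     WQ_MEM_RECLAIM | WQ_HIGHPRI);
	if (!wq)
		return -ENOMEM;

	queue_work(wq, &my_work);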
Signed-off-by: Bhaumik Bhatt Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam --- drivers/bus/mhi/core/init.c | 9 +++++++++ drivers/bus/mhi/core/pm.c | 2 +- include/linux/mhi.h | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c index 8cefa359fccd..877e40c86801 100644 --- a/drivers/bus/mhi/core/init.c +++ b/drivers/bus/mhi/core/init.c @@ -880,6 +880,13 @@ int mhi_register_controller(struct mhi_controller *mhi_cntrl, INIT_WORK(&mhi_cntrl->st_worker, mhi_pm_st_worker); init_waitqueue_head(&mhi_cntrl->state_event); + mhi_cntrl->hiprio_wq = alloc_ordered_workqueue ("mhi_hiprio_wq", WQ_MEM_RECLAIM | WQ_HIGHPRI); + if (!mhi_cntrl->hiprio_wq) { + dev_err(mhi_cntrl->cntrl_dev, "Failed to allocate workqueue\n"); + goto error_alloc_cmd; + } + mhi_cmd = mhi_cntrl->mhi_cmd; for (i = 0; i < NR_OF_CMD_RINGS; i++, mhi_cmd++) spin_lock_init(&mhi_cmd->lock); @@ -969,6 +976,7 @@ error_alloc_dev: error_alloc_cmd: vfree(mhi_cntrl->mhi_chan); kfree(mhi_cntrl->mhi_event); + destroy_workqueue(mhi_cntrl->hiprio_wq); return ret; } @@ -982,6 +990,7 @@ void mhi_unregister_controller(struct mhi_controller *mhi_cntrl) mhi_destroy_debugfs(mhi_cntrl); + destroy_workqueue(mhi_cntrl->hiprio_wq); kfree(mhi_cntrl->mhi_cmd); kfree(mhi_cntrl->mhi_event); diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c index 3de7b1639ec6..805b6fa748f0 100644 --- a/drivers/bus/mhi/core/pm.c +++ b/drivers/bus/mhi/core/pm.c @@ -597,7 +597,7 @@ int mhi_queue_state_transition(struct mhi_controller *mhi_cntrl, list_add_tail(&item->node, &mhi_cntrl->transition_list); spin_unlock_irqrestore(&mhi_cntrl->transition_lock, flags); - schedule_work(&mhi_cntrl->st_worker); + queue_work(mhi_cntrl->hiprio_wq, &mhi_cntrl->st_worker); return 0; } diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 52b3c60bf9bb..1ed5f2aa224b 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -337,6 +337,7 @@ struct mhi_controller_config { * @wlock: Lock for protecting device wakeup * @mhi_link_info: Device bandwidth info * @st_worker: State transition worker + * @hiprio_wq: High priority workqueue for MHI work such as state transitions * @state_event: State change event * @status_cb: CB function to notify power states of the device (required) * @wake_get: CB function to assert device wake (optional) @@ -419,6 +420,7 @@ struct mhi_controller { spinlock_t wlock; struct mhi_link_info mhi_link_info; struct work_struct st_worker; + struct workqueue_struct *hiprio_wq; wait_queue_head_t state_event; void (*status_cb)(struct mhi_controller *mhi_cntrl, -- cgit From 5c62634fc65101d350cbd47722fb76f02693059d Mon Sep 17 00:00:00 2001 From: Hui Su Date: Wed, 18 Nov 2020 00:17:50 +0800 Subject: namespace: make timens_on_fork() return nothing timens_on_fork() always returns 0, so there is no need to check its return value in copy_namespaces(). Make timens_on_fork() return void and drop the return value check in copy_namespaces().
Signed-off-by: Hui Su Link: https://lore.kernel.org/r/20201117161750.GA45121@rlk Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 6 +++--- kernel/nsproxy.c | 7 +------ kernel/time/namespace.c | 6 ++---- 3 files changed, 6 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 68770ac9ba89..30312166e70a 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -45,7 +45,7 @@ struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct kref *kref); -int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); +void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); struct vdso_data *arch_get_vdso_data(void *vvar_page); static inline void put_time_ns(struct time_namespace *ns) @@ -136,10 +136,10 @@ struct time_namespace *copy_time_ns(unsigned long flags, return old_ns; } -static inline int timens_on_fork(struct nsproxy *nsproxy, +static inline void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { - return 0; + return; } static inline void timens_add_monotonic(struct timespec64 *ts) { } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 12dd41b39a7f..e2e6c5dc433f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -153,7 +153,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; - int ret; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | @@ -180,11 +179,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); - ret = timens_on_fork(new_ns, tsk); - if (ret) { - free_nsproxy(new_ns); - return ret; - } + timens_on_fork(new_ns, tsk); tsk->nsproxy = new_ns; return 0; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index afc65e6be33e..e0f9509b17c3 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -308,22 +308,20 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) return 0; } -int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) +void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) - return 0; + return; get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; timens_commit(tsk, ns); - - return 0; } static struct user_namespace *timens_owner(struct ns_common *ns) -- cgit From 13d40ff85da8a85935e3fd47061dee71a02af0d4 Mon Sep 17 00:00:00 2001 From: Utkarsh Patel Date: Fri, 13 Nov 2020 12:24:56 -0800 Subject: usb: typec: Correct the bit values for the Thunderbolt rounded/non-rounded cable support Rounded and non-rounded Thunderbolt cables are represented by two bits as per USB Type-C Connector specification v2.0 section F.2.6. Corrected that in the Thunderbolt 3 cable discover mode VDO. Signed-off-by: Utkarsh Patel Reviewed-by: Heikki Krogerus -- Changes in v2: - Removed the fixes tag as there is no functional implication. 
-- Link: https://lore.kernel.org/r/20201113202503.6559-2-utkarsh.h.patel@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_tbt.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h index 47c2d501ddce..aad648d14bb3 100644 --- a/include/linux/usb/typec_tbt.h +++ b/include/linux/usb/typec_tbt.h @@ -40,11 +40,16 @@ struct typec_thunderbolt_data { #define TBT_CABLE_USB3_PASSIVE 2 #define TBT_CABLE_10_AND_20GBPS 3 #define TBT_CABLE_ROUNDED BIT(19) +#define TBT_CABLE_ROUNDED_SUPPORT(_vdo_) \ + (((_vdo_) & GENMASK(20, 19)) >> 19) +#define TBT_GEN3_NON_ROUNDED 0 +#define TBT_GEN3_GEN4_ROUNDED_NON_ROUNDED 1 #define TBT_CABLE_OPTICAL BIT(21) #define TBT_CABLE_RETIMER BIT(22) #define TBT_CABLE_LINK_TRAINING BIT(23) #define TBT_SET_CABLE_SPEED(_s_) (((_s_) & GENMASK(2, 0)) << 16) +#define TBT_SET_CABLE_ROUNDED(_g_) (((_g_) & GENMASK(1, 0)) << 19) /* TBT3 Device Enter Mode VDO bits */ #define TBT_ENTER_MODE_CABLE_SPEED(s) TBT_SET_CABLE_SPEED(s) -- cgit From 523a97aa3b756c1e0efe33ae48d450f8b0052073 Mon Sep 17 00:00:00 2001 From: Utkarsh Patel Date: Fri, 13 Nov 2020 12:24:59 -0800 Subject: usb: typec: Remove one bit support for the Thunderbolt rounded/non-rounded cable Two bits support for the Thunderbolt rounded/non-rounded cable has been added to the header file. Hence, removing unused TBT_CABLE_ROUNDED definition from the header file. Signed-off-by: Utkarsh Patel Reviewed-by: Heikki Krogerus -- changes in v2: - Removed the fixes tag as there is no functional implication. -- Link: https://lore.kernel.org/r/20201113202503.6559-5-utkarsh.h.patel@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_tbt.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h index aad648d14bb3..63dd44b72e0c 100644 --- a/include/linux/usb/typec_tbt.h +++ b/include/linux/usb/typec_tbt.h @@ -39,7 +39,6 @@ struct typec_thunderbolt_data { #define TBT_CABLE_USB3_GEN1 1 #define TBT_CABLE_USB3_PASSIVE 2 #define TBT_CABLE_10_AND_20GBPS 3 -#define TBT_CABLE_ROUNDED BIT(19) #define TBT_CABLE_ROUNDED_SUPPORT(_vdo_) \ (((_vdo_) & GENMASK(20, 19)) >> 19) #define TBT_GEN3_NON_ROUNDED 0 -- cgit From 8a5ca78f603977d6b9b2d7dcb3b1b6ef601d3cc7 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Mon, 16 Nov 2020 12:11:38 -0800 Subject: usb: pd: Add captive Type C cable type The USB Power Delivery Specification R3.0 adds a captive cable type to the "USB Type-C plug to USB Type-C/Captive" field (Bits 19-18, Passive/Active Cable VDO, Table 6-38 & 6-39). Add the corresponding definition to the Cable VDO header. Also add a helper macro to get the Type C cable connector type, when provided the cable VDO. 
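A sketch of how the new helper and constant might be used when parsing a cable's Discover Identity response (the vdo variable is assumed to hold the cable VDO; the handling in each branch is illustrative):

	switch (VDO_TYPEC_CABLE_TYPE(vdo)) {
	case CABLE_CAPTIVE:
		/* Plug is permanently attached; no detachable plug to query */
		break;
	case CABLE_CTYPE:
		/* Detachable cable with a USB Type-C plug on the far end */
		break;
	}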
Cc: Heikki Krogerus Signed-off-by: Prashant Malani Reviewed-by: Benson Leung Reviewed-by: Greg Kroah-Hartman Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20201116201150.2919178-2-pmalani@chromium.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd_vdo.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/pd_vdo.h b/include/linux/usb/pd_vdo.h index 68bdc4e2f5a9..8c5cb5830754 100644 --- a/include/linux/usb/pd_vdo.h +++ b/include/linux/usb/pd_vdo.h @@ -177,7 +177,7 @@ * <31:28> :: Cable HW version * <27:24> :: Cable FW version * <23:20> :: Reserved, Shall be set to zero - * <19:18> :: type-C to Type-A/B/C (00b == A, 01 == B, 10 == C) + * <19:18> :: type-C to Type-A/B/C/Captive (00b == A, 01 == B, 10 == C, 11 == Captive) * <17> :: Type-C to Plug/Receptacle (0b == plug, 1b == receptacle) * <16:13> :: cable latency (0001 == <10ns(~1m length)) * <12:11> :: cable termination type (11b == both ends active VCONN req) @@ -193,6 +193,7 @@ #define CABLE_ATYPE 0 #define CABLE_BTYPE 1 #define CABLE_CTYPE 2 +#define CABLE_CAPTIVE 3 #define CABLE_PLUG 0 #define CABLE_RECEPTACLE 1 #define CABLE_CURR_1A5 0 @@ -208,6 +209,7 @@ | (tx1d) << 10 | (tx2d) << 9 | (rx1d) << 8 | (rx2d) << 7 \ | ((cur) & 0x3) << 5 | (vps) << 4 | (sopp) << 3 \ | ((usbss) & 0x7)) +#define VDO_TYPEC_CABLE_TYPE(vdo) (((vdo) >> 18) & 0x3) /* * AMA VDO -- cgit From a0ccdc4a77a1b36b682ae60361879eca0a0f88d6 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Mon, 16 Nov 2020 12:11:40 -0800 Subject: usb: typec: Add number of altmodes partner attr Add a user-visible attribute for the number of alternate modes available in a partner. This allows userspace to determine whether there are any remaining alternate modes left to be registered by the kernel driver. It can begin executing any policy state machine after all available alternate modes have been registered with the connector class framework. This value is set to "-1" initially, signifying that a valid number of alternate modes haven't been set for the partner. Also add a sysfs file which exposes this attribute. The file remains hidden as long as the attribute value is -1. Cc: Benson Leung Cc: Heikki Krogerus Signed-off-by: Prashant Malani Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20201116201150.2919178-3-pmalani@chromium.org Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-class-typec | 8 ++++ drivers/usb/typec/class.c | 66 ++++++++++++++++++++++++++++- include/linux/usb/typec.h | 1 + 3 files changed, 74 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-typec b/Documentation/ABI/testing/sysfs-class-typec index b7794e02ad20..a195247d83ee 100644 --- a/Documentation/ABI/testing/sysfs-class-typec +++ b/Documentation/ABI/testing/sysfs-class-typec @@ -139,6 +139,14 @@ Description: Shows if the partner supports USB Power Delivery communication: Valid values: yes, no +What: /sys/class/typec/-partner/number_of_alternate_modes +Date: November 2020 +Contact: Prashant Malani +Description: + Shows the number of alternate modes which are advertised by the partner + during Power Delivery discovery. This file remains hidden until a value + greater than or equal to 0 is set by Type C port driver. 
+ What: /sys/class/typec/-partner>/identity/ Date: April 2017 Contact: Heikki Krogerus diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index 35eec707cb51..c7412ddbd311 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -33,6 +33,7 @@ struct typec_partner { struct usb_pd_identity *identity; enum typec_accessory accessory; struct ida mode_ids; + int num_altmodes; }; struct typec_port { @@ -532,12 +533,43 @@ static ssize_t supports_usb_power_delivery_show(struct device *dev, } static DEVICE_ATTR_RO(supports_usb_power_delivery); +static ssize_t number_of_alternate_modes_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct typec_partner *p = to_typec_partner(dev); + + return sysfs_emit(buf, "%d\n", p->num_altmodes); +} +static DEVICE_ATTR_RO(number_of_alternate_modes); + static struct attribute *typec_partner_attrs[] = { &dev_attr_accessory_mode.attr, &dev_attr_supports_usb_power_delivery.attr, + &dev_attr_number_of_alternate_modes.attr, + NULL +}; + +static umode_t typec_partner_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n) +{ + struct typec_partner *partner = to_typec_partner(kobj_to_dev(kobj)); + + if (attr == &dev_attr_number_of_alternate_modes.attr) { + if (partner->num_altmodes < 0) + return 0; + } + + return attr->mode; +} + +static struct attribute_group typec_partner_group = { + .is_visible = typec_partner_attr_is_visible, + .attrs = typec_partner_attrs +}; + +static const struct attribute_group *typec_partner_groups[] = { + &typec_partner_group, NULL }; -ATTRIBUTE_GROUPS(typec_partner); static void typec_partner_release(struct device *dev) { @@ -570,6 +602,37 @@ int typec_partner_set_identity(struct typec_partner *partner) } EXPORT_SYMBOL_GPL(typec_partner_set_identity); +/** + * typec_partner_set_num_altmodes - Set the number of available partner altmodes + * @partner: The partner to be updated. + * @num_alt_modes: The number of altmodes we want to specify as available. + * + * This routine is used to report the number of alternate modes supported by the + * partner. This value is *not* enforced in alternate mode registration routines. + * + * @partner.num_altmodes is set to -1 on partner registration, denoting that + * a valid value has not been set for it yet. + * + * Returns 0 on success or negative error number on failure. 
+ */ +int typec_partner_set_num_altmodes(struct typec_partner *partner, int num_altmodes) +{ + int ret; + + if (num_altmodes < 0) + return -EINVAL; + + partner->num_altmodes = num_altmodes; + ret = sysfs_update_group(&partner->dev.kobj, &typec_partner_group); + if (ret < 0) + return ret; + + sysfs_notify(&partner->dev.kobj, NULL, "number_of_alternate_modes"); + + return 0; +} +EXPORT_SYMBOL_GPL(typec_partner_set_num_altmodes); + /** * typec_partner_register_altmode - Register USB Type-C Partner Alternate Mode * @partner: USB Type-C Partner that supports the alternate mode @@ -612,6 +675,7 @@ struct typec_partner *typec_register_partner(struct typec_port *port, ida_init(&partner->mode_ids); partner->usb_pd = desc->usb_pd; partner->accessory = desc->accessory; + partner->num_altmodes = -1; if (desc->identity) { /* diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index 6be558045942..bc6b1a71cb8a 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -126,6 +126,7 @@ struct typec_altmode_desc { enum typec_port_data roles; }; +int typec_partner_set_num_altmodes(struct typec_partner *partner, int num_altmodes); struct typec_altmode *typec_partner_register_altmode(struct typec_partner *partner, const struct typec_altmode_desc *desc); -- cgit From a30a00e37ceb094f949e4d96c2c586e6503b5d1d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 28 Oct 2020 23:31:32 -0700 Subject: usb: typec: tcpm: frs sourcing vbus callback During FRS hardware autonomously starts to source vbus. Provide callback to perform chip specific operations. Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20201029063138.1429760-5-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 9 +++++++++ include/linux/usb/tcpm.h | 4 ++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index a091a63758f4..c196bd4b8a37 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -4091,7 +4091,16 @@ static void _tcpm_pd_vbus_on(struct tcpm_port *port) case SRC_TRY_DEBOUNCE: /* Do nothing, waiting for sink detection */ break; + case FR_SWAP_SEND: + case FR_SWAP_SEND_TIMEOUT: + case FR_SWAP_SNK_SRC_TRANSITION_TO_OFF: + case FR_SWAP_SNK_SRC_SOURCE_VBUS_APPLIED: + if (port->tcpc->frs_sourcing_vbus) + port->tcpc->frs_sourcing_vbus(port->tcpc); + break; case FR_SWAP_SNK_SRC_NEW_SINK_READY: + if (port->tcpc->frs_sourcing_vbus) + port->tcpc->frs_sourcing_vbus(port->tcpc); tcpm_set_state(port, FR_SWAP_SNK_SRC_SOURCE_VBUS_APPLIED, 0); break; diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index 09762d26fa0c..7303f518ba49 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -83,6 +83,9 @@ enum tcpm_transmit_type { * Optional; Called to enable/disable PD 3.0 fast role swap. * Enabling frs is accessory dependent as not all PD3.0 * accessories support fast role swap. + * @frs_sourcing_vbus: + * Optional; Called to notify that vbus is now being sourced. + * Low level drivers can perform chip specific operations, if any. 
*/ struct tcpc_dev { struct fwnode_handle *fwnode; @@ -109,6 +112,7 @@ struct tcpc_dev { const struct pd_message *msg); int (*set_bist_data)(struct tcpc_dev *dev, bool on); int (*enable_frs)(struct tcpc_dev *dev, bool enable); + void (*frs_sourcing_vbus)(struct tcpc_dev *dev); }; struct tcpm_port; -- cgit From f321a02caebdd0c56e167610cda2fa148cd96e8b Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 28 Oct 2020 23:31:35 -0700 Subject: usb: typec: tcpm: Implement enabling Auto Discharge disconnect support The TCPCI spec allows the TCPC hardware to autonomously discharge the vbus capacitance upon disconnect. The expectation is that the TCPM enables AutoDischargeDisconnect while entering the SNK/SRC_ATTACHED states. Hardware then autonomously discharges vbus when vbus falls below a certain threshold, i.e. VBUS_SINK_DISCONNECT_THRESHOLD. Apart from enabling the vbus discharge circuit, AutoDischargeDisconnect is also used as a flag to move TCPCI based TCPC implementations into the Attached.Snk/Attached.Src state, as mentioned in Figure 4-15 (TCPC State Diagram before a Connection) of the USB Type-C Port Controller Interface Specification. In such TCPC implementations, setting AutoDischargeDisconnect also prevents the TCPC from entering the "Connection_Invalid" state. Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20201029063138.1429760-8-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 60 ++++++++++++++++++++++++++++++++++++++++--- include/linux/usb/tcpm.h | 15 +++++++++++ 2 files changed, 71 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index c196bd4b8a37..4aac0efdb720 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -1706,6 +1706,24 @@ static void tcpm_handle_alert(struct tcpm_port *port, const __le32 *payload, } } +static int tcpm_set_auto_vbus_discharge_threshold(struct tcpm_port *port, + enum typec_pwr_opmode mode, bool pps_active, + u32 requested_vbus_voltage) +{ + int ret; + + if (!port->tcpc->set_auto_vbus_discharge_threshold) + return 0; + + ret = port->tcpc->set_auto_vbus_discharge_threshold(port->tcpc, mode, pps_active, + requested_vbus_voltage); + tcpm_log_force(port, + "set_auto_vbus_discharge_threshold mode:%d pps_active:%c vbus:%u ret:%d", + mode, pps_active ?
'y' : 'n', requested_vbus_voltage, ret); + + return ret; +} + static void tcpm_pd_data_request(struct tcpm_port *port, const struct pd_message *msg) { @@ -1876,6 +1894,10 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, port->current_limit, port->supply_voltage); port->explicit_contract = true; + tcpm_set_auto_vbus_discharge_threshold(port, + TYPEC_PWR_MODE_PD, + port->pps_data.active, + port->supply_voltage); tcpm_set_state(port, SNK_READY, 0); } else { /* @@ -2790,8 +2812,12 @@ static int tcpm_src_attach(struct tcpm_port *port) if (ret < 0) return ret; - ret = tcpm_set_roles(port, true, TYPEC_SOURCE, - tcpm_data_role_for_source(port)); + if (port->tcpc->enable_auto_vbus_discharge) { + ret = port->tcpc->enable_auto_vbus_discharge(port->tcpc, true); + tcpm_log_force(port, "enable vbus discharge ret:%d", ret); + } + + ret = tcpm_set_roles(port, true, TYPEC_SOURCE, tcpm_data_role_for_source(port)); if (ret < 0) return ret; @@ -2858,6 +2884,12 @@ static void tcpm_unregister_altmodes(struct tcpm_port *port) static void tcpm_reset_port(struct tcpm_port *port) { + int ret; + + if (port->tcpc->enable_auto_vbus_discharge) { + ret = port->tcpc->enable_auto_vbus_discharge(port->tcpc, false); + tcpm_log_force(port, "Disable vbus discharge ret:%d", ret); + } tcpm_unregister_altmodes(port); tcpm_typec_disconnect(port); port->attached = false; @@ -2922,8 +2954,13 @@ static int tcpm_snk_attach(struct tcpm_port *port) if (ret < 0) return ret; - ret = tcpm_set_roles(port, true, TYPEC_SINK, - tcpm_data_role_for_sink(port)); + if (port->tcpc->enable_auto_vbus_discharge) { + tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_USB, false, VSAFE5V); + ret = port->tcpc->enable_auto_vbus_discharge(port->tcpc, true); + tcpm_log_force(port, "enable vbus discharge ret:%d", ret); + } + + ret = tcpm_set_roles(port, true, TYPEC_SINK, tcpm_data_role_for_sink(port)); if (ret < 0) return ret; @@ -3507,6 +3544,8 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_state(port, SRC_UNATTACHED, PD_T_PS_SOURCE_ON); break; case SNK_HARD_RESET_SINK_OFF: + /* Do not discharge/disconnect during hard reseet */ + tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_USB, false, 0); memset(&port->pps_data, 0, sizeof(port->pps_data)); tcpm_set_vconn(port, false); if (port->pd_capable) @@ -3549,6 +3588,7 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_charge(port, true); } tcpm_set_attached_state(port, true); + tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_USB, false, VSAFE5V); tcpm_set_state(port, SNK_STARTUP, 0); break; @@ -3650,6 +3690,10 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_state(port, PR_SWAP_SNK_SRC_SINK_OFF, 0); break; case PR_SWAP_SRC_SNK_TRANSITION_OFF: + /* + * Prevent vbus discharge circuit from turning on during PR_SWAP + * as this is not a disconnect. + */ tcpm_set_vbus(port, false); port->explicit_contract = false; /* allow time for Vbus discharge, must be < tSrcSwapStdby */ @@ -3678,9 +3722,17 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_state_cond(port, SNK_UNATTACHED, PD_T_PS_SOURCE_ON); break; case PR_SWAP_SRC_SNK_SINK_ON: + /* Set the vbus disconnect threshold for implicit contract */ + tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_USB, false, VSAFE5V); tcpm_set_state(port, SNK_STARTUP, 0); break; case PR_SWAP_SNK_SRC_SINK_OFF: + /* + * Prevent vbus discharge circuit from turning on during PR_SWAP + * as this is not a disconnect. 
+ */ + tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_USB, + port->pps_data.active, 0); tcpm_set_charge(port, false); tcpm_set_state(port, hard_reset_state(port), PD_T_PS_SOURCE_OFF); diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index 7303f518ba49..e68aaa12886f 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -86,6 +86,18 @@ enum tcpm_transmit_type { * @frs_sourcing_vbus: * Optional; Called to notify that vbus is now being sourced. * Low level drivers can perform chip specific operations, if any. + * @enable_auto_vbus_discharge: + * Optional; TCPCI spec based TCPC implementations can optionally + * support hardware to autonomously dischrge vbus upon disconnecting + * as sink or source. TCPM signals TCPC to enable the mechanism upon + * entering connected state and signals disabling upon disconnect. + * @set_auto_vbus_discharge_threshold: + * Mandatory when enable_auto_vbus_discharge is implemented. TCPM + * calls this function to allow lower levels drivers to program the + * vbus threshold voltage below which the vbus discharge circuit + * will be turned on. requested_vbus_voltage is set to 0 when vbus + * is going to disappear knowingly i.e. during PR_SWAP and + * HARD_RESET etc. */ struct tcpc_dev { struct fwnode_handle *fwnode; @@ -113,6 +125,9 @@ struct tcpc_dev { int (*set_bist_data)(struct tcpc_dev *dev, bool on); int (*enable_frs)(struct tcpc_dev *dev, bool enable); void (*frs_sourcing_vbus)(struct tcpc_dev *dev); + int (*enable_auto_vbus_discharge)(struct tcpc_dev *dev, bool enable); + int (*set_auto_vbus_discharge_threshold)(struct tcpc_dev *dev, enum typec_pwr_opmode mode, + bool pps_active, u32 requested_vbus_voltage); }; struct tcpm_port; -- cgit From e1e52361c61afdf81d81cfbbfa3ce08971e60f50 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Mon, 16 Nov 2020 12:11:42 -0800 Subject: usb: typec: Add plug num_altmodes sysfs attr Add a field to the typec_plug struct to record the number of available altmodes as well as the corresponding sysfs attribute to expose this to userspace. This allows userspace to determine whether there are any remaining alternate modes left to be registered by the kernel driver. It can begin executing any policy state machine after all available alternate modes have been registered with the connector class framework. This value is set to "-1" initially, signifying that a valid number of alternate modes haven't been set for the plug. The sysfs file remains hidden as long as the attribute value is -1. We re-use the partner attribute for number_of_alternate_modes since the usage and name is similar, and update the corresponding *_show() command to support both partner and plugs. 
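A sketch of how a port driver might report the count once alternate mode discovery for a cable plug completes (the variables here are assumed to be in scope and are illustrative):

	/* After parsing all Discover SVIDs/Modes responses for SOP' */
	ret = typec_plug_set_num_altmodes(plug, num_plug_altmodes);
	if (ret)
		dev_warn(dev, "failed to set plug altmode count: %d\n", ret);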
Signed-off-by: Prashant Malani Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20201116201150.2919178-4-pmalani@chromium.org Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-class-typec | 9 ++++ drivers/usb/typec/class.c | 77 ++++++++++++++++++++++++++++- include/linux/usb/typec.h | 1 + 3 files changed, 85 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-typec b/Documentation/ABI/testing/sysfs-class-typec index a195247d83ee..4eccb343fc7b 100644 --- a/Documentation/ABI/testing/sysfs-class-typec +++ b/Documentation/ABI/testing/sysfs-class-typec @@ -210,6 +210,15 @@ Description: - type-c - captive +What: /sys/class/typec/-/number_of_alternate_modes +Date: November 2020 +Contact: Prashant Malani +Description: + Shows the number of alternate modes which are advertised by the plug + associated with a particular cable during Power Delivery discovery. + This file remains hidden until a value greater than or equal to 0 + is set by Type C port driver. + What: /sys/class/typec/-cable/identity/ Date: April 2017 Contact: Heikki Krogerus diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index c7412ddbd311..e68798599ca8 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -18,6 +18,7 @@ struct typec_plug { struct device dev; enum typec_plug_index index; struct ida mode_ids; + int num_altmodes; }; struct typec_cable { @@ -536,9 +537,21 @@ static DEVICE_ATTR_RO(supports_usb_power_delivery); static ssize_t number_of_alternate_modes_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct typec_partner *p = to_typec_partner(dev); + struct typec_partner *partner; + struct typec_plug *plug; + int num_altmodes; + + if (is_typec_partner(dev)) { + partner = to_typec_partner(dev); + num_altmodes = partner->num_altmodes; + } else if (is_typec_plug(dev)) { + plug = to_typec_plug(dev); + num_altmodes = plug->num_altmodes; + } else { + return 0; + } - return sysfs_emit(buf, "%d\n", p->num_altmodes); + return sysfs_emit(buf, "%d\n", num_altmodes); } static DEVICE_ATTR_RO(number_of_alternate_modes); @@ -726,11 +739,70 @@ static void typec_plug_release(struct device *dev) kfree(plug); } +static struct attribute *typec_plug_attrs[] = { + &dev_attr_number_of_alternate_modes.attr, + NULL +}; + +static umode_t typec_plug_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n) +{ + struct typec_plug *plug = to_typec_plug(kobj_to_dev(kobj)); + + if (attr == &dev_attr_number_of_alternate_modes.attr) { + if (plug->num_altmodes < 0) + return 0; + } + + return attr->mode; +} + +static struct attribute_group typec_plug_group = { + .is_visible = typec_plug_attr_is_visible, + .attrs = typec_plug_attrs +}; + +static const struct attribute_group *typec_plug_groups[] = { + &typec_plug_group, + NULL +}; + static const struct device_type typec_plug_dev_type = { .name = "typec_plug", + .groups = typec_plug_groups, .release = typec_plug_release, }; +/** + * typec_plug_set_num_altmodes - Set the number of available plug altmodes + * @plug: The plug to be updated. + * @num_altmodes: The number of altmodes we want to specify as available. + * + * This routine is used to report the number of alternate modes supported by the + * plug. This value is *not* enforced in alternate mode registration routines. + * + * @plug.num_altmodes is set to -1 on plug registration, denoting that + * a valid value has not been set for it yet. 
+ * + * Returns 0 on success or negative error number on failure. + */ +int typec_plug_set_num_altmodes(struct typec_plug *plug, int num_altmodes) +{ + int ret; + + if (num_altmodes < 0) + return -EINVAL; + + plug->num_altmodes = num_altmodes; + ret = sysfs_update_group(&plug->dev.kobj, &typec_plug_group); + if (ret < 0) + return ret; + + sysfs_notify(&plug->dev.kobj, NULL, "number_of_alternate_modes"); + + return 0; +} +EXPORT_SYMBOL_GPL(typec_plug_set_num_altmodes); + /** * typec_plug_register_altmode - Register USB Type-C Cable Plug Alternate Mode * @plug: USB Type-C Cable Plug that supports the alternate mode @@ -776,6 +848,7 @@ struct typec_plug *typec_register_plug(struct typec_cable *cable, sprintf(name, "plug%d", desc->index); ida_init(&plug->mode_ids); + plug->num_altmodes = -1; plug->index = desc->index; plug->dev.class = typec_class; plug->dev.parent = &cable->dev; diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index bc6b1a71cb8a..54475323f83b 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -130,6 +130,7 @@ int typec_partner_set_num_altmodes(struct typec_partner *partner, int num_altmod struct typec_altmode *typec_partner_register_altmode(struct typec_partner *partner, const struct typec_altmode_desc *desc); +int typec_plug_set_num_altmodes(struct typec_plug *plug, int num_altmodes); struct typec_altmode *typec_plug_register_altmode(struct typec_plug *plug, const struct typec_altmode_desc *desc); -- cgit From 4d213e76a359e540ca786ee937da7f35faa8e5f8 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 10 Nov 2020 15:19:08 +0800 Subject: iommu/vt-d: Avoid panic if iommu init fails in tboot system The "intel_iommu=off" command line option is used to disable the IOMMU, but the IOMMU is force enabled in a tboot system for security reasons. However, for better performance on high speed network devices, a new option "intel_iommu=tboot_noforce" was introduced to disable the force on. By default the kernel should panic if IOMMU init fails in tboot for security reasons, but that is unnecessary if we use "intel_iommu=tboot_noforce,off". Fix the code setting force_on and move intel_iommu_tboot_noforce from the tboot code to the intel iommu code.
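For reference, the command line combination this change makes usable, i.e. opting out of both the tboot forcing and the IOMMU itself, is:

	intel_iommu=tboot_noforce,off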
Fixes: 7304e8f28bb2 ("iommu/vt-d: Correctly disable Intel IOMMU force on") Signed-off-by: Zhenzhong Duan Tested-by: Lukasz Hawrylko Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20201110071908.3133-1-zhenzhong.duan@gmail.com Signed-off-by: Will Deacon --- arch/x86/kernel/tboot.c | 3 --- drivers/iommu/intel/iommu.c | 5 +++-- include/linux/intel-iommu.h | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 992fb1415c0f..420be871d9d4 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -514,9 +514,6 @@ int tboot_force_iommu(void) if (!tboot_enabled()) return 0; - if (intel_iommu_tboot_noforce) - return 1; - if (no_iommu || swiotlb || dmar_disabled) pr_warn("Forcing Intel-IOMMU to enabled\n"); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 1b1ca63e6bbe..4d9b298002f0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -179,7 +179,7 @@ static int rwbf_quirk; * (used when kernel is launched w/ TXT) */ static int force_on = 0; -int intel_iommu_tboot_noforce; +static int intel_iommu_tboot_noforce; static int no_platform_optin; #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) @@ -4885,7 +4885,8 @@ int __init intel_iommu_init(void) * Intel IOMMU is required for a TXT/tboot launch or platform * opt in, so enforce that. */ - force_on = tboot_force_iommu() || platform_optin_force_iommu(); + force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || + platform_optin_force_iommu(); if (iommu_init_mempool()) { if (force_on) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index fbf5b3e7707e..d956987ed032 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -798,7 +798,6 @@ extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; extern int intel_iommu_enabled; -extern int intel_iommu_tboot_noforce; extern int intel_iommu_gfx_mapped; #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) -- cgit From d04a53b1c48766665806eb75b73137734abdaa95 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Tue, 17 Nov 2020 22:38:26 +0100 Subject: ptp: document struct ptp_clock_request members It's arguable most people interested in configuring a PPS signal want it as external output, not as kernel input. PTP_CLK_REQ_PPS is for input though. Add documentation to nudge readers into the correct direction. Signed-off-by: Ahmad Fatoum Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20201117213826.18235-1-a.fatoum@pengutronix.de Signed-off-by: Jakub Kicinski --- include/linux/ptp_clock_kernel.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index d3e8ba5c7125..0d47fd33b228 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -12,6 +12,19 @@ #include #include +/** + * struct ptp_clock_request - request PTP clock event + * + * @type: The type of the request. + * EXTTS: Configure external trigger timestamping + * PEROUT: Configure periodic output signal (e.g. PPS) + * PPS: trigger internal PPS event for input + * into kernel PPS subsystem + * @extts: describes configuration for external trigger timestamping. + * This is only valid when event == PTP_CLK_REQ_EXTTS. + * @perout: describes configuration for periodic output. 
+ * This is only valid when event == PTP_CLK_REQ_PEROUT. + */ struct ptp_clock_request { enum { -- cgit From d055126180564a57fe533728a4e93d0cb53d49b3 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Tue, 17 Nov 2020 18:45:49 +0000 Subject: bpf: Add bpf_ktime_get_coarse_ns helper The helper uses the CLOCK_MONOTONIC_COARSE time source, which is less accurate but more performant. We have a BPF CGROUP_SKB firewall that supports event logging through bpf_perf_event_output(). Each event has a timestamp and currently we use bpf_ktime_get_ns() for it. Using bpf_ktime_get_coarse_ns() instead saves ~15-20 ns of the time required for event logging. bpf_ktime_get_ns(): EgressLogByRemoteEndpoint 113.82ns 8.79M bpf_ktime_get_coarse_ns(): EgressLogByRemoteEndpoint 95.40ns 10.48M Signed-off-by: Dmitrii Banshchikov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201117184549.257280-1-me@ubique.spb.ru --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 13 +++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 11 +++++++++++ 6 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 581b2a2e78eb..e1bcb6d7345c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1842,6 +1842,7 @@ extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a52299b80b9d..3ca6146f001a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3797,6 +3797,16 @@ union bpf_attr { * is cleared if the flag is not specified. * Return * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3959,6 +3969,7 @@ union bpf_attr { FN(task_storage_delete), \ FN(get_current_task_btf), \ FN(bprm_opts_set), \ + FN(ktime_get_coarse_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 55454d2278b1..ff55cbcfbab4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2211,6 +2211,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 25520f5eeaf6..2c395deae279 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -167,6 +167,17 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_coarse_ns) +{ + return ktime_get_coarse_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { + .func = bpf_ktime_get_coarse_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -685,6 +696,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 02986c7b90eb..d255bc9b2bfa 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1280,6 +1280,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a52299b80b9d..3ca6146f001a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3797,6 +3797,16 @@ union bpf_attr { * is cleared if the flag is not specified. * Return * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3959,6 +3969,7 @@ union bpf_attr { FN(task_storage_delete), \ FN(get_current_task_btf), \ FN(bprm_opts_set), \ + FN(ktime_get_coarse_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit From f2bcc2fa275b913093ff5b403be5d11342fdb055 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 16 Nov 2020 17:21:15 +0100 Subject: atm: nicstar: Replace in_interrupt() usage push_scqe() uses in_interrupt() to figure out if it is allowed to sleep. 
The usage of in_interrupt() in drivers is being phased out, and Linus clearly requested that code which changes behaviour depending on context should either be separated or the context be conveyed in an argument passed by the caller, which usually knows the context. Aside from that, in_interrupt() is not correct as it does not catch preempt-disabled regions, which cannot sleep either. ns_send() (the only caller of push_scqe()) has the following callers: - vcc_sendmsg(), used as proto_ops::sendmsg, is expected to be invoked in preemptible context. -> vcc->dev->ops->send() (ns_send()) - atm_vcc::send via atmdev_ops::send, either directly (pointer copied by atm_init_aal34() or atm_init_aal5()) or via atm_send_aal0(). This is invoked by drivers (like br2684, clip, pppoatm, ...) which are called from net_device_ops::ndo_start_xmit with BH disabled. Add atmdev_ops::send_bh, which is used by callers from BH context (atm_send_aal*()); if this callback is missing then ::send is used instead. Implement this callback in nicstar and use it to replace in_interrupt(). Cc: Chas Williams <3chas3@gmail.com> Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jakub Kicinski --- drivers/atm/nicstar.c | 24 ++++++++++++++++++------ include/linux/atmdev.h | 1 + net/atm/raw.c | 12 ++++++++++-- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index 7af74fb450a0..a602e6b96ff7 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -130,8 +130,9 @@ static int ns_open(struct atm_vcc *vcc); static void ns_close(struct atm_vcc *vcc); static void fill_tst(ns_dev * card, int n, vc_map * vc); static int ns_send(struct atm_vcc *vcc, struct sk_buff *skb); +static int ns_send_bh(struct atm_vcc *vcc, struct sk_buff *skb); static int push_scqe(ns_dev * card, vc_map * vc, scq_info * scq, ns_scqe * tbd, - struct sk_buff *skb); + struct sk_buff *skb, bool may_sleep); static void process_tsq(ns_dev * card); static void drain_scq(ns_dev * card, scq_info * scq, int pos); static void process_rsq(ns_dev * card); @@ -160,6 +161,7 @@ static const struct atmdev_ops atm_ops = { .close = ns_close, .ioctl = ns_ioctl, .send = ns_send, + .send_bh = ns_send_bh, .phy_put = ns_phy_put, .phy_get = ns_phy_get, .proc_read = ns_proc_read, @@ -1620,7 +1622,7 @@ static void fill_tst(ns_dev * card, int n, vc_map * vc) card->tst_addr = new_tst; } -static int ns_send(struct atm_vcc *vcc, struct sk_buff *skb) +static int _ns_send(struct atm_vcc *vcc, struct sk_buff *skb, bool may_sleep) { ns_dev *card; vc_map *vc; @@ -1704,7 +1706,7 @@ static int ns_send(struct atm_vcc *vcc, struct sk_buff *skb) scq = card->scq0; } - if (push_scqe(card, vc, scq, &scqe, skb) != 0) { + if (push_scqe(card, vc, scq, &scqe, skb, may_sleep) != 0) { atomic_inc(&vcc->stats->tx_err); dev_kfree_skb_any(skb); return -EIO; @@ -1714,8 +1716,18 @@ static int ns_send(struct atm_vcc *vcc, struct sk_buff *skb) return 0; } +static int ns_send(struct atm_vcc *vcc, struct sk_buff *skb) +{ + return _ns_send(vcc, skb, true); +} + +static int ns_send_bh(struct atm_vcc *vcc, struct sk_buff *skb) +{ + return _ns_send(vcc, skb, false); +} + static int push_scqe(ns_dev * card, vc_map * vc, scq_info * scq, ns_scqe * tbd, - struct sk_buff *skb) + struct sk_buff *skb, bool may_sleep) { unsigned long flags; ns_scqe tsr; @@ -1726,7 +1738,7 @@ static int push_scqe(ns_dev * card, vc_map * vc, scq_info * scq, ns_scqe * tbd, spin_lock_irqsave(&scq->lock, flags); while (scq->tail == scq->next) { - if (in_interrupt()) { + if
(!may_sleep) { spin_unlock_irqrestore(&scq->lock, flags); printk("nicstar%d: Error pushing TBD.\n", card->index); return 1; @@ -1771,7 +1783,7 @@ static int push_scqe(ns_dev * card, vc_map * vc, scq_info * scq, ns_scqe * tbd, int has_run = 0; while (scq->tail == scq->next) { - if (in_interrupt()) { + if (!may_sleep) { data = scq_virt_to_bus(scq, scq->next); ns_write_sram(card, scq->scd, &data, 1); spin_unlock_irqrestore(&scq->lock, flags); diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 5d5ff2203fa2..d7493016cd46 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -186,6 +186,7 @@ struct atmdev_ops { /* only send is required */ void __user *arg); #endif int (*send)(struct atm_vcc *vcc,struct sk_buff *skb); + int (*send_bh)(struct atm_vcc *vcc, struct sk_buff *skb); int (*send_oam)(struct atm_vcc *vcc,void *cell,int flags); void (*phy_put)(struct atm_dev *dev,unsigned char value, unsigned long addr); diff --git a/net/atm/raw.c b/net/atm/raw.c index b3ba44aab0ee..2b5f78a7ec3e 100644 --- a/net/atm/raw.c +++ b/net/atm/raw.c @@ -54,6 +54,8 @@ static int atm_send_aal0(struct atm_vcc *vcc, struct sk_buff *skb) kfree_skb(skb); return -EADDRNOTAVAIL; } + if (vcc->dev->ops->send_bh) + return vcc->dev->ops->send_bh(vcc, skb); return vcc->dev->ops->send(vcc, skb); } @@ -71,7 +73,10 @@ int atm_init_aal34(struct atm_vcc *vcc) vcc->push = atm_push_raw; vcc->pop = atm_pop_raw; vcc->push_oam = NULL; - vcc->send = vcc->dev->ops->send; + if (vcc->dev->ops->send_bh) + vcc->send = vcc->dev->ops->send_bh; + else + vcc->send = vcc->dev->ops->send; return 0; } @@ -80,7 +85,10 @@ int atm_init_aal5(struct atm_vcc *vcc) vcc->push = atm_push_raw; vcc->pop = atm_pop_raw; vcc->push_oam = NULL; - vcc->send = vcc->dev->ops->send; + if (vcc->dev->ops->send_bh) + vcc->send = vcc->dev->ops->send_bh; + else + vcc->send = vcc->dev->ops->send; return 0; } EXPORT_SYMBOL(atm_init_aal5); -- cgit From 4abb1e5b63ac3281275315fc6b0cde0b9c2e2e42 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 11 Nov 2020 15:53:17 +0100 Subject: powerpc/mm: factor out creating/removing linear mapping We want to stop abusing the memory hotplug infrastructure in the memtrace code to perform allocations and remove the linear mapping. Instead we will use alloc_contig_pages() and remove the linear mapping manually. Let's factor out creating/removing the linear mapping into arch_create_linear_mapping() / arch_remove_linear_mapping() - so in the future, we might be able to have the whole of arch_add_memory() / arch_remove_memory() implemented in common code.
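For illustration only (not part of this patch), here is a sketch of how a future caller such as memtrace might pair alloc_contig_pages() with the new helper, following the plan stated above. Only arch_remove_linear_mapping() comes from this patch; memtrace_alloc_node() and the exact GFP flags are hypothetical.

    /*
     * Hypothetical sketch: allocate physically contiguous memory and
     * manually strip it out of the linear mapping, instead of abusing
     * add_memory()/remove_memory() for the same effect.
     */
    static u64 memtrace_alloc_node(u32 nid, u64 size)
    {
            struct page *page;
            u64 start_pa;

            page = alloc_contig_pages(size >> PAGE_SHIFT,
                                      GFP_KERNEL | __GFP_THISNODE, nid, NULL);
            if (!page)
                    return 0;
            start_pa = PFN_PHYS(page_to_pfn(page));

            /* Unmap from the linear mapping so stray accesses fault cleanly. */
            arch_remove_linear_mapping(start_pa, size);
            return start_pa;
    }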
Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201111145322.15793-4-david@redhat.com --- arch/powerpc/mm/mem.c | 41 ++++++++++++++++++++++++++++------------- include/linux/memory_hotplug.h | 3 +++ 2 files changed, 31 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 01ec2a252f09..8a86d81f8df0 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -120,34 +120,26 @@ static void flush_dcache_range_chunked(unsigned long start, unsigned long stop, } } -int __ref arch_add_memory(int nid, u64 start, u64 size, - struct mhp_params *params) +int __ref arch_create_linear_mapping(int nid, u64 start, u64 size, + struct mhp_params *params) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; int rc; start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size, nid, params->pgprot); if (rc) { - pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n", + pr_warn("Unable to create linear mapping for 0x%llx..0x%llx: %d\n", start, start + size, rc); return -EFAULT; } - - return __add_pages(nid, start_pfn, nr_pages, params); + return 0; } -void __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_linear_mapping(u64 start, u64 size) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - __remove_pages(start_pfn, nr_pages, altmap); - /* Remove htab bolted mappings for this section of memory */ start = (unsigned long)__va(start); flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE); @@ -160,6 +152,29 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, */ vm_unmap_aliases(); } + +int __ref arch_add_memory(int nid, u64 start, u64 size, + struct mhp_params *params) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int rc; + + rc = arch_create_linear_mapping(nid, start, size, params); + if (rc) + return rc; + return __add_pages(nid, start_pfn, nr_pages, params); +} + +void __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + __remove_pages(start_pfn, nr_pages, altmap); + arch_remove_linear_mapping(start, size); +} #endif #ifndef CONFIG_NEED_MULTIPLE_NODES diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d65c6fdc5cfc..00b9e9bd3850 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -375,6 +375,9 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages); +extern int arch_create_linear_mapping(int nid, u64 start, u64 size, + struct mhp_params *params); +void arch_remove_linear_mapping(u64 start, u64 size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit From 3b95bc57c86b064fd140ccec3642ad14f40b687f Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Wed, 18 Nov 2020 22:49:27 -0800 Subject: Input: adp5589-keys - remove setup/teardown hooks for gpios This is currently just dead code. 
It's from around the time when platform data was used, when a board could hook in its own setup/teardown callbacks and a private object (via 'context'). This change removes it, as there are no remaining users in mainline. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20201112074308.71351-4-alexandru.ardelean@analog.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/adp5589-keys.c | 21 --------------------- include/linux/input/adp5589.h | 7 ------- 2 files changed, 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/keyboard/adp5589-keys.c b/drivers/input/keyboard/adp5589-keys.c index 0e2ad9628ba9..4008cd3be18e 100644 --- a/drivers/input/keyboard/adp5589-keys.c +++ b/drivers/input/keyboard/adp5589-keys.c @@ -539,35 +539,14 @@ static int adp5589_gpio_add(struct adp5589_kpad *kpad) ADP5589_GPIO_DIRECTION_A) + i); } - if (gpio_data->setup) { - error = gpio_data->setup(kpad->client, - kpad->gc.base, kpad->gc.ngpio, - gpio_data->context); - if (error) - dev_warn(dev, "setup failed, %d\n", error); - } - return 0; } static void adp5589_gpio_remove(struct adp5589_kpad *kpad) { - struct device *dev = &kpad->client->dev; - const struct adp5589_kpad_platform_data *pdata = dev_get_platdata(dev); - const struct adp5589_gpio_platform_data *gpio_data = pdata->gpio_data; - int error; - if (!kpad->export_gpio) return; - if (gpio_data->teardown) { - error = gpio_data->teardown(kpad->client, - kpad->gc.base, kpad->gc.ngpio, - gpio_data->context); - if (error) - dev_warn(dev, "teardown failed %d\n", error); - } - gpiochip_remove(&kpad->gc); } #else diff --git a/include/linux/input/adp5589.h b/include/linux/input/adp5589.h index c0523af96893..0e4742c8c81e 100644 --- a/include/linux/input/adp5589.h +++ b/include/linux/input/adp5589.h @@ -175,13 +175,6 @@ struct i2c_client; /* forward declaration */ struct adp5589_gpio_platform_data { int gpio_start; /* GPIO Chip base # */ - int (*setup)(struct i2c_client *client, - int gpio, unsigned ngpio, - void *context); - int (*teardown)(struct i2c_client *client, - int gpio, unsigned ngpio, - void *context); - void *context; }; #endif -- cgit From 86b9d170da98bae13b307d621638954aef645331 Mon Sep 17 00:00:00 2001 From: Enric Balletbo i Serra Date: Tue, 10 Nov 2020 17:13:37 +0100 Subject: mfd: syscon: Add syscon_regmap_lookup_by_phandle_optional() function. This adds the syscon_regmap_lookup_by_phandle_optional() function to get an optional regmap. It behaves the same as syscon_regmap_lookup_by_phandle() except where there is no regmap phandle. In this case, instead of returning -ENODEV, the function returns NULL. This makes error checking simpler when the regmap phandle is optional. Suggested-by: Nicolas Boichat Signed-off-by: Enric Balletbo i Serra Reviewed-by: Matthias Brugger Reviewed-by: Arnd Bergmann Signed-off-by: Lee Jones --- drivers/mfd/syscon.c | 18 ++++++++++++++++++ include/linux/mfd/syscon.h | 11 +++++++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/syscon.c b/drivers/mfd/syscon.c index ca465794ea9c..c6f139b2e0c0 100644 --- a/drivers/mfd/syscon.c +++ b/drivers/mfd/syscon.c @@ -255,6 +255,24 @@ struct regmap *syscon_regmap_lookup_by_phandle_args(struct device_node *np, } EXPORT_SYMBOL_GPL(syscon_regmap_lookup_by_phandle_args); +/* + * It behaves the same as syscon_regmap_lookup_by_phandle() except where + * there is no regmap phandle. In this case, instead of returning -ENODEV, + * the function returns NULL.
+ */ +struct regmap *syscon_regmap_lookup_by_phandle_optional(struct device_node *np, + const char *property) +{ + struct regmap *regmap; + + regmap = syscon_regmap_lookup_by_phandle(np, property); + if (IS_ERR(regmap) && PTR_ERR(regmap) == -ENODEV) + return NULL; + + return regmap; +} +EXPORT_SYMBOL_GPL(syscon_regmap_lookup_by_phandle_optional); + static int syscon_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; diff --git a/include/linux/mfd/syscon.h b/include/linux/mfd/syscon.h index 7f20e9b502a5..fecc2fa2a364 100644 --- a/include/linux/mfd/syscon.h +++ b/include/linux/mfd/syscon.h @@ -28,6 +28,9 @@ extern struct regmap *syscon_regmap_lookup_by_phandle_args( const char *property, int arg_count, unsigned int *out_args); +extern struct regmap *syscon_regmap_lookup_by_phandle_optional( + struct device_node *np, + const char *property); #else static inline struct regmap *device_node_to_regmap(struct device_node *np) { @@ -59,6 +62,14 @@ static inline struct regmap *syscon_regmap_lookup_by_phandle_args( { return ERR_PTR(-ENOTSUPP); } + +static inline struct regmap *syscon_regmap_lookup_by_phandle_optional( + struct device_node *np, + const char *property) +{ + return NULL; +} + #endif #endif /* __LINUX_MFD_SYSCON_H__ */ -- cgit From 68a90a6c6443b07036ae4a878f6d85bf141471fd Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 25 Sep 2020 10:14:46 +0100 Subject: mfd: madera: Delete register field xxx_WIDTH defines The register field xxx_WIDTH defines are not used in current code. Signed-off-by: Richard Fitzgerald Signed-off-by: Lee Jones --- include/linux/mfd/madera/registers.h | 635 ----------------------------------- 1 file changed, 635 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/madera/registers.h b/include/linux/mfd/madera/registers.h index fe909d177762..b44aeb461d0c 100644 --- a/include/linux/mfd/madera/registers.h +++ b/include/linux/mfd/madera/registers.h @@ -1286,566 +1286,438 @@ /* (0x0000) Software_Reset */ #define MADERA_SW_RST_DEV_ID1_MASK 0xFFFF #define MADERA_SW_RST_DEV_ID1_SHIFT 0 -#define MADERA_SW_RST_DEV_ID1_WIDTH 16 /* (0x0001) Hardware_Revision */ #define MADERA_HW_REVISION_MASK 0x00FF #define MADERA_HW_REVISION_SHIFT 0 -#define MADERA_HW_REVISION_WIDTH 8 /* (0x0020) Tone_Generator_1 */ #define MADERA_TONE2_ENA 0x0002 #define MADERA_TONE2_ENA_MASK 0x0002 #define MADERA_TONE2_ENA_SHIFT 1 -#define MADERA_TONE2_ENA_WIDTH 1 #define MADERA_TONE1_ENA 0x0001 #define MADERA_TONE1_ENA_MASK 0x0001 #define MADERA_TONE1_ENA_SHIFT 0 -#define MADERA_TONE1_ENA_WIDTH 1 /* (0x0021) Tone_Generator_2 */ #define MADERA_TONE1_LVL_0_MASK 0xFFFF #define MADERA_TONE1_LVL_0_SHIFT 0 -#define MADERA_TONE1_LVL_0_WIDTH 16 /* (0x0022) Tone_Generator_3 */ #define MADERA_TONE1_LVL_MASK 0x00FF #define MADERA_TONE1_LVL_SHIFT 0 -#define MADERA_TONE1_LVL_WIDTH 8 /* (0x0023) Tone_Generator_4 */ #define MADERA_TONE2_LVL_0_MASK 0xFFFF #define MADERA_TONE2_LVL_0_SHIFT 0 -#define MADERA_TONE2_LVL_0_WIDTH 16 /* (0x0024) Tone_Generator_5 */ #define MADERA_TONE2_LVL_MASK 0x00FF #define MADERA_TONE2_LVL_SHIFT 0 -#define MADERA_TONE2_LVL_WIDTH 8 /* (0x0030) PWM_Drive_1 */ #define MADERA_PWM2_ENA 0x0002 #define MADERA_PWM2_ENA_MASK 0x0002 #define MADERA_PWM2_ENA_SHIFT 1 -#define MADERA_PWM2_ENA_WIDTH 1 #define MADERA_PWM1_ENA 0x0001 #define MADERA_PWM1_ENA_MASK 0x0001 #define MADERA_PWM1_ENA_SHIFT 0 -#define MADERA_PWM1_ENA_WIDTH 1 /* (0x00A0) Comfort_Noise_Generator */ #define MADERA_NOISE_GEN_ENA 0x0020 #define MADERA_NOISE_GEN_ENA_MASK 0x0020 #define 
MADERA_NOISE_GEN_ENA_SHIFT 5 -#define MADERA_NOISE_GEN_ENA_WIDTH 1 #define MADERA_NOISE_GEN_GAIN_MASK 0x001F #define MADERA_NOISE_GEN_GAIN_SHIFT 0 -#define MADERA_NOISE_GEN_GAIN_WIDTH 5 /* (0x0100) Clock_32k_1 */ #define MADERA_CLK_32K_ENA 0x0040 #define MADERA_CLK_32K_ENA_MASK 0x0040 #define MADERA_CLK_32K_ENA_SHIFT 6 -#define MADERA_CLK_32K_ENA_WIDTH 1 #define MADERA_CLK_32K_SRC_MASK 0x0003 #define MADERA_CLK_32K_SRC_SHIFT 0 -#define MADERA_CLK_32K_SRC_WIDTH 2 /* (0x0101) System_Clock_1 */ #define MADERA_SYSCLK_FRAC 0x8000 #define MADERA_SYSCLK_FRAC_MASK 0x8000 #define MADERA_SYSCLK_FRAC_SHIFT 15 -#define MADERA_SYSCLK_FRAC_WIDTH 1 #define MADERA_SYSCLK_FREQ_MASK 0x0700 #define MADERA_SYSCLK_FREQ_SHIFT 8 -#define MADERA_SYSCLK_FREQ_WIDTH 3 #define MADERA_SYSCLK_ENA 0x0040 #define MADERA_SYSCLK_ENA_MASK 0x0040 #define MADERA_SYSCLK_ENA_SHIFT 6 -#define MADERA_SYSCLK_ENA_WIDTH 1 #define MADERA_SYSCLK_SRC_MASK 0x000F #define MADERA_SYSCLK_SRC_SHIFT 0 -#define MADERA_SYSCLK_SRC_WIDTH 4 /* (0x0102) Sample_rate_1 */ #define MADERA_SAMPLE_RATE_1_MASK 0x001F #define MADERA_SAMPLE_RATE_1_SHIFT 0 -#define MADERA_SAMPLE_RATE_1_WIDTH 5 /* (0x0103) Sample_rate_2 */ #define MADERA_SAMPLE_RATE_2_MASK 0x001F #define MADERA_SAMPLE_RATE_2_SHIFT 0 -#define MADERA_SAMPLE_RATE_2_WIDTH 5 /* (0x0104) Sample_rate_3 */ #define MADERA_SAMPLE_RATE_3_MASK 0x001F #define MADERA_SAMPLE_RATE_3_SHIFT 0 -#define MADERA_SAMPLE_RATE_3_WIDTH 5 /* (0x0112) Async_clock_1 */ #define MADERA_ASYNC_CLK_FREQ_MASK 0x0700 #define MADERA_ASYNC_CLK_FREQ_SHIFT 8 -#define MADERA_ASYNC_CLK_FREQ_WIDTH 3 #define MADERA_ASYNC_CLK_ENA 0x0040 #define MADERA_ASYNC_CLK_ENA_MASK 0x0040 #define MADERA_ASYNC_CLK_ENA_SHIFT 6 -#define MADERA_ASYNC_CLK_ENA_WIDTH 1 #define MADERA_ASYNC_CLK_SRC_MASK 0x000F #define MADERA_ASYNC_CLK_SRC_SHIFT 0 -#define MADERA_ASYNC_CLK_SRC_WIDTH 4 /* (0x0113) Async_sample_rate_1 */ #define MADERA_ASYNC_SAMPLE_RATE_1_MASK 0x001F #define MADERA_ASYNC_SAMPLE_RATE_1_SHIFT 0 -#define MADERA_ASYNC_SAMPLE_RATE_1_WIDTH 5 /* (0x0114) Async_sample_rate_2 */ #define MADERA_ASYNC_SAMPLE_RATE_2_MASK 0x001F #define MADERA_ASYNC_SAMPLE_RATE_2_SHIFT 0 -#define MADERA_ASYNC_SAMPLE_RATE_2_WIDTH 5 /* (0x0120) DSP_Clock_1 */ #define MADERA_DSP_CLK_FREQ_LEGACY 0x0700 #define MADERA_DSP_CLK_FREQ_LEGACY_MASK 0x0700 #define MADERA_DSP_CLK_FREQ_LEGACY_SHIFT 8 -#define MADERA_DSP_CLK_FREQ_LEGACY_WIDTH 3 #define MADERA_DSP_CLK_ENA 0x0040 #define MADERA_DSP_CLK_ENA_MASK 0x0040 #define MADERA_DSP_CLK_ENA_SHIFT 6 -#define MADERA_DSP_CLK_ENA_WIDTH 1 #define MADERA_DSP_CLK_SRC 0x000F #define MADERA_DSP_CLK_SRC_MASK 0x000F #define MADERA_DSP_CLK_SRC_SHIFT 0 -#define MADERA_DSP_CLK_SRC_WIDTH 4 /* (0x0122) DSP_Clock_2 */ #define MADERA_DSP_CLK_FREQ_MASK 0x03FF #define MADERA_DSP_CLK_FREQ_SHIFT 0 -#define MADERA_DSP_CLK_FREQ_WIDTH 10 /* (0x0149) Output_system_clock */ #define MADERA_OPCLK_ENA 0x8000 #define MADERA_OPCLK_ENA_MASK 0x8000 #define MADERA_OPCLK_ENA_SHIFT 15 -#define MADERA_OPCLK_ENA_WIDTH 1 #define MADERA_OPCLK_DIV_MASK 0x00F8 #define MADERA_OPCLK_DIV_SHIFT 3 -#define MADERA_OPCLK_DIV_WIDTH 5 #define MADERA_OPCLK_SEL_MASK 0x0007 #define MADERA_OPCLK_SEL_SHIFT 0 -#define MADERA_OPCLK_SEL_WIDTH 3 /* (0x014A) Output_async_clock */ #define MADERA_OPCLK_ASYNC_ENA 0x8000 #define MADERA_OPCLK_ASYNC_ENA_MASK 0x8000 #define MADERA_OPCLK_ASYNC_ENA_SHIFT 15 -#define MADERA_OPCLK_ASYNC_ENA_WIDTH 1 #define MADERA_OPCLK_ASYNC_DIV_MASK 0x00F8 #define MADERA_OPCLK_ASYNC_DIV_SHIFT 3 -#define MADERA_OPCLK_ASYNC_DIV_WIDTH 5 #define MADERA_OPCLK_ASYNC_SEL_MASK 
0x0007 #define MADERA_OPCLK_ASYNC_SEL_SHIFT 0 -#define MADERA_OPCLK_ASYNC_SEL_WIDTH 3 /* (0x0171) FLL1_Control_1 */ #define CS47L92_FLL1_REFCLK_SRC_MASK 0xF000 #define CS47L92_FLL1_REFCLK_SRC_SHIFT 12 -#define CS47L92_FLL1_REFCLK_SRC_WIDTH 4 #define MADERA_FLL1_HOLD_MASK 0x0004 #define MADERA_FLL1_HOLD_SHIFT 2 -#define MADERA_FLL1_HOLD_WIDTH 1 #define MADERA_FLL1_FREERUN 0x0002 #define MADERA_FLL1_FREERUN_MASK 0x0002 #define MADERA_FLL1_FREERUN_SHIFT 1 -#define MADERA_FLL1_FREERUN_WIDTH 1 #define MADERA_FLL1_ENA 0x0001 #define MADERA_FLL1_ENA_MASK 0x0001 #define MADERA_FLL1_ENA_SHIFT 0 -#define MADERA_FLL1_ENA_WIDTH 1 /* (0x0172) FLL1_Control_2 */ #define MADERA_FLL1_CTRL_UPD 0x8000 #define MADERA_FLL1_CTRL_UPD_MASK 0x8000 #define MADERA_FLL1_CTRL_UPD_SHIFT 15 -#define MADERA_FLL1_CTRL_UPD_WIDTH 1 #define MADERA_FLL1_N_MASK 0x03FF #define MADERA_FLL1_N_SHIFT 0 -#define MADERA_FLL1_N_WIDTH 10 /* (0x0173) FLL1_Control_3 */ #define MADERA_FLL1_THETA_MASK 0xFFFF #define MADERA_FLL1_THETA_SHIFT 0 -#define MADERA_FLL1_THETA_WIDTH 16 /* (0x0174) FLL1_Control_4 */ #define MADERA_FLL1_LAMBDA_MASK 0xFFFF #define MADERA_FLL1_LAMBDA_SHIFT 0 -#define MADERA_FLL1_LAMBDA_WIDTH 16 /* (0x0175) FLL1_Control_5 */ #define MADERA_FLL1_FRATIO_MASK 0x0F00 #define MADERA_FLL1_FRATIO_SHIFT 8 -#define MADERA_FLL1_FRATIO_WIDTH 4 #define MADERA_FLL1_FB_DIV_MASK 0x03FF #define MADERA_FLL1_FB_DIV_SHIFT 0 -#define MADERA_FLL1_FB_DIV_WIDTH 10 /* (0x0176) FLL1_Control_6 */ #define MADERA_FLL1_REFCLK_DIV_MASK 0x00C0 #define MADERA_FLL1_REFCLK_DIV_SHIFT 6 -#define MADERA_FLL1_REFCLK_DIV_WIDTH 2 #define MADERA_FLL1_REFCLK_SRC_MASK 0x000F #define MADERA_FLL1_REFCLK_SRC_SHIFT 0 -#define MADERA_FLL1_REFCLK_SRC_WIDTH 4 /* (0x0179) FLL1_Control_7 */ #define MADERA_FLL1_GAIN_MASK 0x003c #define MADERA_FLL1_GAIN_SHIFT 2 -#define MADERA_FLL1_GAIN_WIDTH 4 /* (0x017A) FLL1_EFS_2 */ #define MADERA_FLL1_PHASE_GAIN_MASK 0xF000 #define MADERA_FLL1_PHASE_GAIN_SHIFT 12 -#define MADERA_FLL1_PHASE_GAIN_WIDTH 4 #define MADERA_FLL1_PHASE_ENA_MASK 0x0800 #define MADERA_FLL1_PHASE_ENA_SHIFT 11 -#define MADERA_FLL1_PHASE_ENA_WIDTH 1 /* (0x017A) FLL1_Control_10 */ #define MADERA_FLL1_HP_MASK 0xC000 #define MADERA_FLL1_HP_SHIFT 14 -#define MADERA_FLL1_HP_WIDTH 2 #define MADERA_FLL1_PHASEDET_ENA_MASK 0x1000 #define MADERA_FLL1_PHASEDET_ENA_SHIFT 12 -#define MADERA_FLL1_PHASEDET_ENA_WIDTH 1 /* (0x017B) FLL1_Control_11 */ #define MADERA_FLL1_LOCKDET_THR_MASK 0x001E #define MADERA_FLL1_LOCKDET_THR_SHIFT 1 -#define MADERA_FLL1_LOCKDET_THR_WIDTH 4 #define MADERA_FLL1_LOCKDET_MASK 0x0001 #define MADERA_FLL1_LOCKDET_SHIFT 0 -#define MADERA_FLL1_LOCKDET_WIDTH 1 /* (0x017D) FLL1_Digital_Test_1 */ #define MADERA_FLL1_SYNC_EFS_ENA_MASK 0x0100 #define MADERA_FLL1_SYNC_EFS_ENA_SHIFT 8 -#define MADERA_FLL1_SYNC_EFS_ENA_WIDTH 1 #define MADERA_FLL1_CLK_VCO_FAST_SRC_MASK 0x0003 #define MADERA_FLL1_CLK_VCO_FAST_SRC_SHIFT 0 -#define MADERA_FLL1_CLK_VCO_FAST_SRC_WIDTH 2 /* (0x0181) FLL1_Synchroniser_1 */ #define MADERA_FLL1_SYNC_ENA 0x0001 #define MADERA_FLL1_SYNC_ENA_MASK 0x0001 #define MADERA_FLL1_SYNC_ENA_SHIFT 0 -#define MADERA_FLL1_SYNC_ENA_WIDTH 1 /* (0x0182) FLL1_Synchroniser_2 */ #define MADERA_FLL1_SYNC_N_MASK 0x03FF #define MADERA_FLL1_SYNC_N_SHIFT 0 -#define MADERA_FLL1_SYNC_N_WIDTH 10 /* (0x0183) FLL1_Synchroniser_3 */ #define MADERA_FLL1_SYNC_THETA_MASK 0xFFFF #define MADERA_FLL1_SYNC_THETA_SHIFT 0 -#define MADERA_FLL1_SYNC_THETA_WIDTH 16 /* (0x0184) FLL1_Synchroniser_4 */ #define MADERA_FLL1_SYNC_LAMBDA_MASK 0xFFFF #define MADERA_FLL1_SYNC_LAMBDA_SHIFT 0 
-#define MADERA_FLL1_SYNC_LAMBDA_WIDTH 16 /* (0x0185) FLL1_Synchroniser_5 */ #define MADERA_FLL1_SYNC_FRATIO_MASK 0x0700 #define MADERA_FLL1_SYNC_FRATIO_SHIFT 8 -#define MADERA_FLL1_SYNC_FRATIO_WIDTH 3 /* (0x0186) FLL1_Synchroniser_6 */ #define MADERA_FLL1_SYNCCLK_DIV_MASK 0x00C0 #define MADERA_FLL1_SYNCCLK_DIV_SHIFT 6 -#define MADERA_FLL1_SYNCCLK_DIV_WIDTH 2 #define MADERA_FLL1_SYNCCLK_SRC_MASK 0x000F #define MADERA_FLL1_SYNCCLK_SRC_SHIFT 0 -#define MADERA_FLL1_SYNCCLK_SRC_WIDTH 4 /* (0x0187) FLL1_Synchroniser_7 */ #define MADERA_FLL1_SYNC_GAIN_MASK 0x003c #define MADERA_FLL1_SYNC_GAIN_SHIFT 2 -#define MADERA_FLL1_SYNC_GAIN_WIDTH 4 #define MADERA_FLL1_SYNC_DFSAT 0x0001 #define MADERA_FLL1_SYNC_DFSAT_MASK 0x0001 #define MADERA_FLL1_SYNC_DFSAT_SHIFT 0 -#define MADERA_FLL1_SYNC_DFSAT_WIDTH 1 /* (0x01D1) FLL_AO_Control_1 */ #define MADERA_FLL_AO_HOLD 0x0004 #define MADERA_FLL_AO_HOLD_MASK 0x0004 #define MADERA_FLL_AO_HOLD_SHIFT 2 -#define MADERA_FLL_AO_HOLD_WIDTH 1 #define MADERA_FLL_AO_FREERUN 0x0002 #define MADERA_FLL_AO_FREERUN_MASK 0x0002 #define MADERA_FLL_AO_FREERUN_SHIFT 1 -#define MADERA_FLL_AO_FREERUN_WIDTH 1 #define MADERA_FLL_AO_ENA 0x0001 #define MADERA_FLL_AO_ENA_MASK 0x0001 #define MADERA_FLL_AO_ENA_SHIFT 0 -#define MADERA_FLL_AO_ENA_WIDTH 1 /* (0x01D2) FLL_AO_Control_2 */ #define MADERA_FLL_AO_CTRL_UPD 0x8000 #define MADERA_FLL_AO_CTRL_UPD_MASK 0x8000 #define MADERA_FLL_AO_CTRL_UPD_SHIFT 15 -#define MADERA_FLL_AO_CTRL_UPD_WIDTH 1 /* (0x01D6) FLL_AO_Control_6 */ #define MADERA_FLL_AO_REFCLK_SRC_MASK 0x000F #define MADERA_FLL_AO_REFCLK_SRC_SHIFT 0 -#define MADERA_FLL_AO_REFCLK_SRC_WIDTH 4 /* (0x0200) Mic_Charge_Pump_1 */ #define MADERA_CPMIC_BYPASS 0x0002 #define MADERA_CPMIC_BYPASS_MASK 0x0002 #define MADERA_CPMIC_BYPASS_SHIFT 1 -#define MADERA_CPMIC_BYPASS_WIDTH 1 #define MADERA_CPMIC_ENA 0x0001 #define MADERA_CPMIC_ENA_MASK 0x0001 #define MADERA_CPMIC_ENA_SHIFT 0 -#define MADERA_CPMIC_ENA_WIDTH 1 /* (0x0210) LDO1_Control_1 */ #define MADERA_LDO1_VSEL_MASK 0x07E0 #define MADERA_LDO1_VSEL_SHIFT 5 -#define MADERA_LDO1_VSEL_WIDTH 6 #define MADERA_LDO1_FAST 0x0010 #define MADERA_LDO1_FAST_MASK 0x0010 #define MADERA_LDO1_FAST_SHIFT 4 -#define MADERA_LDO1_FAST_WIDTH 1 #define MADERA_LDO1_DISCH 0x0004 #define MADERA_LDO1_DISCH_MASK 0x0004 #define MADERA_LDO1_DISCH_SHIFT 2 -#define MADERA_LDO1_DISCH_WIDTH 1 #define MADERA_LDO1_BYPASS 0x0002 #define MADERA_LDO1_BYPASS_MASK 0x0002 #define MADERA_LDO1_BYPASS_SHIFT 1 -#define MADERA_LDO1_BYPASS_WIDTH 1 #define MADERA_LDO1_ENA 0x0001 #define MADERA_LDO1_ENA_MASK 0x0001 #define MADERA_LDO1_ENA_SHIFT 0 -#define MADERA_LDO1_ENA_WIDTH 1 /* (0x0213) LDO2_Control_1 */ #define MADERA_LDO2_VSEL_MASK 0x07E0 #define MADERA_LDO2_VSEL_SHIFT 5 -#define MADERA_LDO2_VSEL_WIDTH 6 #define MADERA_LDO2_FAST 0x0010 #define MADERA_LDO2_FAST_MASK 0x0010 #define MADERA_LDO2_FAST_SHIFT 4 -#define MADERA_LDO2_FAST_WIDTH 1 #define MADERA_LDO2_DISCH 0x0004 #define MADERA_LDO2_DISCH_MASK 0x0004 #define MADERA_LDO2_DISCH_SHIFT 2 -#define MADERA_LDO2_DISCH_WIDTH 1 #define MADERA_LDO2_BYPASS 0x0002 #define MADERA_LDO2_BYPASS_MASK 0x0002 #define MADERA_LDO2_BYPASS_SHIFT 1 -#define MADERA_LDO2_BYPASS_WIDTH 1 #define MADERA_LDO2_ENA 0x0001 #define MADERA_LDO2_ENA_MASK 0x0001 #define MADERA_LDO2_ENA_SHIFT 0 -#define MADERA_LDO2_ENA_WIDTH 1 /* (0x0218) Mic_Bias_Ctrl_1 */ #define MADERA_MICB1_EXT_CAP 0x8000 #define MADERA_MICB1_EXT_CAP_MASK 0x8000 #define MADERA_MICB1_EXT_CAP_SHIFT 15 -#define MADERA_MICB1_EXT_CAP_WIDTH 1 #define MADERA_MICB1_LVL_MASK 0x01E0 #define 
MADERA_MICB1_LVL_SHIFT 5 -#define MADERA_MICB1_LVL_WIDTH 4 #define MADERA_MICB1_ENA 0x0001 #define MADERA_MICB1_ENA_MASK 0x0001 #define MADERA_MICB1_ENA_SHIFT 0 -#define MADERA_MICB1_ENA_WIDTH 1 /* (0x021C) Mic_Bias_Ctrl_5 */ #define MADERA_MICB1D_ENA 0x1000 #define MADERA_MICB1D_ENA_MASK 0x1000 #define MADERA_MICB1D_ENA_SHIFT 12 -#define MADERA_MICB1D_ENA_WIDTH 1 #define MADERA_MICB1C_ENA 0x0100 #define MADERA_MICB1C_ENA_MASK 0x0100 #define MADERA_MICB1C_ENA_SHIFT 8 -#define MADERA_MICB1C_ENA_WIDTH 1 #define MADERA_MICB1B_ENA 0x0010 #define MADERA_MICB1B_ENA_MASK 0x0010 #define MADERA_MICB1B_ENA_SHIFT 4 -#define MADERA_MICB1B_ENA_WIDTH 1 #define MADERA_MICB1A_ENA 0x0001 #define MADERA_MICB1A_ENA_MASK 0x0001 #define MADERA_MICB1A_ENA_SHIFT 0 -#define MADERA_MICB1A_ENA_WIDTH 1 /* (0x021E) Mic_Bias_Ctrl_6 */ #define MADERA_MICB2D_ENA 0x1000 #define MADERA_MICB2D_ENA_MASK 0x1000 #define MADERA_MICB2D_ENA_SHIFT 12 -#define MADERA_MICB2D_ENA_WIDTH 1 #define MADERA_MICB2C_ENA 0x0100 #define MADERA_MICB2C_ENA_MASK 0x0100 #define MADERA_MICB2C_ENA_SHIFT 8 -#define MADERA_MICB2C_ENA_WIDTH 1 #define MADERA_MICB2B_ENA 0x0010 #define MADERA_MICB2B_ENA_MASK 0x0010 #define MADERA_MICB2B_ENA_SHIFT 4 -#define MADERA_MICB2B_ENA_WIDTH 1 #define MADERA_MICB2A_ENA 0x0001 #define MADERA_MICB2A_ENA_MASK 0x0001 #define MADERA_MICB2A_ENA_SHIFT 0 -#define MADERA_MICB2A_ENA_WIDTH 1 /* (0x0225) - HP Ctrl 1L */ #define MADERA_RMV_SHRT_HP1L 0x4000 #define MADERA_RMV_SHRT_HP1L_MASK 0x4000 #define MADERA_RMV_SHRT_HP1L_SHIFT 14 -#define MADERA_RMV_SHRT_HP1L_WIDTH 1 #define MADERA_HP1L_FLWR 0x0004 #define MADERA_HP1L_FLWR_MASK 0x0004 #define MADERA_HP1L_FLWR_SHIFT 2 -#define MADERA_HP1L_FLWR_WIDTH 1 #define MADERA_HP1L_SHRTI 0x0002 #define MADERA_HP1L_SHRTI_MASK 0x0002 #define MADERA_HP1L_SHRTI_SHIFT 1 -#define MADERA_HP1L_SHRTI_WIDTH 1 #define MADERA_HP1L_SHRTO 0x0001 #define MADERA_HP1L_SHRTO_MASK 0x0001 #define MADERA_HP1L_SHRTO_SHIFT 0 -#define MADERA_HP1L_SHRTO_WIDTH 1 /* (0x0226) - HP Ctrl 1R */ #define MADERA_RMV_SHRT_HP1R 0x4000 #define MADERA_RMV_SHRT_HP1R_MASK 0x4000 #define MADERA_RMV_SHRT_HP1R_SHIFT 14 -#define MADERA_RMV_SHRT_HP1R_WIDTH 1 #define MADERA_HP1R_FLWR 0x0004 #define MADERA_HP1R_FLWR_MASK 0x0004 #define MADERA_HP1R_FLWR_SHIFT 2 -#define MADERA_HP1R_FLWR_WIDTH 1 #define MADERA_HP1R_SHRTI 0x0002 #define MADERA_HP1R_SHRTI_MASK 0x0002 #define MADERA_HP1R_SHRTI_SHIFT 1 -#define MADERA_HP1R_SHRTI_WIDTH 1 #define MADERA_HP1R_SHRTO 0x0001 #define MADERA_HP1R_SHRTO_MASK 0x0001 #define MADERA_HP1R_SHRTO_SHIFT 0 -#define MADERA_HP1R_SHRTO_WIDTH 1 /* (0x0293) Accessory_Detect_Mode_1 */ #define MADERA_ACCDET_SRC 0x2000 #define MADERA_ACCDET_SRC_MASK 0x2000 #define MADERA_ACCDET_SRC_SHIFT 13 -#define MADERA_ACCDET_SRC_WIDTH 1 #define MADERA_ACCDET_POLARITY_INV_ENA 0x0080 #define MADERA_ACCDET_POLARITY_INV_ENA_MASK 0x0080 #define MADERA_ACCDET_POLARITY_INV_ENA_SHIFT 7 -#define MADERA_ACCDET_POLARITY_INV_ENA_WIDTH 1 #define MADERA_ACCDET_MODE_MASK 0x0007 #define MADERA_ACCDET_MODE_SHIFT 0 -#define MADERA_ACCDET_MODE_WIDTH 3 /* (0x0299) Headphone_Detect_0 */ #define MADERA_HPD_GND_SEL 0x0007 #define MADERA_HPD_GND_SEL_MASK 0x0007 #define MADERA_HPD_GND_SEL_SHIFT 0 -#define MADERA_HPD_GND_SEL_WIDTH 3 #define MADERA_HPD_SENSE_SEL 0x00F0 #define MADERA_HPD_SENSE_SEL_MASK 0x00F0 #define MADERA_HPD_SENSE_SEL_SHIFT 4 -#define MADERA_HPD_SENSE_SEL_WIDTH 4 #define MADERA_HPD_FRC_SEL 0x0F00 #define MADERA_HPD_FRC_SEL_MASK 0x0F00 #define MADERA_HPD_FRC_SEL_SHIFT 8 -#define MADERA_HPD_FRC_SEL_WIDTH 4 #define 
MADERA_HPD_OUT_SEL 0x7000 #define MADERA_HPD_OUT_SEL_MASK 0x7000 #define MADERA_HPD_OUT_SEL_SHIFT 12 -#define MADERA_HPD_OUT_SEL_WIDTH 3 #define MADERA_HPD_OVD_ENA_SEL 0x8000 #define MADERA_HPD_OVD_ENA_SEL_MASK 0x8000 #define MADERA_HPD_OVD_ENA_SEL_SHIFT 15 -#define MADERA_HPD_OVD_ENA_SEL_WIDTH 1 /* (0x029B) Headphone_Detect_1 */ #define MADERA_HP_IMPEDANCE_RANGE_MASK 0x0600 #define MADERA_HP_IMPEDANCE_RANGE_SHIFT 9 -#define MADERA_HP_IMPEDANCE_RANGE_WIDTH 2 #define MADERA_HP_STEP_SIZE 0x0100 #define MADERA_HP_STEP_SIZE_MASK 0x0100 #define MADERA_HP_STEP_SIZE_SHIFT 8 -#define MADERA_HP_STEP_SIZE_WIDTH 1 #define MADERA_HP_CLK_DIV_MASK 0x0018 #define MADERA_HP_CLK_DIV_SHIFT 3 -#define MADERA_HP_CLK_DIV_WIDTH 2 #define MADERA_HP_RATE_MASK 0x0006 #define MADERA_HP_RATE_SHIFT 1 -#define MADERA_HP_RATE_WIDTH 2 #define MADERA_HP_POLL 0x0001 #define MADERA_HP_POLL_MASK 0x0001 #define MADERA_HP_POLL_SHIFT 0 -#define MADERA_HP_POLL_WIDTH 1 /* (0x029C) Headphone_Detect_2 */ #define MADERA_HP_DONE_MASK 0x8000 #define MADERA_HP_DONE_SHIFT 15 -#define MADERA_HP_DONE_WIDTH 1 #define MADERA_HP_LVL_MASK 0x7FFF #define MADERA_HP_LVL_SHIFT 0 -#define MADERA_HP_LVL_WIDTH 15 /* (0x029D) Headphone_Detect_3 */ #define MADERA_HP_DACVAL_MASK 0x03FF #define MADERA_HP_DACVAL_SHIFT 0 -#define MADERA_HP_DACVAL_WIDTH 10 /* (0x029F) - Headphone Detect 5 */ #define MADERA_HP_DACVAL_DOWN_MASK 0x03FF #define MADERA_HP_DACVAL_DOWN_SHIFT 0 -#define MADERA_HP_DACVAL_DOWN_WIDTH 10 /* (0x02A2) Mic_Detect_1_Control_0 */ #define MADERA_MICD1_GND_MASK 0x0007 #define MADERA_MICD1_GND_SHIFT 0 -#define MADERA_MICD1_GND_WIDTH 3 #define MADERA_MICD1_SENSE_MASK 0x00F0 #define MADERA_MICD1_SENSE_SHIFT 4 -#define MADERA_MICD1_SENSE_WIDTH 4 #define MADERA_MICD1_ADC_MODE_MASK 0x8000 #define MADERA_MICD1_ADC_MODE_SHIFT 15 -#define MADERA_MICD1_ADC_MODE_WIDTH 1 /* (0x02A3) Mic_Detect_1_Control_1 */ #define MADERA_MICD_BIAS_STARTTIME_MASK 0xF000 #define MADERA_MICD_BIAS_STARTTIME_SHIFT 12 -#define MADERA_MICD_BIAS_STARTTIME_WIDTH 4 #define MADERA_MICD_RATE_MASK 0x0F00 #define MADERA_MICD_RATE_SHIFT 8 -#define MADERA_MICD_RATE_WIDTH 4 #define MADERA_MICD_BIAS_SRC_MASK 0x00F0 #define MADERA_MICD_BIAS_SRC_SHIFT 4 -#define MADERA_MICD_BIAS_SRC_WIDTH 4 #define MADERA_MICD_DBTIME 0x0002 #define MADERA_MICD_DBTIME_MASK 0x0002 #define MADERA_MICD_DBTIME_SHIFT 1 -#define MADERA_MICD_DBTIME_WIDTH 1 #define MADERA_MICD_ENA 0x0001 #define MADERA_MICD_ENA_MASK 0x0001 #define MADERA_MICD_ENA_SHIFT 0 -#define MADERA_MICD_ENA_WIDTH 1 /* (0x02A4) Mic_Detect_1_Control_2 */ #define MADERA_MICD_LVL_SEL_MASK 0x00FF #define MADERA_MICD_LVL_SEL_SHIFT 0 -#define MADERA_MICD_LVL_SEL_WIDTH 8 /* (0x02A5) Mic_Detect_1_Control_3 */ #define MADERA_MICD_LVL_0 0x0004 @@ -1859,1746 +1731,1341 @@ #define MADERA_MICD_LVL_8 0x0400 #define MADERA_MICD_LVL_MASK 0x07FC #define MADERA_MICD_LVL_SHIFT 2 -#define MADERA_MICD_LVL_WIDTH 9 #define MADERA_MICD_VALID 0x0002 #define MADERA_MICD_VALID_MASK 0x0002 #define MADERA_MICD_VALID_SHIFT 1 -#define MADERA_MICD_VALID_WIDTH 1 #define MADERA_MICD_STS 0x0001 #define MADERA_MICD_STS_MASK 0x0001 #define MADERA_MICD_STS_SHIFT 0 -#define MADERA_MICD_STS_WIDTH 1 /* (0x02AB) Mic_Detect_1_Control_4 */ #define MADERA_MICDET_ADCVAL_DIFF_MASK 0xFF00 #define MADERA_MICDET_ADCVAL_DIFF_SHIFT 8 -#define MADERA_MICDET_ADCVAL_DIFF_WIDTH 8 #define MADERA_MICDET_ADCVAL_MASK 0x007F #define MADERA_MICDET_ADCVAL_SHIFT 0 -#define MADERA_MICDET_ADCVAL_WIDTH 7 /* (0x02C6) Micd_Clamp_control */ #define MADERA_MICD_CLAMP_OVD 0x0010 #define 
MADERA_MICD_CLAMP_OVD_MASK 0x0010 #define MADERA_MICD_CLAMP_OVD_SHIFT 4 -#define MADERA_MICD_CLAMP_OVD_WIDTH 1 #define MADERA_MICD_CLAMP_MODE_MASK 0x000F #define MADERA_MICD_CLAMP_MODE_SHIFT 0 -#define MADERA_MICD_CLAMP_MODE_WIDTH 4 /* (0x02C8) GP_Switch_1 */ #define MADERA_SW2_MODE_MASK 0x000C #define MADERA_SW2_MODE_SHIFT 2 -#define MADERA_SW2_MODE_WIDTH 2 #define MADERA_SW1_MODE_MASK 0x0003 #define MADERA_SW1_MODE_SHIFT 0 -#define MADERA_SW1_MODE_WIDTH 2 /* (0x02D3) Jack_detect_analogue */ #define MADERA_JD2_ENA 0x0002 #define MADERA_JD2_ENA_MASK 0x0002 #define MADERA_JD2_ENA_SHIFT 1 -#define MADERA_JD2_ENA_WIDTH 1 #define MADERA_JD1_ENA 0x0001 #define MADERA_JD1_ENA_MASK 0x0001 #define MADERA_JD1_ENA_SHIFT 0 -#define MADERA_JD1_ENA_WIDTH 1 /* (0x0300) Input_Enables */ #define MADERA_IN6L_ENA 0x0800 #define MADERA_IN6L_ENA_MASK 0x0800 #define MADERA_IN6L_ENA_SHIFT 11 -#define MADERA_IN6L_ENA_WIDTH 1 #define MADERA_IN6R_ENA 0x0400 #define MADERA_IN6R_ENA_MASK 0x0400 #define MADERA_IN6R_ENA_SHIFT 10 -#define MADERA_IN6R_ENA_WIDTH 1 #define MADERA_IN5L_ENA 0x0200 #define MADERA_IN5L_ENA_MASK 0x0200 #define MADERA_IN5L_ENA_SHIFT 9 -#define MADERA_IN5L_ENA_WIDTH 1 #define MADERA_IN5R_ENA 0x0100 #define MADERA_IN5R_ENA_MASK 0x0100 #define MADERA_IN5R_ENA_SHIFT 8 -#define MADERA_IN5R_ENA_WIDTH 1 #define MADERA_IN4L_ENA 0x0080 #define MADERA_IN4L_ENA_MASK 0x0080 #define MADERA_IN4L_ENA_SHIFT 7 -#define MADERA_IN4L_ENA_WIDTH 1 #define MADERA_IN4R_ENA 0x0040 #define MADERA_IN4R_ENA_MASK 0x0040 #define MADERA_IN4R_ENA_SHIFT 6 -#define MADERA_IN4R_ENA_WIDTH 1 #define MADERA_IN3L_ENA 0x0020 #define MADERA_IN3L_ENA_MASK 0x0020 #define MADERA_IN3L_ENA_SHIFT 5 -#define MADERA_IN3L_ENA_WIDTH 1 #define MADERA_IN3R_ENA 0x0010 #define MADERA_IN3R_ENA_MASK 0x0010 #define MADERA_IN3R_ENA_SHIFT 4 -#define MADERA_IN3R_ENA_WIDTH 1 #define MADERA_IN2L_ENA 0x0008 #define MADERA_IN2L_ENA_MASK 0x0008 #define MADERA_IN2L_ENA_SHIFT 3 -#define MADERA_IN2L_ENA_WIDTH 1 #define MADERA_IN2R_ENA 0x0004 #define MADERA_IN2R_ENA_MASK 0x0004 #define MADERA_IN2R_ENA_SHIFT 2 -#define MADERA_IN2R_ENA_WIDTH 1 #define MADERA_IN1L_ENA 0x0002 #define MADERA_IN1L_ENA_MASK 0x0002 #define MADERA_IN1L_ENA_SHIFT 1 -#define MADERA_IN1L_ENA_WIDTH 1 #define MADERA_IN1R_ENA 0x0001 #define MADERA_IN1R_ENA_MASK 0x0001 #define MADERA_IN1R_ENA_SHIFT 0 -#define MADERA_IN1R_ENA_WIDTH 1 /* (0x0308) Input_Rate */ #define MADERA_IN_RATE_MASK 0xF800 #define MADERA_IN_RATE_SHIFT 11 -#define MADERA_IN_RATE_WIDTH 5 #define MADERA_IN_MODE_MASK 0x0400 #define MADERA_IN_MODE_SHIFT 10 -#define MADERA_IN_MODE_WIDTH 1 /* (0x0309) Input_Volume_Ramp */ #define MADERA_IN_VD_RAMP_MASK 0x0070 #define MADERA_IN_VD_RAMP_SHIFT 4 -#define MADERA_IN_VD_RAMP_WIDTH 3 #define MADERA_IN_VI_RAMP_MASK 0x0007 #define MADERA_IN_VI_RAMP_SHIFT 0 -#define MADERA_IN_VI_RAMP_WIDTH 3 /* (0x030C) HPF_Control */ #define MADERA_IN_HPF_CUT_MASK 0x0007 #define MADERA_IN_HPF_CUT_SHIFT 0 -#define MADERA_IN_HPF_CUT_WIDTH 3 /* (0x0310) IN1L_Control */ #define MADERA_IN1L_HPF_MASK 0x8000 #define MADERA_IN1L_HPF_SHIFT 15 -#define MADERA_IN1L_HPF_WIDTH 1 #define MADERA_IN1_DMIC_SUP_MASK 0x1800 #define MADERA_IN1_DMIC_SUP_SHIFT 11 -#define MADERA_IN1_DMIC_SUP_WIDTH 2 #define MADERA_IN1_MODE_MASK 0x0400 #define MADERA_IN1_MODE_SHIFT 10 -#define MADERA_IN1_MODE_WIDTH 1 #define MADERA_IN1L_PGA_VOL_MASK 0x00FE #define MADERA_IN1L_PGA_VOL_SHIFT 1 -#define MADERA_IN1L_PGA_VOL_WIDTH 7 /* (0x0311) ADC_Digital_Volume_1L */ #define MADERA_IN1L_SRC_MASK 0x4000 #define MADERA_IN1L_SRC_SHIFT 14 -#define 
MADERA_IN1L_SRC_WIDTH 1 #define MADERA_IN1L_SRC_SE_MASK 0x2000 #define MADERA_IN1L_SRC_SE_SHIFT 13 -#define MADERA_IN1L_SRC_SE_WIDTH 1 #define MADERA_IN1L_LP_MODE 0x0800 #define MADERA_IN1L_LP_MODE_MASK 0x0800 #define MADERA_IN1L_LP_MODE_SHIFT 11 -#define MADERA_IN1L_LP_MODE_WIDTH 1 #define MADERA_IN_VU 0x0200 #define MADERA_IN_VU_MASK 0x0200 #define MADERA_IN_VU_SHIFT 9 -#define MADERA_IN_VU_WIDTH 1 #define MADERA_IN1L_MUTE 0x0100 #define MADERA_IN1L_MUTE_MASK 0x0100 #define MADERA_IN1L_MUTE_SHIFT 8 -#define MADERA_IN1L_MUTE_WIDTH 1 #define MADERA_IN1L_DIG_VOL_MASK 0x00FF #define MADERA_IN1L_DIG_VOL_SHIFT 0 -#define MADERA_IN1L_DIG_VOL_WIDTH 8 /* (0x0312) DMIC1L_Control */ #define MADERA_IN1_OSR_MASK 0x0700 #define MADERA_IN1_OSR_SHIFT 8 -#define MADERA_IN1_OSR_WIDTH 3 /* (0x0313) IN1L_Rate_Control */ #define MADERA_IN1L_RATE_MASK 0xF800 #define MADERA_IN1L_RATE_SHIFT 11 -#define MADERA_IN1L_RATE_WIDTH 5 /* (0x0314) IN1R_Control */ #define MADERA_IN1R_HPF_MASK 0x8000 #define MADERA_IN1R_HPF_SHIFT 15 -#define MADERA_IN1R_HPF_WIDTH 1 #define MADERA_IN1R_PGA_VOL_MASK 0x00FE #define MADERA_IN1R_PGA_VOL_SHIFT 1 -#define MADERA_IN1R_PGA_VOL_WIDTH 7 #define MADERA_IN1_DMICCLK_SRC_MASK 0x1800 #define MADERA_IN1_DMICCLK_SRC_SHIFT 11 -#define MADERA_IN1_DMICCLK_SRC_WIDTH 2 /* (0x0315) ADC_Digital_Volume_1R */ #define MADERA_IN1R_SRC_MASK 0x4000 #define MADERA_IN1R_SRC_SHIFT 14 -#define MADERA_IN1R_SRC_WIDTH 1 #define MADERA_IN1R_SRC_SE_MASK 0x2000 #define MADERA_IN1R_SRC_SE_SHIFT 13 -#define MADERA_IN1R_SRC_SE_WIDTH 1 #define MADERA_IN1R_LP_MODE 0x0800 #define MADERA_IN1R_LP_MODE_MASK 0x0800 #define MADERA_IN1R_LP_MODE_SHIFT 11 -#define MADERA_IN1R_LP_MODE_WIDTH 1 #define MADERA_IN1R_MUTE 0x0100 #define MADERA_IN1R_MUTE_MASK 0x0100 #define MADERA_IN1R_MUTE_SHIFT 8 -#define MADERA_IN1R_MUTE_WIDTH 1 #define MADERA_IN1R_DIG_VOL_MASK 0x00FF #define MADERA_IN1R_DIG_VOL_SHIFT 0 -#define MADERA_IN1R_DIG_VOL_WIDTH 8 /* (0x0317) IN1R_Rate_Control */ #define MADERA_IN1R_RATE_MASK 0xF800 #define MADERA_IN1R_RATE_SHIFT 11 -#define MADERA_IN1R_RATE_WIDTH 5 /* (0x0318) IN2L_Control */ #define MADERA_IN2L_HPF_MASK 0x8000 #define MADERA_IN2L_HPF_SHIFT 15 -#define MADERA_IN2L_HPF_WIDTH 1 #define MADERA_IN2_DMIC_SUP_MASK 0x1800 #define MADERA_IN2_DMIC_SUP_SHIFT 11 -#define MADERA_IN2_DMIC_SUP_WIDTH 2 #define MADERA_IN2_MODE_MASK 0x0400 #define MADERA_IN2_MODE_SHIFT 10 -#define MADERA_IN2_MODE_WIDTH 1 #define MADERA_IN2L_PGA_VOL_MASK 0x00FE #define MADERA_IN2L_PGA_VOL_SHIFT 1 -#define MADERA_IN2L_PGA_VOL_WIDTH 7 /* (0x0319) ADC_Digital_Volume_2L */ #define MADERA_IN2L_SRC_MASK 0x4000 #define MADERA_IN2L_SRC_SHIFT 14 -#define MADERA_IN2L_SRC_WIDTH 1 #define MADERA_IN2L_SRC_SE_MASK 0x2000 #define MADERA_IN2L_SRC_SE_SHIFT 13 -#define MADERA_IN2L_SRC_SE_WIDTH 1 #define MADERA_IN2L_LP_MODE 0x0800 #define MADERA_IN2L_LP_MODE_MASK 0x0800 #define MADERA_IN2L_LP_MODE_SHIFT 11 -#define MADERA_IN2L_LP_MODE_WIDTH 1 #define MADERA_IN2L_MUTE 0x0100 #define MADERA_IN2L_MUTE_MASK 0x0100 #define MADERA_IN2L_MUTE_SHIFT 8 -#define MADERA_IN2L_MUTE_WIDTH 1 #define MADERA_IN2L_DIG_VOL_MASK 0x00FF #define MADERA_IN2L_DIG_VOL_SHIFT 0 -#define MADERA_IN2L_DIG_VOL_WIDTH 8 /* (0x031A) DMIC2L_Control */ #define MADERA_IN2_OSR_MASK 0x0700 #define MADERA_IN2_OSR_SHIFT 8 -#define MADERA_IN2_OSR_WIDTH 3 /* (0x031C) IN2R_Control */ #define MADERA_IN2R_HPF_MASK 0x8000 #define MADERA_IN2R_HPF_SHIFT 15 -#define MADERA_IN2R_HPF_WIDTH 1 #define MADERA_IN2R_PGA_VOL_MASK 0x00FE #define MADERA_IN2R_PGA_VOL_SHIFT 1 -#define MADERA_IN2R_PGA_VOL_WIDTH 7 
#define MADERA_IN2_DMICCLK_SRC_MASK 0x1800 #define MADERA_IN2_DMICCLK_SRC_SHIFT 11 -#define MADERA_IN2_DMICCLK_SRC_WIDTH 2 /* (0x031D) ADC_Digital_Volume_2R */ #define MADERA_IN2R_SRC_MASK 0x4000 #define MADERA_IN2R_SRC_SHIFT 14 -#define MADERA_IN2R_SRC_WIDTH 1 #define MADERA_IN2R_SRC_SE_MASK 0x2000 #define MADERA_IN2R_SRC_SE_SHIFT 13 -#define MADERA_IN2R_SRC_SE_WIDTH 1 #define MADERA_IN2R_LP_MODE 0x0800 #define MADERA_IN2R_LP_MODE_MASK 0x0800 #define MADERA_IN2R_LP_MODE_SHIFT 11 -#define MADERA_IN2R_LP_MODE_WIDTH 1 #define MADERA_IN2R_MUTE 0x0100 #define MADERA_IN2R_MUTE_MASK 0x0100 #define MADERA_IN2R_MUTE_SHIFT 8 -#define MADERA_IN2R_MUTE_WIDTH 1 #define MADERA_IN2R_DIG_VOL_MASK 0x00FF #define MADERA_IN2R_DIG_VOL_SHIFT 0 -#define MADERA_IN2R_DIG_VOL_WIDTH 8 /* (0x0320) IN3L_Control */ #define MADERA_IN3L_HPF_MASK 0x8000 #define MADERA_IN3L_HPF_SHIFT 15 -#define MADERA_IN3L_HPF_WIDTH 1 #define MADERA_IN3_DMIC_SUP_MASK 0x1800 #define MADERA_IN3_DMIC_SUP_SHIFT 11 -#define MADERA_IN3_DMIC_SUP_WIDTH 2 #define MADERA_IN3_MODE_MASK 0x0400 #define MADERA_IN3_MODE_SHIFT 10 -#define MADERA_IN3_MODE_WIDTH 1 #define MADERA_IN3L_PGA_VOL_MASK 0x00FE #define MADERA_IN3L_PGA_VOL_SHIFT 1 -#define MADERA_IN3L_PGA_VOL_WIDTH 7 /* (0x0321) ADC_Digital_Volume_3L */ #define MADERA_IN3L_MUTE 0x0100 #define MADERA_IN3L_MUTE_MASK 0x0100 #define MADERA_IN3L_MUTE_SHIFT 8 -#define MADERA_IN3L_MUTE_WIDTH 1 #define MADERA_IN3L_DIG_VOL_MASK 0x00FF #define MADERA_IN3L_DIG_VOL_SHIFT 0 -#define MADERA_IN3L_DIG_VOL_WIDTH 8 /* (0x0322) DMIC3L_Control */ #define MADERA_IN3_OSR_MASK 0x0700 #define MADERA_IN3_OSR_SHIFT 8 -#define MADERA_IN3_OSR_WIDTH 3 /* (0x0324) IN3R_Control */ #define MADERA_IN3R_HPF_MASK 0x8000 #define MADERA_IN3R_HPF_SHIFT 15 -#define MADERA_IN3R_HPF_WIDTH 1 #define MADERA_IN3R_PGA_VOL_MASK 0x00FE #define MADERA_IN3R_PGA_VOL_SHIFT 1 -#define MADERA_IN3R_PGA_VOL_WIDTH 7 #define MADERA_IN3_DMICCLK_SRC_MASK 0x1800 #define MADERA_IN3_DMICCLK_SRC_SHIFT 11 -#define MADERA_IN3_DMICCLK_SRC_WIDTH 2 /* (0x0325) ADC_Digital_Volume_3R */ #define MADERA_IN3R_MUTE 0x0100 #define MADERA_IN3R_MUTE_MASK 0x0100 #define MADERA_IN3R_MUTE_SHIFT 8 -#define MADERA_IN3R_MUTE_WIDTH 1 #define MADERA_IN3R_DIG_VOL_MASK 0x00FF #define MADERA_IN3R_DIG_VOL_SHIFT 0 -#define MADERA_IN3R_DIG_VOL_WIDTH 8 /* (0x0328) IN4L_Control */ #define MADERA_IN4L_HPF_MASK 0x8000 #define MADERA_IN4L_HPF_SHIFT 15 -#define MADERA_IN4L_HPF_WIDTH 1 #define MADERA_IN4_DMIC_SUP_MASK 0x1800 #define MADERA_IN4_DMIC_SUP_SHIFT 11 -#define MADERA_IN4_DMIC_SUP_WIDTH 2 /* (0x0329) ADC_Digital_Volume_4L */ #define MADERA_IN4L_MUTE 0x0100 #define MADERA_IN4L_MUTE_MASK 0x0100 #define MADERA_IN4L_MUTE_SHIFT 8 -#define MADERA_IN4L_MUTE_WIDTH 1 #define MADERA_IN4L_DIG_VOL_MASK 0x00FF #define MADERA_IN4L_DIG_VOL_SHIFT 0 -#define MADERA_IN4L_DIG_VOL_WIDTH 8 /* (0x032A) DMIC4L_Control */ #define MADERA_IN4_OSR_MASK 0x0700 #define MADERA_IN4_OSR_SHIFT 8 -#define MADERA_IN4_OSR_WIDTH 3 /* (0x032C) IN4R_Control */ #define MADERA_IN4R_HPF_MASK 0x8000 #define MADERA_IN4R_HPF_SHIFT 15 -#define MADERA_IN4R_HPF_WIDTH 1 #define MADERA_IN4_DMICCLK_SRC_MASK 0x1800 #define MADERA_IN4_DMICCLK_SRC_SHIFT 11 -#define MADERA_IN4_DMICCLK_SRC_WIDTH 2 /* (0x032D) ADC_Digital_Volume_4R */ #define MADERA_IN4R_MUTE 0x0100 #define MADERA_IN4R_MUTE_MASK 0x0100 #define MADERA_IN4R_MUTE_SHIFT 8 -#define MADERA_IN4R_MUTE_WIDTH 1 #define MADERA_IN4R_DIG_VOL_MASK 0x00FF #define MADERA_IN4R_DIG_VOL_SHIFT 0 -#define MADERA_IN4R_DIG_VOL_WIDTH 8 /* (0x0330) IN5L_Control */ #define MADERA_IN5L_HPF_MASK 
0x8000 #define MADERA_IN5L_HPF_SHIFT 15 -#define MADERA_IN5L_HPF_WIDTH 1 #define MADERA_IN5_DMIC_SUP_MASK 0x1800 #define MADERA_IN5_DMIC_SUP_SHIFT 11 -#define MADERA_IN5_DMIC_SUP_WIDTH 2 /* (0x0331) ADC_Digital_Volume_5L */ #define MADERA_IN5L_MUTE 0x0100 #define MADERA_IN5L_MUTE_MASK 0x0100 #define MADERA_IN5L_MUTE_SHIFT 8 -#define MADERA_IN5L_MUTE_WIDTH 1 #define MADERA_IN5L_DIG_VOL_MASK 0x00FF #define MADERA_IN5L_DIG_VOL_SHIFT 0 -#define MADERA_IN5L_DIG_VOL_WIDTH 8 /* (0x0332) DMIC5L_Control */ #define MADERA_IN5_OSR_MASK 0x0700 #define MADERA_IN5_OSR_SHIFT 8 -#define MADERA_IN5_OSR_WIDTH 3 /* (0x0334) IN5R_Control */ #define MADERA_IN5R_HPF_MASK 0x8000 #define MADERA_IN5R_HPF_SHIFT 15 -#define MADERA_IN5R_HPF_WIDTH 1 #define MADERA_IN5_DMICCLK_SRC_MASK 0x1800 #define MADERA_IN5_DMICCLK_SRC_SHIFT 11 -#define MADERA_IN5_DMICCLK_SRC_WIDTH 2 /* (0x0335) ADC_Digital_Volume_5R */ #define MADERA_IN5R_MUTE 0x0100 #define MADERA_IN5R_MUTE_MASK 0x0100 #define MADERA_IN5R_MUTE_SHIFT 8 -#define MADERA_IN5R_MUTE_WIDTH 1 #define MADERA_IN5R_DIG_VOL_MASK 0x00FF #define MADERA_IN5R_DIG_VOL_SHIFT 0 -#define MADERA_IN5R_DIG_VOL_WIDTH 8 /* (0x0338) IN6L_Control */ #define MADERA_IN6L_HPF_MASK 0x8000 #define MADERA_IN6L_HPF_SHIFT 15 -#define MADERA_IN6L_HPF_WIDTH 1 #define MADERA_IN6_DMIC_SUP_MASK 0x1800 #define MADERA_IN6_DMIC_SUP_SHIFT 11 -#define MADERA_IN6_DMIC_SUP_WIDTH 2 /* (0x0339) ADC_Digital_Volume_6L */ #define MADERA_IN6L_MUTE 0x0100 #define MADERA_IN6L_MUTE_MASK 0x0100 #define MADERA_IN6L_MUTE_SHIFT 8 -#define MADERA_IN6L_MUTE_WIDTH 1 #define MADERA_IN6L_DIG_VOL_MASK 0x00FF #define MADERA_IN6L_DIG_VOL_SHIFT 0 -#define MADERA_IN6L_DIG_VOL_WIDTH 8 /* (0x033A) DMIC6L_Control */ #define MADERA_IN6_OSR_MASK 0x0700 #define MADERA_IN6_OSR_SHIFT 8 -#define MADERA_IN6_OSR_WIDTH 3 /* (0x033C) IN6R_Control */ #define MADERA_IN6R_HPF_MASK 0x8000 #define MADERA_IN6R_HPF_SHIFT 15 -#define MADERA_IN6R_HPF_WIDTH 1 /* (0x033D) ADC_Digital_Volume_6R */ #define MADERA_IN6R_MUTE 0x0100 #define MADERA_IN6R_MUTE_MASK 0x0100 #define MADERA_IN6R_MUTE_SHIFT 8 -#define MADERA_IN6R_MUTE_WIDTH 1 #define MADERA_IN6R_DIG_VOL_MASK 0x00FF #define MADERA_IN6R_DIG_VOL_SHIFT 0 -#define MADERA_IN6R_DIG_VOL_WIDTH 8 /* (0x033E) DMIC6R_Control */ #define MADERA_IN6_DMICCLK_SRC_MASK 0x1800 #define MADERA_IN6_DMICCLK_SRC_SHIFT 11 -#define MADERA_IN6_DMICCLK_SRC_WIDTH 2 /* (0x0400) Output_Enables_1 */ #define MADERA_EP_SEL 0x8000 #define MADERA_EP_SEL_MASK 0x8000 #define MADERA_EP_SEL_SHIFT 15 -#define MADERA_EP_SEL_WIDTH 1 #define MADERA_OUT6L_ENA 0x0800 #define MADERA_OUT6L_ENA_MASK 0x0800 #define MADERA_OUT6L_ENA_SHIFT 11 -#define MADERA_OUT6L_ENA_WIDTH 1 #define MADERA_OUT6R_ENA 0x0400 #define MADERA_OUT6R_ENA_MASK 0x0400 #define MADERA_OUT6R_ENA_SHIFT 10 -#define MADERA_OUT6R_ENA_WIDTH 1 #define MADERA_OUT5L_ENA 0x0200 #define MADERA_OUT5L_ENA_MASK 0x0200 #define MADERA_OUT5L_ENA_SHIFT 9 -#define MADERA_OUT5L_ENA_WIDTH 1 #define MADERA_OUT5R_ENA 0x0100 #define MADERA_OUT5R_ENA_MASK 0x0100 #define MADERA_OUT5R_ENA_SHIFT 8 -#define MADERA_OUT5R_ENA_WIDTH 1 #define MADERA_OUT4L_ENA 0x0080 #define MADERA_OUT4L_ENA_MASK 0x0080 #define MADERA_OUT4L_ENA_SHIFT 7 -#define MADERA_OUT4L_ENA_WIDTH 1 #define MADERA_OUT4R_ENA 0x0040 #define MADERA_OUT4R_ENA_MASK 0x0040 #define MADERA_OUT4R_ENA_SHIFT 6 -#define MADERA_OUT4R_ENA_WIDTH 1 #define MADERA_OUT3L_ENA 0x0020 #define MADERA_OUT3L_ENA_MASK 0x0020 #define MADERA_OUT3L_ENA_SHIFT 5 -#define MADERA_OUT3L_ENA_WIDTH 1 #define MADERA_OUT3R_ENA 0x0010 #define MADERA_OUT3R_ENA_MASK 0x0010 #define 
MADERA_OUT3R_ENA_SHIFT 4 -#define MADERA_OUT3R_ENA_WIDTH 1 #define MADERA_OUT2L_ENA 0x0008 #define MADERA_OUT2L_ENA_MASK 0x0008 #define MADERA_OUT2L_ENA_SHIFT 3 -#define MADERA_OUT2L_ENA_WIDTH 1 #define MADERA_OUT2R_ENA 0x0004 #define MADERA_OUT2R_ENA_MASK 0x0004 #define MADERA_OUT2R_ENA_SHIFT 2 -#define MADERA_OUT2R_ENA_WIDTH 1 #define MADERA_OUT1L_ENA 0x0002 #define MADERA_OUT1L_ENA_MASK 0x0002 #define MADERA_OUT1L_ENA_SHIFT 1 -#define MADERA_OUT1L_ENA_WIDTH 1 #define MADERA_OUT1R_ENA 0x0001 #define MADERA_OUT1R_ENA_MASK 0x0001 #define MADERA_OUT1R_ENA_SHIFT 0 -#define MADERA_OUT1R_ENA_WIDTH 1 /* (0x0408) Output_Rate_1 */ #define MADERA_CP_DAC_MODE_MASK 0x0040 #define MADERA_CP_DAC_MODE_SHIFT 6 -#define MADERA_CP_DAC_MODE_WIDTH 1 #define MADERA_OUT_EXT_CLK_DIV_MASK 0x0030 #define MADERA_OUT_EXT_CLK_DIV_SHIFT 4 -#define MADERA_OUT_EXT_CLK_DIV_WIDTH 2 #define MADERA_OUT_CLK_SRC_MASK 0x0007 #define MADERA_OUT_CLK_SRC_SHIFT 0 -#define MADERA_OUT_CLK_SRC_WIDTH 3 /* (0x0409) Output_Volume_Ramp */ #define MADERA_OUT_VD_RAMP_MASK 0x0070 #define MADERA_OUT_VD_RAMP_SHIFT 4 -#define MADERA_OUT_VD_RAMP_WIDTH 3 #define MADERA_OUT_VI_RAMP_MASK 0x0007 #define MADERA_OUT_VI_RAMP_SHIFT 0 -#define MADERA_OUT_VI_RAMP_WIDTH 3 /* (0x0410) Output_Path_Config_1L */ #define MADERA_OUT1_MONO 0x1000 #define MADERA_OUT1_MONO_MASK 0x1000 #define MADERA_OUT1_MONO_SHIFT 12 -#define MADERA_OUT1_MONO_WIDTH 1 #define MADERA_OUT1L_ANC_SRC_MASK 0x0C00 #define MADERA_OUT1L_ANC_SRC_SHIFT 10 -#define MADERA_OUT1L_ANC_SRC_WIDTH 2 /* (0x0411) DAC_Digital_Volume_1L */ #define MADERA_OUT1L_VU 0x0200 #define MADERA_OUT1L_VU_MASK 0x0200 #define MADERA_OUT1L_VU_SHIFT 9 -#define MADERA_OUT1L_VU_WIDTH 1 #define MADERA_OUT1L_MUTE 0x0100 #define MADERA_OUT1L_MUTE_MASK 0x0100 #define MADERA_OUT1L_MUTE_SHIFT 8 -#define MADERA_OUT1L_MUTE_WIDTH 1 #define MADERA_OUT1L_VOL_MASK 0x00FF #define MADERA_OUT1L_VOL_SHIFT 0 -#define MADERA_OUT1L_VOL_WIDTH 8 /* (0x0412) Output_Path_Config_1 */ #define MADERA_HP1_GND_SEL_MASK 0x0007 #define MADERA_HP1_GND_SEL_SHIFT 0 -#define MADERA_HP1_GND_SEL_WIDTH 3 /* (0x0414) Output_Path_Config_1R */ #define MADERA_OUT1R_ANC_SRC_MASK 0x0C00 #define MADERA_OUT1R_ANC_SRC_SHIFT 10 -#define MADERA_OUT1R_ANC_SRC_WIDTH 2 /* (0x0415) DAC_Digital_Volume_1R */ #define MADERA_OUT1R_MUTE 0x0100 #define MADERA_OUT1R_MUTE_MASK 0x0100 #define MADERA_OUT1R_MUTE_SHIFT 8 -#define MADERA_OUT1R_MUTE_WIDTH 1 #define MADERA_OUT1R_VOL_MASK 0x00FF #define MADERA_OUT1R_VOL_SHIFT 0 -#define MADERA_OUT1R_VOL_WIDTH 8 /* (0x0418) Output_Path_Config_2L */ #define MADERA_OUT2L_ANC_SRC_MASK 0x0C00 #define MADERA_OUT2L_ANC_SRC_SHIFT 10 -#define MADERA_OUT2L_ANC_SRC_WIDTH 2 /* (0x0419) DAC_Digital_Volume_2L */ #define MADERA_OUT2L_MUTE 0x0100 #define MADERA_OUT2L_MUTE_MASK 0x0100 #define MADERA_OUT2L_MUTE_SHIFT 8 -#define MADERA_OUT2L_MUTE_WIDTH 1 #define MADERA_OUT2L_VOL_MASK 0x00FF #define MADERA_OUT2L_VOL_SHIFT 0 -#define MADERA_OUT2L_VOL_WIDTH 8 /* (0x041A) Output_Path_Config_2 */ #define MADERA_HP2_GND_SEL_MASK 0x0007 #define MADERA_HP2_GND_SEL_SHIFT 0 -#define MADERA_HP2_GND_SEL_WIDTH 3 /* (0x041C) Output_Path_Config_2R */ #define MADERA_OUT2R_ANC_SRC_MASK 0x0C00 #define MADERA_OUT2R_ANC_SRC_SHIFT 10 -#define MADERA_OUT2R_ANC_SRC_WIDTH 2 /* (0x041D) DAC_Digital_Volume_2R */ #define MADERA_OUT2R_MUTE 0x0100 #define MADERA_OUT2R_MUTE_MASK 0x0100 #define MADERA_OUT2R_MUTE_SHIFT 8 -#define MADERA_OUT2R_MUTE_WIDTH 1 #define MADERA_OUT2R_VOL_MASK 0x00FF #define MADERA_OUT2R_VOL_SHIFT 0 -#define MADERA_OUT2R_VOL_WIDTH 8 /* (0x0420) 
Output_Path_Config_3L */ #define MADERA_OUT3L_ANC_SRC_MASK 0x0C00 #define MADERA_OUT3L_ANC_SRC_SHIFT 10 -#define MADERA_OUT3L_ANC_SRC_WIDTH 2 /* (0x0421) DAC_Digital_Volume_3L */ #define MADERA_OUT3L_MUTE 0x0100 #define MADERA_OUT3L_MUTE_MASK 0x0100 #define MADERA_OUT3L_MUTE_SHIFT 8 -#define MADERA_OUT3L_MUTE_WIDTH 1 #define MADERA_OUT3L_VOL_MASK 0x00FF #define MADERA_OUT3L_VOL_SHIFT 0 -#define MADERA_OUT3L_VOL_WIDTH 8 /* (0x0424) Output_Path_Config_3R */ #define MADERA_OUT3R_ANC_SRC_MASK 0x0C00 #define MADERA_OUT3R_ANC_SRC_SHIFT 10 -#define MADERA_OUT3R_ANC_SRC_WIDTH 2 /* (0x0425) DAC_Digital_Volume_3R */ #define MADERA_OUT3R_MUTE 0x0100 #define MADERA_OUT3R_MUTE_MASK 0x0100 #define MADERA_OUT3R_MUTE_SHIFT 8 -#define MADERA_OUT3R_MUTE_WIDTH 1 #define MADERA_OUT3R_VOL_MASK 0x00FF #define MADERA_OUT3R_VOL_SHIFT 0 -#define MADERA_OUT3R_VOL_WIDTH 8 /* (0x0428) Output_Path_Config_4L */ #define MADERA_OUT4L_ANC_SRC_MASK 0x0C00 #define MADERA_OUT4L_ANC_SRC_SHIFT 10 -#define MADERA_OUT4L_ANC_SRC_WIDTH 2 /* (0x0429) DAC_Digital_Volume_4L */ #define MADERA_OUT4L_MUTE 0x0100 #define MADERA_OUT4L_MUTE_MASK 0x0100 #define MADERA_OUT4L_MUTE_SHIFT 8 -#define MADERA_OUT4L_MUTE_WIDTH 1 #define MADERA_OUT4L_VOL_MASK 0x00FF #define MADERA_OUT4L_VOL_SHIFT 0 -#define MADERA_OUT4L_VOL_WIDTH 8 /* (0x042C) Output_Path_Config_4R */ #define MADERA_OUT4R_ANC_SRC_MASK 0x0C00 #define MADERA_OUT4R_ANC_SRC_SHIFT 10 -#define MADERA_OUT4R_ANC_SRC_WIDTH 2 /* (0x042D) DAC_Digital_Volume_4R */ #define MADERA_OUT4R_MUTE 0x0100 #define MADERA_OUT4R_MUTE_MASK 0x0100 #define MADERA_OUT4R_MUTE_SHIFT 8 -#define MADERA_OUT4R_MUTE_WIDTH 1 #define MADERA_OUT4R_VOL_MASK 0x00FF #define MADERA_OUT4R_VOL_SHIFT 0 -#define MADERA_OUT4R_VOL_WIDTH 8 /* (0x0430) Output_Path_Config_5L */ #define MADERA_OUT5_OSR 0x2000 #define MADERA_OUT5_OSR_MASK 0x2000 #define MADERA_OUT5_OSR_SHIFT 13 -#define MADERA_OUT5_OSR_WIDTH 1 #define MADERA_OUT5L_ANC_SRC_MASK 0x0C00 #define MADERA_OUT5L_ANC_SRC_SHIFT 10 -#define MADERA_OUT5L_ANC_SRC_WIDTH 2 /* (0x0431) DAC_Digital_Volume_5L */ #define MADERA_OUT5L_MUTE 0x0100 #define MADERA_OUT5L_MUTE_MASK 0x0100 #define MADERA_OUT5L_MUTE_SHIFT 8 -#define MADERA_OUT5L_MUTE_WIDTH 1 #define MADERA_OUT5L_VOL_MASK 0x00FF #define MADERA_OUT5L_VOL_SHIFT 0 -#define MADERA_OUT5L_VOL_WIDTH 8 /* (0x0434) Output_Path_Config_5R */ #define MADERA_OUT5R_ANC_SRC_MASK 0x0C00 #define MADERA_OUT5R_ANC_SRC_SHIFT 10 -#define MADERA_OUT5R_ANC_SRC_WIDTH 2 /* (0x0435) DAC_Digital_Volume_5R */ #define MADERA_OUT5R_MUTE 0x0100 #define MADERA_OUT5R_MUTE_MASK 0x0100 #define MADERA_OUT5R_MUTE_SHIFT 8 -#define MADERA_OUT5R_MUTE_WIDTH 1 #define MADERA_OUT5R_VOL_MASK 0x00FF #define MADERA_OUT5R_VOL_SHIFT 0 -#define MADERA_OUT5R_VOL_WIDTH 8 /* (0x0438) Output_Path_Config_6L */ #define MADERA_OUT6_OSR 0x2000 #define MADERA_OUT6_OSR_MASK 0x2000 #define MADERA_OUT6_OSR_SHIFT 13 -#define MADERA_OUT6_OSR_WIDTH 1 #define MADERA_OUT6L_ANC_SRC_MASK 0x0C00 #define MADERA_OUT6L_ANC_SRC_SHIFT 10 -#define MADERA_OUT6L_ANC_SRC_WIDTH 2 /* (0x0439) DAC_Digital_Volume_6L */ #define MADERA_OUT6L_MUTE 0x0100 #define MADERA_OUT6L_MUTE_MASK 0x0100 #define MADERA_OUT6L_MUTE_SHIFT 8 -#define MADERA_OUT6L_MUTE_WIDTH 1 #define MADERA_OUT6L_VOL_MASK 0x00FF #define MADERA_OUT6L_VOL_SHIFT 0 -#define MADERA_OUT6L_VOL_WIDTH 8 /* (0x043C) Output_Path_Config_6R */ #define MADERA_OUT6R_ANC_SRC_MASK 0x0C00 #define MADERA_OUT6R_ANC_SRC_SHIFT 10 -#define MADERA_OUT6R_ANC_SRC_WIDTH 2 /* (0x043D) DAC_Digital_Volume_6R */ #define MADERA_OUT6R_MUTE 0x0100 #define MADERA_OUT6R_MUTE_MASK 
0x0100 #define MADERA_OUT6R_MUTE_SHIFT 8 -#define MADERA_OUT6R_MUTE_WIDTH 1 #define MADERA_OUT6R_VOL_MASK 0x00FF #define MADERA_OUT6R_VOL_SHIFT 0 -#define MADERA_OUT6R_VOL_WIDTH 8 /* (0x0450) - DAC AEC Control 1 */ #define MADERA_AEC1_LOOPBACK_SRC_MASK 0x003C #define MADERA_AEC1_LOOPBACK_SRC_SHIFT 2 -#define MADERA_AEC1_LOOPBACK_SRC_WIDTH 4 #define MADERA_AEC1_ENA_STS 0x0002 #define MADERA_AEC1_ENA_STS_MASK 0x0002 #define MADERA_AEC1_ENA_STS_SHIFT 1 -#define MADERA_AEC1_ENA_STS_WIDTH 1 #define MADERA_AEC1_LOOPBACK_ENA 0x0001 #define MADERA_AEC1_LOOPBACK_ENA_MASK 0x0001 #define MADERA_AEC1_LOOPBACK_ENA_SHIFT 0 -#define MADERA_AEC1_LOOPBACK_ENA_WIDTH 1 /* (0x0451) DAC_AEC_Control_2 */ #define MADERA_AEC2_LOOPBACK_SRC_MASK 0x003C #define MADERA_AEC2_LOOPBACK_SRC_SHIFT 2 -#define MADERA_AEC2_LOOPBACK_SRC_WIDTH 4 #define MADERA_AEC2_ENA_STS 0x0002 #define MADERA_AEC2_ENA_STS_MASK 0x0002 #define MADERA_AEC2_ENA_STS_SHIFT 1 -#define MADERA_AEC2_ENA_STS_WIDTH 1 #define MADERA_AEC2_LOOPBACK_ENA 0x0001 #define MADERA_AEC2_LOOPBACK_ENA_MASK 0x0001 #define MADERA_AEC2_LOOPBACK_ENA_SHIFT 0 -#define MADERA_AEC2_LOOPBACK_ENA_WIDTH 1 /* (0x0458) Noise_Gate_Control */ #define MADERA_NGATE_HOLD_MASK 0x0030 #define MADERA_NGATE_HOLD_SHIFT 4 -#define MADERA_NGATE_HOLD_WIDTH 2 #define MADERA_NGATE_THR_MASK 0x000E #define MADERA_NGATE_THR_SHIFT 1 -#define MADERA_NGATE_THR_WIDTH 3 #define MADERA_NGATE_ENA 0x0001 #define MADERA_NGATE_ENA_MASK 0x0001 #define MADERA_NGATE_ENA_SHIFT 0 -#define MADERA_NGATE_ENA_WIDTH 1 /* (0x0490) PDM_SPK1_CTRL_1 */ #define MADERA_SPK1R_MUTE 0x2000 #define MADERA_SPK1R_MUTE_MASK 0x2000 #define MADERA_SPK1R_MUTE_SHIFT 13 -#define MADERA_SPK1R_MUTE_WIDTH 1 #define MADERA_SPK1L_MUTE 0x1000 #define MADERA_SPK1L_MUTE_MASK 0x1000 #define MADERA_SPK1L_MUTE_SHIFT 12 -#define MADERA_SPK1L_MUTE_WIDTH 1 #define MADERA_SPK1_MUTE_ENDIAN 0x0100 #define MADERA_SPK1_MUTE_ENDIAN_MASK 0x0100 #define MADERA_SPK1_MUTE_ENDIAN_SHIFT 8 -#define MADERA_SPK1_MUTE_ENDIAN_WIDTH 1 #define MADERA_SPK1_MUTE_SEQ1_MASK 0x00FF #define MADERA_SPK1_MUTE_SEQ1_SHIFT 0 -#define MADERA_SPK1_MUTE_SEQ1_WIDTH 8 /* (0x0491) PDM_SPK1_CTRL_2 */ #define MADERA_SPK1_FMT 0x0001 #define MADERA_SPK1_FMT_MASK 0x0001 #define MADERA_SPK1_FMT_SHIFT 0 -#define MADERA_SPK1_FMT_WIDTH 1 /* (0x0492) PDM_SPK2_CTRL_1 */ #define MADERA_SPK2R_MUTE 0x2000 #define MADERA_SPK2R_MUTE_MASK 0x2000 #define MADERA_SPK2R_MUTE_SHIFT 13 -#define MADERA_SPK2R_MUTE_WIDTH 1 #define MADERA_SPK2L_MUTE 0x1000 #define MADERA_SPK2L_MUTE_MASK 0x1000 #define MADERA_SPK2L_MUTE_SHIFT 12 -#define MADERA_SPK2L_MUTE_WIDTH 1 /* (0x04A0) - HP1 Short Circuit Ctrl */ #define MADERA_HP1_SC_ENA 0x1000 #define MADERA_HP1_SC_ENA_MASK 0x1000 #define MADERA_HP1_SC_ENA_SHIFT 12 -#define MADERA_HP1_SC_ENA_WIDTH 1 /* (0x04A1) - HP2 Short Circuit Ctrl */ #define MADERA_HP2_SC_ENA 0x1000 #define MADERA_HP2_SC_ENA_MASK 0x1000 #define MADERA_HP2_SC_ENA_SHIFT 12 -#define MADERA_HP2_SC_ENA_WIDTH 1 /* (0x04A2) - HP3 Short Circuit Ctrl */ #define MADERA_HP3_SC_ENA 0x1000 #define MADERA_HP3_SC_ENA_MASK 0x1000 #define MADERA_HP3_SC_ENA_SHIFT 12 -#define MADERA_HP3_SC_ENA_WIDTH 1 /* (0x04A8) - HP_Test_Ctrl_5 */ #define MADERA_HP1L_ONEFLT 0x0100 #define MADERA_HP1L_ONEFLT_MASK 0x0100 #define MADERA_HP1L_ONEFLT_SHIFT 8 -#define MADERA_HP1L_ONEFLT_WIDTH 1 /* (0x04A9) - HP_Test_Ctrl_6 */ #define MADERA_HP1R_ONEFLT 0x0100 #define MADERA_HP1R_ONEFLT_MASK 0x0100 #define MADERA_HP1R_ONEFLT_SHIFT 8 -#define MADERA_HP1R_ONEFLT_WIDTH 1 /* (0x0500) AIF1_BCLK_Ctrl */ #define MADERA_AIF1_BCLK_INV 0x0080 
#define MADERA_AIF1_BCLK_INV_MASK 0x0080 #define MADERA_AIF1_BCLK_INV_SHIFT 7 -#define MADERA_AIF1_BCLK_INV_WIDTH 1 #define MADERA_AIF1_BCLK_MSTR 0x0020 #define MADERA_AIF1_BCLK_MSTR_MASK 0x0020 #define MADERA_AIF1_BCLK_MSTR_SHIFT 5 -#define MADERA_AIF1_BCLK_MSTR_WIDTH 1 #define MADERA_AIF1_BCLK_FREQ_MASK 0x001F #define MADERA_AIF1_BCLK_FREQ_SHIFT 0 -#define MADERA_AIF1_BCLK_FREQ_WIDTH 5 /* (0x0501) AIF1_Tx_Pin_Ctrl */ #define MADERA_AIF1TX_LRCLK_SRC 0x0008 #define MADERA_AIF1TX_LRCLK_SRC_MASK 0x0008 #define MADERA_AIF1TX_LRCLK_SRC_SHIFT 3 -#define MADERA_AIF1TX_LRCLK_SRC_WIDTH 1 #define MADERA_AIF1TX_LRCLK_INV 0x0004 #define MADERA_AIF1TX_LRCLK_INV_MASK 0x0004 #define MADERA_AIF1TX_LRCLK_INV_SHIFT 2 -#define MADERA_AIF1TX_LRCLK_INV_WIDTH 1 #define MADERA_AIF1TX_LRCLK_MSTR 0x0001 #define MADERA_AIF1TX_LRCLK_MSTR_MASK 0x0001 #define MADERA_AIF1TX_LRCLK_MSTR_SHIFT 0 -#define MADERA_AIF1TX_LRCLK_MSTR_WIDTH 1 /* (0x0502) AIF1_Rx_Pin_Ctrl */ #define MADERA_AIF1RX_LRCLK_INV 0x0004 #define MADERA_AIF1RX_LRCLK_INV_MASK 0x0004 #define MADERA_AIF1RX_LRCLK_INV_SHIFT 2 -#define MADERA_AIF1RX_LRCLK_INV_WIDTH 1 #define MADERA_AIF1RX_LRCLK_FRC 0x0002 #define MADERA_AIF1RX_LRCLK_FRC_MASK 0x0002 #define MADERA_AIF1RX_LRCLK_FRC_SHIFT 1 -#define MADERA_AIF1RX_LRCLK_FRC_WIDTH 1 #define MADERA_AIF1RX_LRCLK_MSTR 0x0001 #define MADERA_AIF1RX_LRCLK_MSTR_MASK 0x0001 #define MADERA_AIF1RX_LRCLK_MSTR_SHIFT 0 -#define MADERA_AIF1RX_LRCLK_MSTR_WIDTH 1 /* (0x0503) AIF1_Rate_Ctrl */ #define MADERA_AIF1_RATE_MASK 0xF800 #define MADERA_AIF1_RATE_SHIFT 11 -#define MADERA_AIF1_RATE_WIDTH 5 #define MADERA_AIF1_TRI 0x0040 #define MADERA_AIF1_TRI_MASK 0x0040 #define MADERA_AIF1_TRI_SHIFT 6 -#define MADERA_AIF1_TRI_WIDTH 1 /* (0x0504) AIF1_Format */ #define MADERA_AIF1_FMT_MASK 0x0007 #define MADERA_AIF1_FMT_SHIFT 0 -#define MADERA_AIF1_FMT_WIDTH 3 /* (0x0506) AIF1_Rx_BCLK_Rate */ #define MADERA_AIF1RX_BCPF_MASK 0x1FFF #define MADERA_AIF1RX_BCPF_SHIFT 0 -#define MADERA_AIF1RX_BCPF_WIDTH 13 /* (0x0507) AIF1_Frame_Ctrl_1 */ #define MADERA_AIF1TX_WL_MASK 0x3F00 #define MADERA_AIF1TX_WL_SHIFT 8 -#define MADERA_AIF1TX_WL_WIDTH 6 #define MADERA_AIF1TX_SLOT_LEN_MASK 0x00FF #define MADERA_AIF1TX_SLOT_LEN_SHIFT 0 -#define MADERA_AIF1TX_SLOT_LEN_WIDTH 8 /* (0x0508) AIF1_Frame_Ctrl_2 */ #define MADERA_AIF1RX_WL_MASK 0x3F00 #define MADERA_AIF1RX_WL_SHIFT 8 -#define MADERA_AIF1RX_WL_WIDTH 6 #define MADERA_AIF1RX_SLOT_LEN_MASK 0x00FF #define MADERA_AIF1RX_SLOT_LEN_SHIFT 0 -#define MADERA_AIF1RX_SLOT_LEN_WIDTH 8 /* (0x0509) AIF1_Frame_Ctrl_3 */ #define MADERA_AIF1TX1_SLOT_MASK 0x003F #define MADERA_AIF1TX1_SLOT_SHIFT 0 -#define MADERA_AIF1TX1_SLOT_WIDTH 6 /* (0x0519) AIF1_Tx_Enables */ #define MADERA_AIF1TX8_ENA 0x0080 #define MADERA_AIF1TX8_ENA_MASK 0x0080 #define MADERA_AIF1TX8_ENA_SHIFT 7 -#define MADERA_AIF1TX8_ENA_WIDTH 1 #define MADERA_AIF1TX7_ENA 0x0040 #define MADERA_AIF1TX7_ENA_MASK 0x0040 #define MADERA_AIF1TX7_ENA_SHIFT 6 -#define MADERA_AIF1TX7_ENA_WIDTH 1 #define MADERA_AIF1TX6_ENA 0x0020 #define MADERA_AIF1TX6_ENA_MASK 0x0020 #define MADERA_AIF1TX6_ENA_SHIFT 5 -#define MADERA_AIF1TX6_ENA_WIDTH 1 #define MADERA_AIF1TX5_ENA 0x0010 #define MADERA_AIF1TX5_ENA_MASK 0x0010 #define MADERA_AIF1TX5_ENA_SHIFT 4 -#define MADERA_AIF1TX5_ENA_WIDTH 1 #define MADERA_AIF1TX4_ENA 0x0008 #define MADERA_AIF1TX4_ENA_MASK 0x0008 #define MADERA_AIF1TX4_ENA_SHIFT 3 -#define MADERA_AIF1TX4_ENA_WIDTH 1 #define MADERA_AIF1TX3_ENA 0x0004 #define MADERA_AIF1TX3_ENA_MASK 0x0004 #define MADERA_AIF1TX3_ENA_SHIFT 2 -#define MADERA_AIF1TX3_ENA_WIDTH 1 #define 
MADERA_AIF1TX2_ENA 0x0002 #define MADERA_AIF1TX2_ENA_MASK 0x0002 #define MADERA_AIF1TX2_ENA_SHIFT 1 -#define MADERA_AIF1TX2_ENA_WIDTH 1 #define MADERA_AIF1TX1_ENA 0x0001 #define MADERA_AIF1TX1_ENA_MASK 0x0001 #define MADERA_AIF1TX1_ENA_SHIFT 0 -#define MADERA_AIF1TX1_ENA_WIDTH 1 /* (0x051A) AIF1_Rx_Enables */ #define MADERA_AIF1RX8_ENA 0x0080 #define MADERA_AIF1RX8_ENA_MASK 0x0080 #define MADERA_AIF1RX8_ENA_SHIFT 7 -#define MADERA_AIF1RX8_ENA_WIDTH 1 #define MADERA_AIF1RX7_ENA 0x0040 #define MADERA_AIF1RX7_ENA_MASK 0x0040 #define MADERA_AIF1RX7_ENA_SHIFT 6 -#define MADERA_AIF1RX7_ENA_WIDTH 1 #define MADERA_AIF1RX6_ENA 0x0020 #define MADERA_AIF1RX6_ENA_MASK 0x0020 #define MADERA_AIF1RX6_ENA_SHIFT 5 -#define MADERA_AIF1RX6_ENA_WIDTH 1 #define MADERA_AIF1RX5_ENA 0x0010 #define MADERA_AIF1RX5_ENA_MASK 0x0010 #define MADERA_AIF1RX5_ENA_SHIFT 4 -#define MADERA_AIF1RX5_ENA_WIDTH 1 #define MADERA_AIF1RX4_ENA 0x0008 #define MADERA_AIF1RX4_ENA_MASK 0x0008 #define MADERA_AIF1RX4_ENA_SHIFT 3 -#define MADERA_AIF1RX4_ENA_WIDTH 1 #define MADERA_AIF1RX3_ENA 0x0004 #define MADERA_AIF1RX3_ENA_MASK 0x0004 #define MADERA_AIF1RX3_ENA_SHIFT 2 -#define MADERA_AIF1RX3_ENA_WIDTH 1 #define MADERA_AIF1RX2_ENA 0x0002 #define MADERA_AIF1RX2_ENA_MASK 0x0002 #define MADERA_AIF1RX2_ENA_SHIFT 1 -#define MADERA_AIF1RX2_ENA_WIDTH 1 #define MADERA_AIF1RX1_ENA 0x0001 #define MADERA_AIF1RX1_ENA_MASK 0x0001 #define MADERA_AIF1RX1_ENA_SHIFT 0 -#define MADERA_AIF1RX1_ENA_WIDTH 1 /* (0x0559) AIF2_Tx_Enables */ #define MADERA_AIF2TX8_ENA 0x0080 #define MADERA_AIF2TX8_ENA_MASK 0x0080 #define MADERA_AIF2TX8_ENA_SHIFT 7 -#define MADERA_AIF2TX8_ENA_WIDTH 1 #define MADERA_AIF2TX7_ENA 0x0040 #define MADERA_AIF2TX7_ENA_MASK 0x0040 #define MADERA_AIF2TX7_ENA_SHIFT 6 -#define MADERA_AIF2TX7_ENA_WIDTH 1 #define MADERA_AIF2TX6_ENA 0x0020 #define MADERA_AIF2TX6_ENA_MASK 0x0020 #define MADERA_AIF2TX6_ENA_SHIFT 5 -#define MADERA_AIF2TX6_ENA_WIDTH 1 #define MADERA_AIF2TX5_ENA 0x0010 #define MADERA_AIF2TX5_ENA_MASK 0x0010 #define MADERA_AIF2TX5_ENA_SHIFT 4 -#define MADERA_AIF2TX5_ENA_WIDTH 1 #define MADERA_AIF2TX4_ENA 0x0008 #define MADERA_AIF2TX4_ENA_MASK 0x0008 #define MADERA_AIF2TX4_ENA_SHIFT 3 -#define MADERA_AIF2TX4_ENA_WIDTH 1 #define MADERA_AIF2TX3_ENA 0x0004 #define MADERA_AIF2TX3_ENA_MASK 0x0004 #define MADERA_AIF2TX3_ENA_SHIFT 2 -#define MADERA_AIF2TX3_ENA_WIDTH 1 #define MADERA_AIF2TX2_ENA 0x0002 #define MADERA_AIF2TX2_ENA_MASK 0x0002 #define MADERA_AIF2TX2_ENA_SHIFT 1 -#define MADERA_AIF2TX2_ENA_WIDTH 1 #define MADERA_AIF2TX1_ENA 0x0001 #define MADERA_AIF2TX1_ENA_MASK 0x0001 #define MADERA_AIF2TX1_ENA_SHIFT 0 -#define MADERA_AIF2TX1_ENA_WIDTH 1 /* (0x055A) AIF2_Rx_Enables */ #define MADERA_AIF2RX8_ENA 0x0080 #define MADERA_AIF2RX8_ENA_MASK 0x0080 #define MADERA_AIF2RX8_ENA_SHIFT 7 -#define MADERA_AIF2RX8_ENA_WIDTH 1 #define MADERA_AIF2RX7_ENA 0x0040 #define MADERA_AIF2RX7_ENA_MASK 0x0040 #define MADERA_AIF2RX7_ENA_SHIFT 6 -#define MADERA_AIF2RX7_ENA_WIDTH 1 #define MADERA_AIF2RX6_ENA 0x0020 #define MADERA_AIF2RX6_ENA_MASK 0x0020 #define MADERA_AIF2RX6_ENA_SHIFT 5 -#define MADERA_AIF2RX6_ENA_WIDTH 1 #define MADERA_AIF2RX5_ENA 0x0010 #define MADERA_AIF2RX5_ENA_MASK 0x0010 #define MADERA_AIF2RX5_ENA_SHIFT 4 -#define MADERA_AIF2RX5_ENA_WIDTH 1 #define MADERA_AIF2RX4_ENA 0x0008 #define MADERA_AIF2RX4_ENA_MASK 0x0008 #define MADERA_AIF2RX4_ENA_SHIFT 3 -#define MADERA_AIF2RX4_ENA_WIDTH 1 #define MADERA_AIF2RX3_ENA 0x0004 #define MADERA_AIF2RX3_ENA_MASK 0x0004 #define MADERA_AIF2RX3_ENA_SHIFT 2 -#define MADERA_AIF2RX3_ENA_WIDTH 1 #define 
MADERA_AIF2RX2_ENA 0x0002 #define MADERA_AIF2RX2_ENA_MASK 0x0002 #define MADERA_AIF2RX2_ENA_SHIFT 1 -#define MADERA_AIF2RX2_ENA_WIDTH 1 #define MADERA_AIF2RX1_ENA 0x0001 #define MADERA_AIF2RX1_ENA_MASK 0x0001 #define MADERA_AIF2RX1_ENA_SHIFT 0 -#define MADERA_AIF2RX1_ENA_WIDTH 1 /* (0x0599) AIF3_Tx_Enables */ #define MADERA_AIF3TX8_ENA 0x0080 #define MADERA_AIF3TX8_ENA_MASK 0x0080 #define MADERA_AIF3TX8_ENA_SHIFT 7 -#define MADERA_AIF3TX8_ENA_WIDTH 1 #define MADERA_AIF3TX7_ENA 0x0040 #define MADERA_AIF3TX7_ENA_MASK 0x0040 #define MADERA_AIF3TX7_ENA_SHIFT 6 -#define MADERA_AIF3TX7_ENA_WIDTH 1 #define MADERA_AIF3TX6_ENA 0x0020 #define MADERA_AIF3TX6_ENA_MASK 0x0020 #define MADERA_AIF3TX6_ENA_SHIFT 5 -#define MADERA_AIF3TX6_ENA_WIDTH 1 #define MADERA_AIF3TX5_ENA 0x0010 #define MADERA_AIF3TX5_ENA_MASK 0x0010 #define MADERA_AIF3TX5_ENA_SHIFT 4 -#define MADERA_AIF3TX5_ENA_WIDTH 1 #define MADERA_AIF3TX4_ENA 0x0008 #define MADERA_AIF3TX4_ENA_MASK 0x0008 #define MADERA_AIF3TX4_ENA_SHIFT 3 -#define MADERA_AIF3TX4_ENA_WIDTH 1 #define MADERA_AIF3TX3_ENA 0x0004 #define MADERA_AIF3TX3_ENA_MASK 0x0004 #define MADERA_AIF3TX3_ENA_SHIFT 2 -#define MADERA_AIF3TX3_ENA_WIDTH 1 #define MADERA_AIF3TX2_ENA 0x0002 #define MADERA_AIF3TX2_ENA_MASK 0x0002 #define MADERA_AIF3TX2_ENA_SHIFT 1 -#define MADERA_AIF3TX2_ENA_WIDTH 1 #define MADERA_AIF3TX1_ENA 0x0001 #define MADERA_AIF3TX1_ENA_MASK 0x0001 #define MADERA_AIF3TX1_ENA_SHIFT 0 -#define MADERA_AIF3TX1_ENA_WIDTH 1 /* (0x059A) AIF3_Rx_Enables */ #define MADERA_AIF3RX8_ENA 0x0080 #define MADERA_AIF3RX8_ENA_MASK 0x0080 #define MADERA_AIF3RX8_ENA_SHIFT 7 -#define MADERA_AIF3RX8_ENA_WIDTH 1 #define MADERA_AIF3RX7_ENA 0x0040 #define MADERA_AIF3RX7_ENA_MASK 0x0040 #define MADERA_AIF3RX7_ENA_SHIFT 6 -#define MADERA_AIF3RX7_ENA_WIDTH 1 #define MADERA_AIF3RX6_ENA 0x0020 #define MADERA_AIF3RX6_ENA_MASK 0x0020 #define MADERA_AIF3RX6_ENA_SHIFT 5 -#define MADERA_AIF3RX6_ENA_WIDTH 1 #define MADERA_AIF3RX5_ENA 0x0010 #define MADERA_AIF3RX5_ENA_MASK 0x0010 #define MADERA_AIF3RX5_ENA_SHIFT 4 -#define MADERA_AIF3RX5_ENA_WIDTH 1 #define MADERA_AIF3RX4_ENA 0x0008 #define MADERA_AIF3RX4_ENA_MASK 0x0008 #define MADERA_AIF3RX4_ENA_SHIFT 3 -#define MADERA_AIF3RX4_ENA_WIDTH 1 #define MADERA_AIF3RX3_ENA 0x0004 #define MADERA_AIF3RX3_ENA_MASK 0x0004 #define MADERA_AIF3RX3_ENA_SHIFT 2 -#define MADERA_AIF3RX3_ENA_WIDTH 1 #define MADERA_AIF3RX2_ENA 0x0002 #define MADERA_AIF3RX2_ENA_MASK 0x0002 #define MADERA_AIF3RX2_ENA_SHIFT 1 -#define MADERA_AIF3RX2_ENA_WIDTH 1 #define MADERA_AIF3RX1_ENA 0x0001 #define MADERA_AIF3RX1_ENA_MASK 0x0001 #define MADERA_AIF3RX1_ENA_SHIFT 0 -#define MADERA_AIF3RX1_ENA_WIDTH 1 /* (0x05B9) AIF4_Tx_Enables */ #define MADERA_AIF4TX2_ENA 0x0002 #define MADERA_AIF4TX2_ENA_MASK 0x0002 #define MADERA_AIF4TX2_ENA_SHIFT 1 -#define MADERA_AIF4TX2_ENA_WIDTH 1 #define MADERA_AIF4TX1_ENA 0x0001 #define MADERA_AIF4TX1_ENA_MASK 0x0001 #define MADERA_AIF4TX1_ENA_SHIFT 0 -#define MADERA_AIF4TX1_ENA_WIDTH 1 /* (0x05BA) AIF4_Rx_Enables */ #define MADERA_AIF4RX2_ENA 0x0002 #define MADERA_AIF4RX2_ENA_MASK 0x0002 #define MADERA_AIF4RX2_ENA_SHIFT 1 -#define MADERA_AIF4RX2_ENA_WIDTH 1 #define MADERA_AIF4RX1_ENA 0x0001 #define MADERA_AIF4RX1_ENA_MASK 0x0001 #define MADERA_AIF4RX1_ENA_SHIFT 0 -#define MADERA_AIF4RX1_ENA_WIDTH 1 /* (0x05C2) SPD1_TX_Control */ #define MADERA_SPD1_VAL2 0x2000 #define MADERA_SPD1_VAL2_MASK 0x2000 #define MADERA_SPD1_VAL2_SHIFT 13 -#define MADERA_SPD1_VAL2_WIDTH 1 #define MADERA_SPD1_VAL1 0x1000 #define MADERA_SPD1_VAL1_MASK 0x1000 #define MADERA_SPD1_VAL1_SHIFT 12 
-#define MADERA_SPD1_VAL1_WIDTH 1 #define MADERA_SPD1_RATE_MASK 0x00F0 #define MADERA_SPD1_RATE_SHIFT 4 -#define MADERA_SPD1_RATE_WIDTH 4 #define MADERA_SPD1_ENA 0x0001 #define MADERA_SPD1_ENA_MASK 0x0001 #define MADERA_SPD1_ENA_SHIFT 0 -#define MADERA_SPD1_ENA_WIDTH 1 /* (0x05F5) SLIMbus_RX_Channel_Enable */ #define MADERA_SLIMRX8_ENA 0x0080 #define MADERA_SLIMRX8_ENA_MASK 0x0080 #define MADERA_SLIMRX8_ENA_SHIFT 7 -#define MADERA_SLIMRX8_ENA_WIDTH 1 #define MADERA_SLIMRX7_ENA 0x0040 #define MADERA_SLIMRX7_ENA_MASK 0x0040 #define MADERA_SLIMRX7_ENA_SHIFT 6 -#define MADERA_SLIMRX7_ENA_WIDTH 1 #define MADERA_SLIMRX6_ENA 0x0020 #define MADERA_SLIMRX6_ENA_MASK 0x0020 #define MADERA_SLIMRX6_ENA_SHIFT 5 -#define MADERA_SLIMRX6_ENA_WIDTH 1 #define MADERA_SLIMRX5_ENA 0x0010 #define MADERA_SLIMRX5_ENA_MASK 0x0010 #define MADERA_SLIMRX5_ENA_SHIFT 4 -#define MADERA_SLIMRX5_ENA_WIDTH 1 #define MADERA_SLIMRX4_ENA 0x0008 #define MADERA_SLIMRX4_ENA_MASK 0x0008 #define MADERA_SLIMRX4_ENA_SHIFT 3 -#define MADERA_SLIMRX4_ENA_WIDTH 1 #define MADERA_SLIMRX3_ENA 0x0004 #define MADERA_SLIMRX3_ENA_MASK 0x0004 #define MADERA_SLIMRX3_ENA_SHIFT 2 -#define MADERA_SLIMRX3_ENA_WIDTH 1 #define MADERA_SLIMRX2_ENA 0x0002 #define MADERA_SLIMRX2_ENA_MASK 0x0002 #define MADERA_SLIMRX2_ENA_SHIFT 1 -#define MADERA_SLIMRX2_ENA_WIDTH 1 #define MADERA_SLIMRX1_ENA 0x0001 #define MADERA_SLIMRX1_ENA_MASK 0x0001 #define MADERA_SLIMRX1_ENA_SHIFT 0 -#define MADERA_SLIMRX1_ENA_WIDTH 1 /* (0x05F6) SLIMbus_TX_Channel_Enable */ #define MADERA_SLIMTX8_ENA 0x0080 #define MADERA_SLIMTX8_ENA_MASK 0x0080 #define MADERA_SLIMTX8_ENA_SHIFT 7 -#define MADERA_SLIMTX8_ENA_WIDTH 1 #define MADERA_SLIMTX7_ENA 0x0040 #define MADERA_SLIMTX7_ENA_MASK 0x0040 #define MADERA_SLIMTX7_ENA_SHIFT 6 -#define MADERA_SLIMTX7_ENA_WIDTH 1 #define MADERA_SLIMTX6_ENA 0x0020 #define MADERA_SLIMTX6_ENA_MASK 0x0020 #define MADERA_SLIMTX6_ENA_SHIFT 5 -#define MADERA_SLIMTX6_ENA_WIDTH 1 #define MADERA_SLIMTX5_ENA 0x0010 #define MADERA_SLIMTX5_ENA_MASK 0x0010 #define MADERA_SLIMTX5_ENA_SHIFT 4 -#define MADERA_SLIMTX5_ENA_WIDTH 1 #define MADERA_SLIMTX4_ENA 0x0008 #define MADERA_SLIMTX4_ENA_MASK 0x0008 #define MADERA_SLIMTX4_ENA_SHIFT 3 -#define MADERA_SLIMTX4_ENA_WIDTH 1 #define MADERA_SLIMTX3_ENA 0x0004 #define MADERA_SLIMTX3_ENA_MASK 0x0004 #define MADERA_SLIMTX3_ENA_SHIFT 2 -#define MADERA_SLIMTX3_ENA_WIDTH 1 #define MADERA_SLIMTX2_ENA 0x0002 #define MADERA_SLIMTX2_ENA_MASK 0x0002 #define MADERA_SLIMTX2_ENA_SHIFT 1 -#define MADERA_SLIMTX2_ENA_WIDTH 1 #define MADERA_SLIMTX1_ENA 0x0001 #define MADERA_SLIMTX1_ENA_MASK 0x0001 #define MADERA_SLIMTX1_ENA_SHIFT 0 -#define MADERA_SLIMTX1_ENA_WIDTH 1 /* (0x0E10) EQ1_1 */ #define MADERA_EQ1_B1_GAIN_MASK 0xF800 #define MADERA_EQ1_B1_GAIN_SHIFT 11 -#define MADERA_EQ1_B1_GAIN_WIDTH 5 #define MADERA_EQ1_B2_GAIN_MASK 0x07C0 #define MADERA_EQ1_B2_GAIN_SHIFT 6 -#define MADERA_EQ1_B2_GAIN_WIDTH 5 #define MADERA_EQ1_B3_GAIN_MASK 0x003E #define MADERA_EQ1_B3_GAIN_SHIFT 1 -#define MADERA_EQ1_B3_GAIN_WIDTH 5 #define MADERA_EQ1_ENA 0x0001 #define MADERA_EQ1_ENA_MASK 0x0001 #define MADERA_EQ1_ENA_SHIFT 0 -#define MADERA_EQ1_ENA_WIDTH 1 /* (0x0E11) EQ1_2 */ #define MADERA_EQ1_B4_GAIN_MASK 0xF800 #define MADERA_EQ1_B4_GAIN_SHIFT 11 -#define MADERA_EQ1_B4_GAIN_WIDTH 5 #define MADERA_EQ1_B5_GAIN_MASK 0x07C0 #define MADERA_EQ1_B5_GAIN_SHIFT 6 -#define MADERA_EQ1_B5_GAIN_WIDTH 5 #define MADERA_EQ1_B1_MODE 0x0001 #define MADERA_EQ1_B1_MODE_MASK 0x0001 #define MADERA_EQ1_B1_MODE_SHIFT 0 -#define MADERA_EQ1_B1_MODE_WIDTH 1 /* (0x0E26) EQ2_1 */ #define 
MADERA_EQ2_B1_GAIN_MASK 0xF800 #define MADERA_EQ2_B1_GAIN_SHIFT 11 -#define MADERA_EQ2_B1_GAIN_WIDTH 5 #define MADERA_EQ2_B2_GAIN_MASK 0x07C0 #define MADERA_EQ2_B2_GAIN_SHIFT 6 -#define MADERA_EQ2_B2_GAIN_WIDTH 5 #define MADERA_EQ2_B3_GAIN_MASK 0x003E #define MADERA_EQ2_B3_GAIN_SHIFT 1 -#define MADERA_EQ2_B3_GAIN_WIDTH 5 #define MADERA_EQ2_ENA 0x0001 #define MADERA_EQ2_ENA_MASK 0x0001 #define MADERA_EQ2_ENA_SHIFT 0 -#define MADERA_EQ2_ENA_WIDTH 1 /* (0x0E27) EQ2_2 */ #define MADERA_EQ2_B4_GAIN_MASK 0xF800 #define MADERA_EQ2_B4_GAIN_SHIFT 11 -#define MADERA_EQ2_B4_GAIN_WIDTH 5 #define MADERA_EQ2_B5_GAIN_MASK 0x07C0 #define MADERA_EQ2_B5_GAIN_SHIFT 6 -#define MADERA_EQ2_B5_GAIN_WIDTH 5 #define MADERA_EQ2_B1_MODE 0x0001 #define MADERA_EQ2_B1_MODE_MASK 0x0001 #define MADERA_EQ2_B1_MODE_SHIFT 0 -#define MADERA_EQ2_B1_MODE_WIDTH 1 /* (0x0E3C) EQ3_1 */ #define MADERA_EQ3_B1_GAIN_MASK 0xF800 #define MADERA_EQ3_B1_GAIN_SHIFT 11 -#define MADERA_EQ3_B1_GAIN_WIDTH 5 #define MADERA_EQ3_B2_GAIN_MASK 0x07C0 #define MADERA_EQ3_B2_GAIN_SHIFT 6 -#define MADERA_EQ3_B2_GAIN_WIDTH 5 #define MADERA_EQ3_B3_GAIN_MASK 0x003E #define MADERA_EQ3_B3_GAIN_SHIFT 1 -#define MADERA_EQ3_B3_GAIN_WIDTH 5 #define MADERA_EQ3_ENA 0x0001 #define MADERA_EQ3_ENA_MASK 0x0001 #define MADERA_EQ3_ENA_SHIFT 0 -#define MADERA_EQ3_ENA_WIDTH 1 /* (0x0E3D) EQ3_2 */ #define MADERA_EQ3_B4_GAIN_MASK 0xF800 #define MADERA_EQ3_B4_GAIN_SHIFT 11 -#define MADERA_EQ3_B4_GAIN_WIDTH 5 #define MADERA_EQ3_B5_GAIN_MASK 0x07C0 #define MADERA_EQ3_B5_GAIN_SHIFT 6 -#define MADERA_EQ3_B5_GAIN_WIDTH 5 #define MADERA_EQ3_B1_MODE 0x0001 #define MADERA_EQ3_B1_MODE_MASK 0x0001 #define MADERA_EQ3_B1_MODE_SHIFT 0 -#define MADERA_EQ3_B1_MODE_WIDTH 1 /* (0x0E52) EQ4_1 */ #define MADERA_EQ4_B1_GAIN_MASK 0xF800 #define MADERA_EQ4_B1_GAIN_SHIFT 11 -#define MADERA_EQ4_B1_GAIN_WIDTH 5 #define MADERA_EQ4_B2_GAIN_MASK 0x07C0 #define MADERA_EQ4_B2_GAIN_SHIFT 6 -#define MADERA_EQ4_B2_GAIN_WIDTH 5 #define MADERA_EQ4_B3_GAIN_MASK 0x003E #define MADERA_EQ4_B3_GAIN_SHIFT 1 -#define MADERA_EQ4_B3_GAIN_WIDTH 5 #define MADERA_EQ4_ENA 0x0001 #define MADERA_EQ4_ENA_MASK 0x0001 #define MADERA_EQ4_ENA_SHIFT 0 -#define MADERA_EQ4_ENA_WIDTH 1 /* (0x0E53) EQ4_2 */ #define MADERA_EQ4_B4_GAIN_MASK 0xF800 #define MADERA_EQ4_B4_GAIN_SHIFT 11 -#define MADERA_EQ4_B4_GAIN_WIDTH 5 #define MADERA_EQ4_B5_GAIN_MASK 0x07C0 #define MADERA_EQ4_B5_GAIN_SHIFT 6 -#define MADERA_EQ4_B5_GAIN_WIDTH 5 #define MADERA_EQ4_B1_MODE 0x0001 #define MADERA_EQ4_B1_MODE_MASK 0x0001 #define MADERA_EQ4_B1_MODE_SHIFT 0 -#define MADERA_EQ4_B1_MODE_WIDTH 1 /* (0x0E80) DRC1_ctrl1 */ #define MADERA_DRC1L_ENA 0x0002 #define MADERA_DRC1L_ENA_MASK 0x0002 #define MADERA_DRC1L_ENA_SHIFT 1 -#define MADERA_DRC1L_ENA_WIDTH 1 #define MADERA_DRC1R_ENA 0x0001 #define MADERA_DRC1R_ENA_MASK 0x0001 #define MADERA_DRC1R_ENA_SHIFT 0 -#define MADERA_DRC1R_ENA_WIDTH 1 /* (0x0E88) DRC2_ctrl1 */ #define MADERA_DRC2L_ENA 0x0002 #define MADERA_DRC2L_ENA_MASK 0x0002 #define MADERA_DRC2L_ENA_SHIFT 1 -#define MADERA_DRC2L_ENA_WIDTH 1 #define MADERA_DRC2R_ENA 0x0001 #define MADERA_DRC2R_ENA_MASK 0x0001 #define MADERA_DRC2R_ENA_SHIFT 0 -#define MADERA_DRC2R_ENA_WIDTH 1 /* (0x0EC0) HPLPF1_1 */ #define MADERA_LHPF1_MODE 0x0002 #define MADERA_LHPF1_MODE_MASK 0x0002 #define MADERA_LHPF1_MODE_SHIFT 1 -#define MADERA_LHPF1_MODE_WIDTH 1 #define MADERA_LHPF1_ENA 0x0001 #define MADERA_LHPF1_ENA_MASK 0x0001 #define MADERA_LHPF1_ENA_SHIFT 0 -#define MADERA_LHPF1_ENA_WIDTH 1 /* (0x0EC1) HPLPF1_2 */ #define MADERA_LHPF1_COEFF_MASK 0xFFFF #define 
MADERA_LHPF1_COEFF_SHIFT 0 -#define MADERA_LHPF1_COEFF_WIDTH 16 /* (0x0EC4) HPLPF2_1 */ #define MADERA_LHPF2_MODE 0x0002 #define MADERA_LHPF2_MODE_MASK 0x0002 #define MADERA_LHPF2_MODE_SHIFT 1 -#define MADERA_LHPF2_MODE_WIDTH 1 #define MADERA_LHPF2_ENA 0x0001 #define MADERA_LHPF2_ENA_MASK 0x0001 #define MADERA_LHPF2_ENA_SHIFT 0 -#define MADERA_LHPF2_ENA_WIDTH 1 /* (0x0EC5) HPLPF2_2 */ #define MADERA_LHPF2_COEFF_MASK 0xFFFF #define MADERA_LHPF2_COEFF_SHIFT 0 -#define MADERA_LHPF2_COEFF_WIDTH 16 /* (0x0EC8) HPLPF3_1 */ #define MADERA_LHPF3_MODE 0x0002 #define MADERA_LHPF3_MODE_MASK 0x0002 #define MADERA_LHPF3_MODE_SHIFT 1 -#define MADERA_LHPF3_MODE_WIDTH 1 #define MADERA_LHPF3_ENA 0x0001 #define MADERA_LHPF3_ENA_MASK 0x0001 #define MADERA_LHPF3_ENA_SHIFT 0 -#define MADERA_LHPF3_ENA_WIDTH 1 /* (0x0EC9) HPLPF3_2 */ #define MADERA_LHPF3_COEFF_MASK 0xFFFF #define MADERA_LHPF3_COEFF_SHIFT 0 -#define MADERA_LHPF3_COEFF_WIDTH 16 /* (0x0ECC) HPLPF4_1 */ #define MADERA_LHPF4_MODE 0x0002 #define MADERA_LHPF4_MODE_MASK 0x0002 #define MADERA_LHPF4_MODE_SHIFT 1 -#define MADERA_LHPF4_MODE_WIDTH 1 #define MADERA_LHPF4_ENA 0x0001 #define MADERA_LHPF4_ENA_MASK 0x0001 #define MADERA_LHPF4_ENA_SHIFT 0 -#define MADERA_LHPF4_ENA_WIDTH 1 /* (0x0ECD) HPLPF4_2 */ #define MADERA_LHPF4_COEFF_MASK 0xFFFF #define MADERA_LHPF4_COEFF_SHIFT 0 -#define MADERA_LHPF4_COEFF_WIDTH 16 /* (0x0ED0) ASRC2_ENABLE */ #define MADERA_ASRC2_IN2L_ENA 0x0008 #define MADERA_ASRC2_IN2L_ENA_MASK 0x0008 #define MADERA_ASRC2_IN2L_ENA_SHIFT 3 -#define MADERA_ASRC2_IN2L_ENA_WIDTH 1 #define MADERA_ASRC2_IN2R_ENA 0x0004 #define MADERA_ASRC2_IN2R_ENA_MASK 0x0004 #define MADERA_ASRC2_IN2R_ENA_SHIFT 2 -#define MADERA_ASRC2_IN2R_ENA_WIDTH 1 #define MADERA_ASRC2_IN1L_ENA 0x0002 #define MADERA_ASRC2_IN1L_ENA_MASK 0x0002 #define MADERA_ASRC2_IN1L_ENA_SHIFT 1 -#define MADERA_ASRC2_IN1L_ENA_WIDTH 1 #define MADERA_ASRC2_IN1R_ENA 0x0001 #define MADERA_ASRC2_IN1R_ENA_MASK 0x0001 #define MADERA_ASRC2_IN1R_ENA_SHIFT 0 -#define MADERA_ASRC2_IN1R_ENA_WIDTH 1 /* (0x0ED2) ASRC2_RATE1 */ #define MADERA_ASRC2_RATE1_MASK 0xF800 #define MADERA_ASRC2_RATE1_SHIFT 11 -#define MADERA_ASRC2_RATE1_WIDTH 5 /* (0x0ED3) ASRC2_RATE2 */ #define MADERA_ASRC2_RATE2_MASK 0xF800 #define MADERA_ASRC2_RATE2_SHIFT 11 -#define MADERA_ASRC2_RATE2_WIDTH 5 /* (0x0EE0) ASRC1_ENABLE */ #define MADERA_ASRC1_IN2L_ENA 0x0008 #define MADERA_ASRC1_IN2L_ENA_MASK 0x0008 #define MADERA_ASRC1_IN2L_ENA_SHIFT 3 -#define MADERA_ASRC1_IN2L_ENA_WIDTH 1 #define MADERA_ASRC1_IN2R_ENA 0x0004 #define MADERA_ASRC1_IN2R_ENA_MASK 0x0004 #define MADERA_ASRC1_IN2R_ENA_SHIFT 2 -#define MADERA_ASRC1_IN2R_ENA_WIDTH 1 #define MADERA_ASRC1_IN1L_ENA 0x0002 #define MADERA_ASRC1_IN1L_ENA_MASK 0x0002 #define MADERA_ASRC1_IN1L_ENA_SHIFT 1 -#define MADERA_ASRC1_IN1L_ENA_WIDTH 1 #define MADERA_ASRC1_IN1R_ENA 0x0001 #define MADERA_ASRC1_IN1R_ENA_MASK 0x0001 #define MADERA_ASRC1_IN1R_ENA_SHIFT 0 -#define MADERA_ASRC1_IN1R_ENA_WIDTH 1 /* (0x0EE2) ASRC1_RATE1 */ #define MADERA_ASRC1_RATE1_MASK 0xF800 #define MADERA_ASRC1_RATE1_SHIFT 11 -#define MADERA_ASRC1_RATE1_WIDTH 5 /* (0x0EE3) ASRC1_RATE2 */ #define MADERA_ASRC1_RATE2_MASK 0xF800 #define MADERA_ASRC1_RATE2_SHIFT 11 -#define MADERA_ASRC1_RATE2_WIDTH 5 /* (0x0EF0) - ISRC1 CTRL 1 */ #define MADERA_ISRC1_FSH_MASK 0xF800 #define MADERA_ISRC1_FSH_SHIFT 11 -#define MADERA_ISRC1_FSH_WIDTH 5 #define MADERA_ISRC1_CLK_SEL_MASK 0x0700 #define MADERA_ISRC1_CLK_SEL_SHIFT 8 -#define MADERA_ISRC1_CLK_SEL_WIDTH 3 /* (0x0EF1) ISRC1_CTRL_2 */ #define MADERA_ISRC1_FSL_MASK 0xF800 #define 
MADERA_ISRC1_FSL_SHIFT 11 -#define MADERA_ISRC1_FSL_WIDTH 5 /* (0x0EF2) ISRC1_CTRL_3 */ #define MADERA_ISRC1_INT1_ENA 0x8000 #define MADERA_ISRC1_INT1_ENA_MASK 0x8000 #define MADERA_ISRC1_INT1_ENA_SHIFT 15 -#define MADERA_ISRC1_INT1_ENA_WIDTH 1 #define MADERA_ISRC1_INT2_ENA 0x4000 #define MADERA_ISRC1_INT2_ENA_MASK 0x4000 #define MADERA_ISRC1_INT2_ENA_SHIFT 14 -#define MADERA_ISRC1_INT2_ENA_WIDTH 1 #define MADERA_ISRC1_INT3_ENA 0x2000 #define MADERA_ISRC1_INT3_ENA_MASK 0x2000 #define MADERA_ISRC1_INT3_ENA_SHIFT 13 -#define MADERA_ISRC1_INT3_ENA_WIDTH 1 #define MADERA_ISRC1_INT4_ENA 0x1000 #define MADERA_ISRC1_INT4_ENA_MASK 0x1000 #define MADERA_ISRC1_INT4_ENA_SHIFT 12 -#define MADERA_ISRC1_INT4_ENA_WIDTH 1 #define MADERA_ISRC1_DEC1_ENA 0x0200 #define MADERA_ISRC1_DEC1_ENA_MASK 0x0200 #define MADERA_ISRC1_DEC1_ENA_SHIFT 9 -#define MADERA_ISRC1_DEC1_ENA_WIDTH 1 #define MADERA_ISRC1_DEC2_ENA 0x0100 #define MADERA_ISRC1_DEC2_ENA_MASK 0x0100 #define MADERA_ISRC1_DEC2_ENA_SHIFT 8 -#define MADERA_ISRC1_DEC2_ENA_WIDTH 1 #define MADERA_ISRC1_DEC3_ENA 0x0080 #define MADERA_ISRC1_DEC3_ENA_MASK 0x0080 #define MADERA_ISRC1_DEC3_ENA_SHIFT 7 -#define MADERA_ISRC1_DEC3_ENA_WIDTH 1 #define MADERA_ISRC1_DEC4_ENA 0x0040 #define MADERA_ISRC1_DEC4_ENA_MASK 0x0040 #define MADERA_ISRC1_DEC4_ENA_SHIFT 6 -#define MADERA_ISRC1_DEC4_ENA_WIDTH 1 #define MADERA_ISRC1_NOTCH_ENA 0x0001 #define MADERA_ISRC1_NOTCH_ENA_MASK 0x0001 #define MADERA_ISRC1_NOTCH_ENA_SHIFT 0 -#define MADERA_ISRC1_NOTCH_ENA_WIDTH 1 /* (0x0EF3) ISRC2_CTRL_1 */ #define MADERA_ISRC2_FSH_MASK 0xF800 #define MADERA_ISRC2_FSH_SHIFT 11 -#define MADERA_ISRC2_FSH_WIDTH 5 #define MADERA_ISRC2_CLK_SEL_MASK 0x0700 #define MADERA_ISRC2_CLK_SEL_SHIFT 8 -#define MADERA_ISRC2_CLK_SEL_WIDTH 3 /* (0x0EF4) ISRC2_CTRL_2 */ #define MADERA_ISRC2_FSL_MASK 0xF800 #define MADERA_ISRC2_FSL_SHIFT 11 -#define MADERA_ISRC2_FSL_WIDTH 5 /* (0x0EF5) ISRC2_CTRL_3 */ #define MADERA_ISRC2_INT1_ENA 0x8000 #define MADERA_ISRC2_INT1_ENA_MASK 0x8000 #define MADERA_ISRC2_INT1_ENA_SHIFT 15 -#define MADERA_ISRC2_INT1_ENA_WIDTH 1 #define MADERA_ISRC2_INT2_ENA 0x4000 #define MADERA_ISRC2_INT2_ENA_MASK 0x4000 #define MADERA_ISRC2_INT2_ENA_SHIFT 14 -#define MADERA_ISRC2_INT2_ENA_WIDTH 1 #define MADERA_ISRC2_INT3_ENA 0x2000 #define MADERA_ISRC2_INT3_ENA_MASK 0x2000 #define MADERA_ISRC2_INT3_ENA_SHIFT 13 -#define MADERA_ISRC2_INT3_ENA_WIDTH 1 #define MADERA_ISRC2_INT4_ENA 0x1000 #define MADERA_ISRC2_INT4_ENA_MASK 0x1000 #define MADERA_ISRC2_INT4_ENA_SHIFT 12 -#define MADERA_ISRC2_INT4_ENA_WIDTH 1 #define MADERA_ISRC2_DEC1_ENA 0x0200 #define MADERA_ISRC2_DEC1_ENA_MASK 0x0200 #define MADERA_ISRC2_DEC1_ENA_SHIFT 9 -#define MADERA_ISRC2_DEC1_ENA_WIDTH 1 #define MADERA_ISRC2_DEC2_ENA 0x0100 #define MADERA_ISRC2_DEC2_ENA_MASK 0x0100 #define MADERA_ISRC2_DEC2_ENA_SHIFT 8 -#define MADERA_ISRC2_DEC2_ENA_WIDTH 1 #define MADERA_ISRC2_DEC3_ENA 0x0080 #define MADERA_ISRC2_DEC3_ENA_MASK 0x0080 #define MADERA_ISRC2_DEC3_ENA_SHIFT 7 -#define MADERA_ISRC2_DEC3_ENA_WIDTH 1 #define MADERA_ISRC2_DEC4_ENA 0x0040 #define MADERA_ISRC2_DEC4_ENA_MASK 0x0040 #define MADERA_ISRC2_DEC4_ENA_SHIFT 6 -#define MADERA_ISRC2_DEC4_ENA_WIDTH 1 #define MADERA_ISRC2_NOTCH_ENA 0x0001 #define MADERA_ISRC2_NOTCH_ENA_MASK 0x0001 #define MADERA_ISRC2_NOTCH_ENA_SHIFT 0 -#define MADERA_ISRC2_NOTCH_ENA_WIDTH 1 /* (0x0EF6) ISRC3_CTRL_1 */ #define MADERA_ISRC3_FSH_MASK 0xF800 #define MADERA_ISRC3_FSH_SHIFT 11 -#define MADERA_ISRC3_FSH_WIDTH 5 #define MADERA_ISRC3_CLK_SEL_MASK 0x0700 #define MADERA_ISRC3_CLK_SEL_SHIFT 8 -#define 
MADERA_ISRC3_CLK_SEL_WIDTH 3 /* (0x0EF7) ISRC3_CTRL_2 */ #define MADERA_ISRC3_FSL_MASK 0xF800 #define MADERA_ISRC3_FSL_SHIFT 11 -#define MADERA_ISRC3_FSL_WIDTH 5 /* (0x0EF8) ISRC3_CTRL_3 */ #define MADERA_ISRC3_INT1_ENA 0x8000 #define MADERA_ISRC3_INT1_ENA_MASK 0x8000 #define MADERA_ISRC3_INT1_ENA_SHIFT 15 -#define MADERA_ISRC3_INT1_ENA_WIDTH 1 #define MADERA_ISRC3_INT2_ENA 0x4000 #define MADERA_ISRC3_INT2_ENA_MASK 0x4000 #define MADERA_ISRC3_INT2_ENA_SHIFT 14 -#define MADERA_ISRC3_INT2_ENA_WIDTH 1 #define MADERA_ISRC3_INT3_ENA 0x2000 #define MADERA_ISRC3_INT3_ENA_MASK 0x2000 #define MADERA_ISRC3_INT3_ENA_SHIFT 13 -#define MADERA_ISRC3_INT3_ENA_WIDTH 1 #define MADERA_ISRC3_INT4_ENA 0x1000 #define MADERA_ISRC3_INT4_ENA_MASK 0x1000 #define MADERA_ISRC3_INT4_ENA_SHIFT 12 -#define MADERA_ISRC3_INT4_ENA_WIDTH 1 #define MADERA_ISRC3_DEC1_ENA 0x0200 #define MADERA_ISRC3_DEC1_ENA_MASK 0x0200 #define MADERA_ISRC3_DEC1_ENA_SHIFT 9 -#define MADERA_ISRC3_DEC1_ENA_WIDTH 1 #define MADERA_ISRC3_DEC2_ENA 0x0100 #define MADERA_ISRC3_DEC2_ENA_MASK 0x0100 #define MADERA_ISRC3_DEC2_ENA_SHIFT 8 -#define MADERA_ISRC3_DEC2_ENA_WIDTH 1 #define MADERA_ISRC3_DEC3_ENA 0x0080 #define MADERA_ISRC3_DEC3_ENA_MASK 0x0080 #define MADERA_ISRC3_DEC3_ENA_SHIFT 7 -#define MADERA_ISRC3_DEC3_ENA_WIDTH 1 #define MADERA_ISRC3_DEC4_ENA 0x0040 #define MADERA_ISRC3_DEC4_ENA_MASK 0x0040 #define MADERA_ISRC3_DEC4_ENA_SHIFT 6 -#define MADERA_ISRC3_DEC4_ENA_WIDTH 1 #define MADERA_ISRC3_NOTCH_ENA 0x0001 #define MADERA_ISRC3_NOTCH_ENA_MASK 0x0001 #define MADERA_ISRC3_NOTCH_ENA_SHIFT 0 -#define MADERA_ISRC3_NOTCH_ENA_WIDTH 1 /* (0x0EF9) ISRC4_CTRL_1 */ #define MADERA_ISRC4_FSH_MASK 0xF800 #define MADERA_ISRC4_FSH_SHIFT 11 -#define MADERA_ISRC4_FSH_WIDTH 5 #define MADERA_ISRC4_CLK_SEL_MASK 0x0700 #define MADERA_ISRC4_CLK_SEL_SHIFT 8 -#define MADERA_ISRC4_CLK_SEL_WIDTH 3 /* (0x0EFA) ISRC4_CTRL_2 */ #define MADERA_ISRC4_FSL_MASK 0xF800 #define MADERA_ISRC4_FSL_SHIFT 11 -#define MADERA_ISRC4_FSL_WIDTH 5 /* (0x0EFB) ISRC4_CTRL_3 */ #define MADERA_ISRC4_INT1_ENA 0x8000 #define MADERA_ISRC4_INT1_ENA_MASK 0x8000 #define MADERA_ISRC4_INT1_ENA_SHIFT 15 -#define MADERA_ISRC4_INT1_ENA_WIDTH 1 #define MADERA_ISRC4_INT2_ENA 0x4000 #define MADERA_ISRC4_INT2_ENA_MASK 0x4000 #define MADERA_ISRC4_INT2_ENA_SHIFT 14 -#define MADERA_ISRC4_INT2_ENA_WIDTH 1 #define MADERA_ISRC4_INT3_ENA 0x2000 #define MADERA_ISRC4_INT3_ENA_MASK 0x2000 #define MADERA_ISRC4_INT3_ENA_SHIFT 13 -#define MADERA_ISRC4_INT3_ENA_WIDTH 1 #define MADERA_ISRC4_INT4_ENA 0x1000 #define MADERA_ISRC4_INT4_ENA_MASK 0x1000 #define MADERA_ISRC4_INT4_ENA_SHIFT 12 -#define MADERA_ISRC4_INT4_ENA_WIDTH 1 #define MADERA_ISRC4_DEC1_ENA 0x0200 #define MADERA_ISRC4_DEC1_ENA_MASK 0x0200 #define MADERA_ISRC4_DEC1_ENA_SHIFT 9 -#define MADERA_ISRC4_DEC1_ENA_WIDTH 1 #define MADERA_ISRC4_DEC2_ENA 0x0100 #define MADERA_ISRC4_DEC2_ENA_MASK 0x0100 #define MADERA_ISRC4_DEC2_ENA_SHIFT 8 -#define MADERA_ISRC4_DEC2_ENA_WIDTH 1 #define MADERA_ISRC4_DEC3_ENA 0x0080 #define MADERA_ISRC4_DEC3_ENA_MASK 0x0080 #define MADERA_ISRC4_DEC3_ENA_SHIFT 7 -#define MADERA_ISRC4_DEC3_ENA_WIDTH 1 #define MADERA_ISRC4_DEC4_ENA 0x0040 #define MADERA_ISRC4_DEC4_ENA_MASK 0x0040 #define MADERA_ISRC4_DEC4_ENA_SHIFT 6 -#define MADERA_ISRC4_DEC4_ENA_WIDTH 1 #define MADERA_ISRC4_NOTCH_ENA 0x0001 #define MADERA_ISRC4_NOTCH_ENA_MASK 0x0001 #define MADERA_ISRC4_NOTCH_ENA_SHIFT 0 -#define MADERA_ISRC4_NOTCH_ENA_WIDTH 1 /* (0x0F00) Clock_Control */ #define MADERA_EXT_NG_SEL_CLR 0x0080 #define MADERA_EXT_NG_SEL_CLR_MASK 0x0080 #define 
MADERA_EXT_NG_SEL_CLR_SHIFT 7 -#define MADERA_EXT_NG_SEL_CLR_WIDTH 1 #define MADERA_EXT_NG_SEL_SET 0x0040 #define MADERA_EXT_NG_SEL_SET_MASK 0x0040 #define MADERA_EXT_NG_SEL_SET_SHIFT 6 -#define MADERA_EXT_NG_SEL_SET_WIDTH 1 #define MADERA_CLK_R_ENA_CLR 0x0020 #define MADERA_CLK_R_ENA_CLR_MASK 0x0020 #define MADERA_CLK_R_ENA_CLR_SHIFT 5 -#define MADERA_CLK_R_ENA_CLR_WIDTH 1 #define MADERA_CLK_R_ENA_SET 0x0010 #define MADERA_CLK_R_ENA_SET_MASK 0x0010 #define MADERA_CLK_R_ENA_SET_SHIFT 4 -#define MADERA_CLK_R_ENA_SET_WIDTH 1 #define MADERA_CLK_NG_ENA_CLR 0x0008 #define MADERA_CLK_NG_ENA_CLR_MASK 0x0008 #define MADERA_CLK_NG_ENA_CLR_SHIFT 3 -#define MADERA_CLK_NG_ENA_CLR_WIDTH 1 #define MADERA_CLK_NG_ENA_SET 0x0004 #define MADERA_CLK_NG_ENA_SET_MASK 0x0004 #define MADERA_CLK_NG_ENA_SET_SHIFT 2 -#define MADERA_CLK_NG_ENA_SET_WIDTH 1 #define MADERA_CLK_L_ENA_CLR 0x0002 #define MADERA_CLK_L_ENA_CLR_MASK 0x0002 #define MADERA_CLK_L_ENA_CLR_SHIFT 1 -#define MADERA_CLK_L_ENA_CLR_WIDTH 1 #define MADERA_CLK_L_ENA_SET 0x0001 #define MADERA_CLK_L_ENA_SET_MASK 0x0001 #define MADERA_CLK_L_ENA_SET_SHIFT 0 -#define MADERA_CLK_L_ENA_SET_WIDTH 1 /* (0x0F01) ANC_SRC */ #define MADERA_IN_RXANCR_SEL_MASK 0x0070 #define MADERA_IN_RXANCR_SEL_SHIFT 4 -#define MADERA_IN_RXANCR_SEL_WIDTH 3 #define MADERA_IN_RXANCL_SEL_MASK 0x0007 #define MADERA_IN_RXANCL_SEL_SHIFT 0 -#define MADERA_IN_RXANCL_SEL_WIDTH 3 /* (0x0F17) FCL_ADC_reformatter_control */ #define MADERA_FCL_MIC_MODE_SEL 0x000C #define MADERA_FCL_MIC_MODE_SEL_SHIFT 2 -#define MADERA_FCL_MIC_MODE_SEL_WIDTH 2 /* (0x0F73) FCR_ADC_reformatter_control */ #define MADERA_FCR_MIC_MODE_SEL 0x000C #define MADERA_FCR_MIC_MODE_SEL_SHIFT 2 -#define MADERA_FCR_MIC_MODE_SEL_WIDTH 2 /* (0x10C0) AUXPDM1_CTRL_0 */ #define MADERA_AUXPDM1_SRC_MASK 0x0F00 #define MADERA_AUXPDM1_SRC_SHIFT 8 -#define MADERA_AUXPDM1_SRC_WIDTH 4 #define MADERA_AUXPDM1_TXEDGE_MASK 0x0010 #define MADERA_AUXPDM1_TXEDGE_SHIFT 4 -#define MADERA_AUXPDM1_TXEDGE_WIDTH 1 #define MADERA_AUXPDM1_MSTR_MASK 0x0008 #define MADERA_AUXPDM1_MSTR_SHIFT 3 -#define MADERA_AUXPDM1_MSTR_WIDTH 1 #define MADERA_AUXPDM1_ENABLE_MASK 0x0001 #define MADERA_AUXPDM1_ENABLE_SHIFT 0 -#define MADERA_AUXPDM1_ENABLE_WIDTH 1 /* (0x10C1) AUXPDM1_CTRL_1 */ #define MADERA_AUXPDM1_CLK_FREQ_MASK 0xC000 #define MADERA_AUXPDM1_CLK_FREQ_SHIFT 14 -#define MADERA_AUXPDM1_CLK_FREQ_WIDTH 2 /* (0x1480) DFC1_CTRL_W0 */ #define MADERA_DFC1_RATE_MASK 0x007C #define MADERA_DFC1_RATE_SHIFT 2 -#define MADERA_DFC1_RATE_WIDTH 5 #define MADERA_DFC1_DITH_ENA 0x0002 #define MADERA_DFC1_DITH_ENA_MASK 0x0002 #define MADERA_DFC1_DITH_ENA_SHIFT 1 -#define MADERA_DFC1_DITH_ENA_WIDTH 1 #define MADERA_DFC1_ENA 0x0001 #define MADERA_DFC1_ENA_MASK 0x0001 #define MADERA_DFC1_ENA_SHIFT 0 -#define MADERA_DFC1_ENA_WIDTH 1 /* (0x1482) DFC1_RX_W0 */ #define MADERA_DFC1_RX_DATA_WIDTH_MASK 0x1F00 #define MADERA_DFC1_RX_DATA_WIDTH_SHIFT 8 -#define MADERA_DFC1_RX_DATA_WIDTH_WIDTH 5 #define MADERA_DFC1_RX_DATA_TYPE_MASK 0x0007 #define MADERA_DFC1_RX_DATA_TYPE_SHIFT 0 -#define MADERA_DFC1_RX_DATA_TYPE_WIDTH 3 /* (0x1484) DFC1_TX_W0 */ #define MADERA_DFC1_TX_DATA_WIDTH_MASK 0x1F00 #define MADERA_DFC1_TX_DATA_WIDTH_SHIFT 8 -#define MADERA_DFC1_TX_DATA_WIDTH_WIDTH 5 #define MADERA_DFC1_TX_DATA_TYPE_MASK 0x0007 #define MADERA_DFC1_TX_DATA_TYPE_SHIFT 0 -#define MADERA_DFC1_TX_DATA_TYPE_WIDTH 3 /* (0x1600) ADSP2_IRQ0 */ #define MADERA_DSP_IRQ2 0x0002 @@ -3636,449 +3103,347 @@ #define MADERA_GP1_LVL 0x8000 #define MADERA_GP1_LVL_MASK 0x8000 #define MADERA_GP1_LVL_SHIFT 15 -#define 
MADERA_GP1_LVL_WIDTH 1 #define MADERA_GP1_OP_CFG 0x4000 #define MADERA_GP1_OP_CFG_MASK 0x4000 #define MADERA_GP1_OP_CFG_SHIFT 14 -#define MADERA_GP1_OP_CFG_WIDTH 1 #define MADERA_GP1_DB 0x2000 #define MADERA_GP1_DB_MASK 0x2000 #define MADERA_GP1_DB_SHIFT 13 -#define MADERA_GP1_DB_WIDTH 1 #define MADERA_GP1_POL 0x1000 #define MADERA_GP1_POL_MASK 0x1000 #define MADERA_GP1_POL_SHIFT 12 -#define MADERA_GP1_POL_WIDTH 1 #define MADERA_GP1_IP_CFG 0x0800 #define MADERA_GP1_IP_CFG_MASK 0x0800 #define MADERA_GP1_IP_CFG_SHIFT 11 -#define MADERA_GP1_IP_CFG_WIDTH 1 #define MADERA_GP1_FN_MASK 0x03FF #define MADERA_GP1_FN_SHIFT 0 -#define MADERA_GP1_FN_WIDTH 10 /* (0x1701) GPIO1_CTRL_2 */ #define MADERA_GP1_DIR 0x8000 #define MADERA_GP1_DIR_MASK 0x8000 #define MADERA_GP1_DIR_SHIFT 15 -#define MADERA_GP1_DIR_WIDTH 1 #define MADERA_GP1_PU 0x4000 #define MADERA_GP1_PU_MASK 0x4000 #define MADERA_GP1_PU_SHIFT 14 -#define MADERA_GP1_PU_WIDTH 1 #define MADERA_GP1_PD 0x2000 #define MADERA_GP1_PD_MASK 0x2000 #define MADERA_GP1_PD_SHIFT 13 -#define MADERA_GP1_PD_WIDTH 1 #define MADERA_GP1_DRV_STR_MASK 0x1800 #define MADERA_GP1_DRV_STR_SHIFT 11 -#define MADERA_GP1_DRV_STR_WIDTH 2 /* (0x1800) IRQ1_Status_1 */ #define MADERA_CTRLIF_ERR_EINT1 0x1000 #define MADERA_CTRLIF_ERR_EINT1_MASK 0x1000 #define MADERA_CTRLIF_ERR_EINT1_SHIFT 12 -#define MADERA_CTRLIF_ERR_EINT1_WIDTH 1 #define MADERA_SYSCLK_FAIL_EINT1 0x0200 #define MADERA_SYSCLK_FAIL_EINT1_MASK 0x0200 #define MADERA_SYSCLK_FAIL_EINT1_SHIFT 9 -#define MADERA_SYSCLK_FAIL_EINT1_WIDTH 1 #define MADERA_CLOCK_DETECT_EINT1 0x0100 #define MADERA_CLOCK_DETECT_EINT1_MASK 0x0100 #define MADERA_CLOCK_DETECT_EINT1_SHIFT 8 -#define MADERA_CLOCK_DETECT_EINT1_WIDTH 1 #define MADERA_BOOT_DONE_EINT1 0x0080 #define MADERA_BOOT_DONE_EINT1_MASK 0x0080 #define MADERA_BOOT_DONE_EINT1_SHIFT 7 -#define MADERA_BOOT_DONE_EINT1_WIDTH 1 /* (0x1801) IRQ1_Status_2 */ #define MADERA_FLLAO_LOCK_EINT1 0x0800 #define MADERA_FLLAO_LOCK_EINT1_MASK 0x0800 #define MADERA_FLLAO_LOCK_EINT1_SHIFT 11 -#define MADERA_FLLAO_LOCK_EINT1_WIDTH 1 #define MADERA_FLL3_LOCK_EINT1 0x0400 #define MADERA_FLL3_LOCK_EINT1_MASK 0x0400 #define MADERA_FLL3_LOCK_EINT1_SHIFT 10 -#define MADERA_FLL3_LOCK_EINT1_WIDTH 1 #define MADERA_FLL2_LOCK_EINT1 0x0200 #define MADERA_FLL2_LOCK_EINT1_MASK 0x0200 #define MADERA_FLL2_LOCK_EINT1_SHIFT 9 -#define MADERA_FLL2_LOCK_EINT1_WIDTH 1 #define MADERA_FLL1_LOCK_EINT1 0x0100 #define MADERA_FLL1_LOCK_EINT1_MASK 0x0100 #define MADERA_FLL1_LOCK_EINT1_SHIFT 8 -#define MADERA_FLL1_LOCK_EINT1_WIDTH 1 /* (0x1805) IRQ1_Status_6 */ #define MADERA_MICDET2_EINT1 0x0200 #define MADERA_MICDET2_EINT1_MASK 0x0200 #define MADERA_MICDET2_EINT1_SHIFT 9 -#define MADERA_MICDET2_EINT1_WIDTH 1 #define MADERA_MICDET1_EINT1 0x0100 #define MADERA_MICDET1_EINT1_MASK 0x0100 #define MADERA_MICDET1_EINT1_SHIFT 8 -#define MADERA_MICDET1_EINT1_WIDTH 1 #define MADERA_HPDET_EINT1 0x0001 #define MADERA_HPDET_EINT1_MASK 0x0001 #define MADERA_HPDET_EINT1_SHIFT 0 -#define MADERA_HPDET_EINT1_WIDTH 1 /* (0x1806) IRQ1_Status_7 */ #define MADERA_MICD_CLAMP_FALL_EINT1 0x0020 #define MADERA_MICD_CLAMP_FALL_EINT1_MASK 0x0020 #define MADERA_MICD_CLAMP_FALL_EINT1_SHIFT 5 -#define MADERA_MICD_CLAMP_FALL_EINT1_WIDTH 1 #define MADERA_MICD_CLAMP_RISE_EINT1 0x0010 #define MADERA_MICD_CLAMP_RISE_EINT1_MASK 0x0010 #define MADERA_MICD_CLAMP_RISE_EINT1_SHIFT 4 -#define MADERA_MICD_CLAMP_RISE_EINT1_WIDTH 1 #define MADERA_JD2_FALL_EINT1 0x0008 #define MADERA_JD2_FALL_EINT1_MASK 0x0008 #define MADERA_JD2_FALL_EINT1_SHIFT 3 -#define 
MADERA_JD2_FALL_EINT1_WIDTH 1 #define MADERA_JD2_RISE_EINT1 0x0004 #define MADERA_JD2_RISE_EINT1_MASK 0x0004 #define MADERA_JD2_RISE_EINT1_SHIFT 2 -#define MADERA_JD2_RISE_EINT1_WIDTH 1 #define MADERA_JD1_FALL_EINT1 0x0002 #define MADERA_JD1_FALL_EINT1_MASK 0x0002 #define MADERA_JD1_FALL_EINT1_SHIFT 1 -#define MADERA_JD1_FALL_EINT1_WIDTH 1 #define MADERA_JD1_RISE_EINT1 0x0001 #define MADERA_JD1_RISE_EINT1_MASK 0x0001 #define MADERA_JD1_RISE_EINT1_SHIFT 0 -#define MADERA_JD1_RISE_EINT1_WIDTH 1 /* (0x1808) IRQ1_Status_9 */ #define MADERA_ASRC2_IN2_LOCK_EINT1 0x0800 #define MADERA_ASRC2_IN2_LOCK_EINT1_MASK 0x0800 #define MADERA_ASRC2_IN2_LOCK_EINT1_SHIFT 11 -#define MADERA_ASRC2_IN2_LOCK_EINT1_WIDTH 1 #define MADERA_ASRC2_IN1_LOCK_EINT1 0x0400 #define MADERA_ASRC2_IN1_LOCK_EINT1_MASK 0x0400 #define MADERA_ASRC2_IN1_LOCK_EINT1_SHIFT 10 -#define MADERA_ASRC2_IN1_LOCK_EINT1_WIDTH 1 #define MADERA_ASRC1_IN2_LOCK_EINT1 0x0200 #define MADERA_ASRC1_IN2_LOCK_EINT1_MASK 0x0200 #define MADERA_ASRC1_IN2_LOCK_EINT1_SHIFT 9 -#define MADERA_ASRC1_IN2_LOCK_EINT1_WIDTH 1 #define MADERA_ASRC1_IN1_LOCK_EINT1 0x0100 #define MADERA_ASRC1_IN1_LOCK_EINT1_MASK 0x0100 #define MADERA_ASRC1_IN1_LOCK_EINT1_SHIFT 8 -#define MADERA_ASRC1_IN1_LOCK_EINT1_WIDTH 1 #define MADERA_DRC2_SIG_DET_EINT1 0x0002 #define MADERA_DRC2_SIG_DET_EINT1_MASK 0x0002 #define MADERA_DRC2_SIG_DET_EINT1_SHIFT 1 -#define MADERA_DRC2_SIG_DET_EINT1_WIDTH 1 #define MADERA_DRC1_SIG_DET_EINT1 0x0001 #define MADERA_DRC1_SIG_DET_EINT1_MASK 0x0001 #define MADERA_DRC1_SIG_DET_EINT1_SHIFT 0 -#define MADERA_DRC1_SIG_DET_EINT1_WIDTH 1 /* (0x180A) IRQ1_Status_11 */ #define MADERA_DSP_IRQ16_EINT1 0x8000 #define MADERA_DSP_IRQ16_EINT1_MASK 0x8000 #define MADERA_DSP_IRQ16_EINT1_SHIFT 15 -#define MADERA_DSP_IRQ16_EINT1_WIDTH 1 #define MADERA_DSP_IRQ15_EINT1 0x4000 #define MADERA_DSP_IRQ15_EINT1_MASK 0x4000 #define MADERA_DSP_IRQ15_EINT1_SHIFT 14 -#define MADERA_DSP_IRQ15_EINT1_WIDTH 1 #define MADERA_DSP_IRQ14_EINT1 0x2000 #define MADERA_DSP_IRQ14_EINT1_MASK 0x2000 #define MADERA_DSP_IRQ14_EINT1_SHIFT 13 -#define MADERA_DSP_IRQ14_EINT1_WIDTH 1 #define MADERA_DSP_IRQ13_EINT1 0x1000 #define MADERA_DSP_IRQ13_EINT1_MASK 0x1000 #define MADERA_DSP_IRQ13_EINT1_SHIFT 12 -#define MADERA_DSP_IRQ13_EINT1_WIDTH 1 #define MADERA_DSP_IRQ12_EINT1 0x0800 #define MADERA_DSP_IRQ12_EINT1_MASK 0x0800 #define MADERA_DSP_IRQ12_EINT1_SHIFT 11 -#define MADERA_DSP_IRQ12_EINT1_WIDTH 1 #define MADERA_DSP_IRQ11_EINT1 0x0400 #define MADERA_DSP_IRQ11_EINT1_MASK 0x0400 #define MADERA_DSP_IRQ11_EINT1_SHIFT 10 -#define MADERA_DSP_IRQ11_EINT1_WIDTH 1 #define MADERA_DSP_IRQ10_EINT1 0x0200 #define MADERA_DSP_IRQ10_EINT1_MASK 0x0200 #define MADERA_DSP_IRQ10_EINT1_SHIFT 9 -#define MADERA_DSP_IRQ10_EINT1_WIDTH 1 #define MADERA_DSP_IRQ9_EINT1 0x0100 #define MADERA_DSP_IRQ9_EINT1_MASK 0x0100 #define MADERA_DSP_IRQ9_EINT1_SHIFT 8 -#define MADERA_DSP_IRQ9_EINT1_WIDTH 1 #define MADERA_DSP_IRQ8_EINT1 0x0080 #define MADERA_DSP_IRQ8_EINT1_MASK 0x0080 #define MADERA_DSP_IRQ8_EINT1_SHIFT 7 -#define MADERA_DSP_IRQ8_EINT1_WIDTH 1 #define MADERA_DSP_IRQ7_EINT1 0x0040 #define MADERA_DSP_IRQ7_EINT1_MASK 0x0040 #define MADERA_DSP_IRQ7_EINT1_SHIFT 6 -#define MADERA_DSP_IRQ7_EINT1_WIDTH 1 #define MADERA_DSP_IRQ6_EINT1 0x0020 #define MADERA_DSP_IRQ6_EINT1_MASK 0x0020 #define MADERA_DSP_IRQ6_EINT1_SHIFT 5 -#define MADERA_DSP_IRQ6_EINT1_WIDTH 1 #define MADERA_DSP_IRQ5_EINT1 0x0010 #define MADERA_DSP_IRQ5_EINT1_MASK 0x0010 #define MADERA_DSP_IRQ5_EINT1_SHIFT 4 -#define MADERA_DSP_IRQ5_EINT1_WIDTH 1 #define 
MADERA_DSP_IRQ4_EINT1 0x0008 #define MADERA_DSP_IRQ4_EINT1_MASK 0x0008 #define MADERA_DSP_IRQ4_EINT1_SHIFT 3 -#define MADERA_DSP_IRQ4_EINT1_WIDTH 1 #define MADERA_DSP_IRQ3_EINT1 0x0004 #define MADERA_DSP_IRQ3_EINT1_MASK 0x0004 #define MADERA_DSP_IRQ3_EINT1_SHIFT 2 -#define MADERA_DSP_IRQ3_EINT1_WIDTH 1 #define MADERA_DSP_IRQ2_EINT1 0x0002 #define MADERA_DSP_IRQ2_EINT1_MASK 0x0002 #define MADERA_DSP_IRQ2_EINT1_SHIFT 1 -#define MADERA_DSP_IRQ2_EINT1_WIDTH 1 #define MADERA_DSP_IRQ1_EINT1 0x0001 #define MADERA_DSP_IRQ1_EINT1_MASK 0x0001 #define MADERA_DSP_IRQ1_EINT1_SHIFT 0 -#define MADERA_DSP_IRQ1_EINT1_WIDTH 1 /* (0x180B) IRQ1_Status_12 */ #define MADERA_SPKOUTR_SC_EINT1 0x0080 #define MADERA_SPKOUTR_SC_EINT1_MASK 0x0080 #define MADERA_SPKOUTR_SC_EINT1_SHIFT 7 -#define MADERA_SPKOUTR_SC_EINT1_WIDTH 1 #define MADERA_SPKOUTL_SC_EINT1 0x0040 #define MADERA_SPKOUTL_SC_EINT1_MASK 0x0040 #define MADERA_SPKOUTL_SC_EINT1_SHIFT 6 -#define MADERA_SPKOUTL_SC_EINT1_WIDTH 1 #define MADERA_HP3R_SC_EINT1 0x0020 #define MADERA_HP3R_SC_EINT1_MASK 0x0020 #define MADERA_HP3R_SC_EINT1_SHIFT 5 -#define MADERA_HP3R_SC_EINT1_WIDTH 1 #define MADERA_HP3L_SC_EINT1 0x0010 #define MADERA_HP3L_SC_EINT1_MASK 0x0010 #define MADERA_HP3L_SC_EINT1_SHIFT 4 -#define MADERA_HP3L_SC_EINT1_WIDTH 1 #define MADERA_HP2R_SC_EINT1 0x0008 #define MADERA_HP2R_SC_EINT1_MASK 0x0008 #define MADERA_HP2R_SC_EINT1_SHIFT 3 -#define MADERA_HP2R_SC_EINT1_WIDTH 1 #define MADERA_HP2L_SC_EINT1 0x0004 #define MADERA_HP2L_SC_EINT1_MASK 0x0004 #define MADERA_HP2L_SC_EINT1_SHIFT 2 -#define MADERA_HP2L_SC_EINT1_WIDTH 1 #define MADERA_HP1R_SC_EINT1 0x0002 #define MADERA_HP1R_SC_EINT1_MASK 0x0002 #define MADERA_HP1R_SC_EINT1_SHIFT 1 -#define MADERA_HP1R_SC_EINT1_WIDTH 1 #define MADERA_HP1L_SC_EINT1 0x0001 #define MADERA_HP1L_SC_EINT1_MASK 0x0001 #define MADERA_HP1L_SC_EINT1_SHIFT 0 -#define MADERA_HP1L_SC_EINT1_WIDTH 1 /* (0x180E) IRQ1_Status_15 */ #define MADERA_SPK_OVERHEAT_WARN_EINT1 0x0004 #define MADERA_SPK_OVERHEAT_WARN_EINT1_MASK 0x0004 #define MADERA_SPK_OVERHEAT_WARN_EINT1_SHIFT 2 -#define MADERA_SPK_OVERHEAT_WARN_EINT1_WIDTH 1 #define MADERA_SPK_OVERHEAT_EINT1 0x0002 #define MADERA_SPK_OVERHEAT_EINT1_MASK 0x0002 #define MADERA_SPK_OVERHEAT_EINT1_SHIFT 1 -#define MADERA_SPK_OVERHEAT_EINT1_WIDTH 1 #define MADERA_SPK_SHUTDOWN_EINT1 0x0001 #define MADERA_SPK_SHUTDOWN_EINT1_MASK 0x0001 #define MADERA_SPK_SHUTDOWN_EINT1_SHIFT 0 -#define MADERA_SPK_SHUTDOWN_EINT1_WIDTH 1 /* (0x1820) - IRQ1 Status 33 */ #define MADERA_DSP7_BUS_ERR_EINT1 0x0040 #define MADERA_DSP7_BUS_ERR_EINT1_MASK 0x0040 #define MADERA_DSP7_BUS_ERR_EINT1_SHIFT 6 -#define MADERA_DSP7_BUS_ERR_EINT1_WIDTH 1 #define MADERA_DSP6_BUS_ERR_EINT1 0x0020 #define MADERA_DSP6_BUS_ERR_EINT1_MASK 0x0020 #define MADERA_DSP6_BUS_ERR_EINT1_SHIFT 5 -#define MADERA_DSP6_BUS_ERR_EINT1_WIDTH 1 #define MADERA_DSP5_BUS_ERR_EINT1 0x0010 #define MADERA_DSP5_BUS_ERR_EINT1_MASK 0x0010 #define MADERA_DSP5_BUS_ERR_EINT1_SHIFT 4 -#define MADERA_DSP5_BUS_ERR_EINT1_WIDTH 1 #define MADERA_DSP4_BUS_ERR_EINT1 0x0008 #define MADERA_DSP4_BUS_ERR_EINT1_MASK 0x0008 #define MADERA_DSP4_BUS_ERR_EINT1_SHIFT 3 -#define MADERA_DSP4_BUS_ERR_EINT1_WIDTH 1 #define MADERA_DSP3_BUS_ERR_EINT1 0x0004 #define MADERA_DSP3_BUS_ERR_EINT1_MASK 0x0004 #define MADERA_DSP3_BUS_ERR_EINT1_SHIFT 2 -#define MADERA_DSP3_BUS_ERR_EINT1_WIDTH 1 #define MADERA_DSP2_BUS_ERR_EINT1 0x0002 #define MADERA_DSP2_BUS_ERR_EINT1_MASK 0x0002 #define MADERA_DSP2_BUS_ERR_EINT1_SHIFT 1 -#define MADERA_DSP2_BUS_ERR_EINT1_WIDTH 1 #define MADERA_DSP1_BUS_ERR_EINT1 
0x0001 #define MADERA_DSP1_BUS_ERR_EINT1_MASK 0x0001 #define MADERA_DSP1_BUS_ERR_EINT1_SHIFT 0 -#define MADERA_DSP1_BUS_ERR_EINT1_WIDTH 1 /* (0x1845) IRQ1_Mask_6 */ #define MADERA_IM_MICDET2_EINT1 0x0200 #define MADERA_IM_MICDET2_EINT1_MASK 0x0200 #define MADERA_IM_MICDET2_EINT1_SHIFT 9 -#define MADERA_IM_MICDET2_EINT1_WIDTH 1 #define MADERA_IM_MICDET1_EINT1 0x0100 #define MADERA_IM_MICDET1_EINT1_MASK 0x0100 #define MADERA_IM_MICDET1_EINT1_SHIFT 8 -#define MADERA_IM_MICDET1_EINT1_WIDTH 1 #define MADERA_IM_HPDET_EINT1 0x0001 #define MADERA_IM_HPDET_EINT1_MASK 0x0001 #define MADERA_IM_HPDET_EINT1_SHIFT 0 -#define MADERA_IM_HPDET_EINT1_WIDTH 1 /* (0x184E) IRQ1_Mask_15 */ #define MADERA_IM_SPK_OVERHEAT_WARN_EINT1 0x0004 #define MADERA_IM_SPK_OVERHEAT_WARN_EINT1_MASK 0x0004 #define MADERA_IM_SPK_OVERHEAT_WARN_EINT1_SHIFT 2 -#define MADERA_IM_SPK_OVERHEAT_WARN_EINT1_WIDTH 1 #define MADERA_IM_SPK_OVERHEAT_EINT1 0x0002 #define MADERA_IM_SPK_OVERHEAT_EINT1_MASK 0x0002 #define MADERA_IM_SPK_OVERHEAT_EINT1_SHIFT 1 -#define MADERA_IM_SPK_OVERHEAT_EINT1_WIDTH 1 #define MADERA_IM_SPK_SHUTDOWN_EINT1 0x0001 #define MADERA_IM_SPK_SHUTDOWN_EINT1_MASK 0x0001 #define MADERA_IM_SPK_SHUTDOWN_EINT1_SHIFT 0 -#define MADERA_IM_SPK_SHUTDOWN_EINT1_WIDTH 1 /* (0x1880) - IRQ1 Raw Status 1 */ #define MADERA_CTRLIF_ERR_STS1 0x1000 #define MADERA_CTRLIF_ERR_STS1_MASK 0x1000 #define MADERA_CTRLIF_ERR_STS1_SHIFT 12 -#define MADERA_CTRLIF_ERR_STS1_WIDTH 1 #define MADERA_SYSCLK_FAIL_STS1 0x0200 #define MADERA_SYSCLK_FAIL_STS1_MASK 0x0200 #define MADERA_SYSCLK_FAIL_STS1_SHIFT 9 -#define MADERA_SYSCLK_FAIL_STS1_WIDTH 1 #define MADERA_CLOCK_DETECT_STS1 0x0100 #define MADERA_CLOCK_DETECT_STS1_MASK 0x0100 #define MADERA_CLOCK_DETECT_STS1_SHIFT 8 -#define MADERA_CLOCK_DETECT_STS1_WIDTH 1 #define MADERA_BOOT_DONE_STS1 0x0080 #define MADERA_BOOT_DONE_STS1_MASK 0x0080 #define MADERA_BOOT_DONE_STS1_SHIFT 7 -#define MADERA_BOOT_DONE_STS1_WIDTH 1 /* (0x1881) - IRQ1 Raw Status 2 */ #define MADERA_FLL3_LOCK_STS1 0x0400 #define MADERA_FLL3_LOCK_STS1_MASK 0x0400 #define MADERA_FLL3_LOCK_STS1_SHIFT 10 -#define MADERA_FLL3_LOCK_STS1_WIDTH 1 #define MADERA_FLL2_LOCK_STS1 0x0200 #define MADERA_FLL2_LOCK_STS1_MASK 0x0200 #define MADERA_FLL2_LOCK_STS1_SHIFT 9 -#define MADERA_FLL2_LOCK_STS1_WIDTH 1 #define MADERA_FLL1_LOCK_STS1 0x0100 #define MADERA_FLL1_LOCK_STS1_MASK 0x0100 #define MADERA_FLL1_LOCK_STS1_SHIFT 8 -#define MADERA_FLL1_LOCK_STS1_WIDTH 1 /* (0x1886) - IRQ1 Raw Status 7 */ #define MADERA_MICD_CLAMP_FALL_STS1 0x0020 #define MADERA_MICD_CLAMP_FALL_STS1_MASK 0x0020 #define MADERA_MICD_CLAMP_FALL_STS1_SHIFT 5 -#define MADERA_MICD_CLAMP_FALL_STS1_WIDTH 1 #define MADERA_MICD_CLAMP_RISE_STS1 0x0010 #define MADERA_MICD_CLAMP_RISE_STS1_MASK 0x0010 #define MADERA_MICD_CLAMP_RISE_STS1_SHIFT 4 -#define MADERA_MICD_CLAMP_RISE_STS1_WIDTH 1 #define MADERA_JD2_FALL_STS1 0x0008 #define MADERA_JD2_FALL_STS1_MASK 0x0008 #define MADERA_JD2_FALL_STS1_SHIFT 3 -#define MADERA_JD2_FALL_STS1_WIDTH 1 #define MADERA_JD2_RISE_STS1 0x0004 #define MADERA_JD2_RISE_STS1_MASK 0x0004 #define MADERA_JD2_RISE_STS1_SHIFT 2 -#define MADERA_JD2_RISE_STS1_WIDTH 1 #define MADERA_JD1_FALL_STS1 0x0002 #define MADERA_JD1_FALL_STS1_MASK 0x0002 #define MADERA_JD1_FALL_STS1_SHIFT 1 -#define MADERA_JD1_FALL_STS1_WIDTH 1 #define MADERA_JD1_RISE_STS1 0x0001 #define MADERA_JD1_RISE_STS1_MASK 0x0001 #define MADERA_JD1_RISE_STS1_SHIFT 0 -#define MADERA_JD1_RISE_STS1_WIDTH 1 /* (0x188E) - IRQ1 Raw Status 15 */ #define MADERA_SPK_OVERHEAT_WARN_STS1 0x0004 #define 
MADERA_SPK_OVERHEAT_WARN_STS1_MASK 0x0004 #define MADERA_SPK_OVERHEAT_WARN_STS1_SHIFT 2 -#define MADERA_SPK_OVERHEAT_WARN_STS1_WIDTH 1 #define MADERA_SPK_OVERHEAT_STS1 0x0002 #define MADERA_SPK_OVERHEAT_STS1_MASK 0x0002 #define MADERA_SPK_OVERHEAT_STS1_SHIFT 1 -#define MADERA_SPK_OVERHEAT_STS1_WIDTH 1 #define MADERA_SPK_SHUTDOWN_STS1 0x0001 #define MADERA_SPK_SHUTDOWN_STS1_MASK 0x0001 #define MADERA_SPK_SHUTDOWN_STS1_SHIFT 0 -#define MADERA_SPK_SHUTDOWN_STS1_WIDTH 1 /* (0x1A06) Interrupt_Debounce_7 */ #define MADERA_MICD_CLAMP_DB 0x0010 #define MADERA_MICD_CLAMP_DB_MASK 0x0010 #define MADERA_MICD_CLAMP_DB_SHIFT 4 -#define MADERA_MICD_CLAMP_DB_WIDTH 1 #define MADERA_JD2_DB 0x0004 #define MADERA_JD2_DB_MASK 0x0004 #define MADERA_JD2_DB_SHIFT 2 -#define MADERA_JD2_DB_WIDTH 1 #define MADERA_JD1_DB 0x0001 #define MADERA_JD1_DB_MASK 0x0001 #define MADERA_JD1_DB_SHIFT 0 -#define MADERA_JD1_DB_WIDTH 1 /* (0x1A0E) Interrupt_Debounce_15 */ #define MADERA_SPK_OVERHEAT_WARN_DB 0x0004 #define MADERA_SPK_OVERHEAT_WARN_DB_MASK 0x0004 #define MADERA_SPK_OVERHEAT_WARN_DB_SHIFT 2 -#define MADERA_SPK_OVERHEAT_WARN_DB_WIDTH 1 #define MADERA_SPK_OVERHEAT_DB 0x0002 #define MADERA_SPK_OVERHEAT_DB_MASK 0x0002 #define MADERA_SPK_OVERHEAT_DB_SHIFT 1 -#define MADERA_SPK_OVERHEAT_DB_WIDTH 1 /* (0x1A80) IRQ1_CTRL */ #define MADERA_IM_IRQ1 0x0800 #define MADERA_IM_IRQ1_MASK 0x0800 #define MADERA_IM_IRQ1_SHIFT 11 -#define MADERA_IM_IRQ1_WIDTH 1 #define MADERA_IRQ_POL 0x0400 #define MADERA_IRQ_POL_MASK 0x0400 #define MADERA_IRQ_POL_SHIFT 10 -#define MADERA_IRQ_POL_WIDTH 1 /* (0x20004) OTP_HPDET_Cal_1 */ #define MADERA_OTP_HPDET_CALIB_OFFSET_11 0xFF000000 #define MADERA_OTP_HPDET_CALIB_OFFSET_11_MASK 0xFF000000 #define MADERA_OTP_HPDET_CALIB_OFFSET_11_SHIFT 24 -#define MADERA_OTP_HPDET_CALIB_OFFSET_11_WIDTH 8 #define MADERA_OTP_HPDET_CALIB_OFFSET_10 0x00FF0000 #define MADERA_OTP_HPDET_CALIB_OFFSET_10_MASK 0x00FF0000 #define MADERA_OTP_HPDET_CALIB_OFFSET_10_SHIFT 16 -#define MADERA_OTP_HPDET_CALIB_OFFSET_10_WIDTH 8 #define MADERA_OTP_HPDET_CALIB_OFFSET_01 0x0000FF00 #define MADERA_OTP_HPDET_CALIB_OFFSET_01_MASK 0x0000FF00 #define MADERA_OTP_HPDET_CALIB_OFFSET_01_SHIFT 8 -#define MADERA_OTP_HPDET_CALIB_OFFSET_01_WIDTH 8 #define MADERA_OTP_HPDET_CALIB_OFFSET_00 0x000000FF #define MADERA_OTP_HPDET_CALIB_OFFSET_00_MASK 0x000000FF #define MADERA_OTP_HPDET_CALIB_OFFSET_00_SHIFT 0 -#define MADERA_OTP_HPDET_CALIB_OFFSET_00_WIDTH 8 /* (0x20006) OTP_HPDET_Cal_2 */ #define MADERA_OTP_HPDET_GRADIENT_1X 0x0000FF00 #define MADERA_OTP_HPDET_GRADIENT_1X_MASK 0x0000FF00 #define MADERA_OTP_HPDET_GRADIENT_1X_SHIFT 8 -#define MADERA_OTP_HPDET_GRADIENT_1X_WIDTH 8 #define MADERA_OTP_HPDET_GRADIENT_0X 0x000000FF #define MADERA_OTP_HPDET_GRADIENT_0X_MASK 0x000000FF #define MADERA_OTP_HPDET_GRADIENT_0X_SHIFT 0 -#define MADERA_OTP_HPDET_GRADIENT_0X_WIDTH 8 #endif -- cgit From 28faad777c2d1480dfdda6697e58c06cf9011ebc Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Sun, 27 Sep 2020 01:59:17 +0200 Subject: mfd: tps65910: Clean up after switching to regmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove wrappers around regmap calls to remove now-useless indirection. 
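To make the cleanup concrete, a minimal sketch of one call site before and after, using only the wrapper and regmap helpers that appear in the diff below (the exact call sites are in tps65910.c):

	/* before: through the trivial wrapper (sketch, not part of the patch) */
	ret = tps65910_reg_set_bits(tps65910, TPS65910_DEVCTRL, DEVCTRL_DEV_SLP_MASK);

	/* after: direct regmap call, same behaviour, one less layer of indirection */
	ret = regmap_set_bits(tps65910->regmap, TPS65910_DEVCTRL, DEVCTRL_DEV_SLP_MASK);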
Signed-off-by: Michał Mirosław Signed-off-by: Lee Jones --- drivers/mfd/tps65910.c | 16 ++++++++-------- include/linux/mfd/tps65910.h | 35 ----------------------------------- 2 files changed, 8 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c index a6c28df7aa38..cbfc6afb3928 100644 --- a/drivers/mfd/tps65910.c +++ b/drivers/mfd/tps65910.c @@ -292,7 +292,7 @@ static int tps65910_ck32k_init(struct tps65910 *tps65910, if (!pmic_pdata->en_ck32k_xtal) return 0; - ret = tps65910_reg_clear_bits(tps65910, TPS65910_DEVCTRL, + ret = regmap_clear_bits(tps65910->regmap, TPS65910_DEVCTRL, DEVCTRL_CK32K_CTRL_MASK); if (ret < 0) { dev_err(tps65910->dev, "clear ck32k_ctrl failed: %d\n", ret); @@ -314,7 +314,7 @@ static int tps65910_sleepinit(struct tps65910 *tps65910, dev = tps65910->dev; /* enabling SLEEP device state */ - ret = tps65910_reg_set_bits(tps65910, TPS65910_DEVCTRL, + ret = regmap_set_bits(tps65910->regmap, TPS65910_DEVCTRL, DEVCTRL_DEV_SLP_MASK); if (ret < 0) { dev_err(dev, "set dev_slp failed: %d\n", ret); @@ -322,7 +322,7 @@ static int tps65910_sleepinit(struct tps65910 *tps65910, } if (pmic_pdata->slp_keepon.therm_keepon) { - ret = tps65910_reg_set_bits(tps65910, + ret = regmap_set_bits(tps65910->regmap, TPS65910_SLEEP_KEEP_RES_ON, SLEEP_KEEP_RES_ON_THERM_KEEPON_MASK); if (ret < 0) { @@ -332,7 +332,7 @@ static int tps65910_sleepinit(struct tps65910 *tps65910, } if (pmic_pdata->slp_keepon.clkout32k_keepon) { - ret = tps65910_reg_set_bits(tps65910, + ret = regmap_set_bits(tps65910->regmap, TPS65910_SLEEP_KEEP_RES_ON, SLEEP_KEEP_RES_ON_CLKOUT32K_KEEPON_MASK); if (ret < 0) { @@ -342,7 +342,7 @@ static int tps65910_sleepinit(struct tps65910 *tps65910, } if (pmic_pdata->slp_keepon.i2chs_keepon) { - ret = tps65910_reg_set_bits(tps65910, + ret = regmap_set_bits(tps65910->regmap, TPS65910_SLEEP_KEEP_RES_ON, SLEEP_KEEP_RES_ON_I2CHS_KEEPON_MASK); if (ret < 0) { @@ -354,7 +354,7 @@ static int tps65910_sleepinit(struct tps65910 *tps65910, return 0; disable_dev_slp: - tps65910_reg_clear_bits(tps65910, TPS65910_DEVCTRL, + regmap_clear_bits(tps65910->regmap, TPS65910_DEVCTRL, DEVCTRL_DEV_SLP_MASK); err_sleep_init: @@ -436,11 +436,11 @@ static void tps65910_power_off(void) tps65910 = dev_get_drvdata(&tps65910_i2c_client->dev); - if (tps65910_reg_set_bits(tps65910, TPS65910_DEVCTRL, + if (regmap_set_bits(tps65910->regmap, TPS65910_DEVCTRL, DEVCTRL_PWR_OFF_MASK) < 0) return; - tps65910_reg_clear_bits(tps65910, TPS65910_DEVCTRL, + regmap_clear_bits(tps65910->regmap, TPS65910_DEVCTRL, DEVCTRL_DEV_ON_MASK); } diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h index ce4b9e743f7c..f7398d982f23 100644 --- a/include/linux/mfd/tps65910.h +++ b/include/linux/mfd/tps65910.h @@ -913,39 +913,4 @@ static inline int tps65910_chip_id(struct tps65910 *tps65910) return tps65910->id; } -static inline int tps65910_reg_read(struct tps65910 *tps65910, u8 reg, - unsigned int *val) -{ - return regmap_read(tps65910->regmap, reg, val); -} - -static inline int tps65910_reg_write(struct tps65910 *tps65910, u8 reg, - unsigned int val) -{ - return regmap_write(tps65910->regmap, reg, val); -} - -static inline int tps65910_reg_set_bits(struct tps65910 *tps65910, u8 reg, - u8 mask) -{ - return regmap_update_bits(tps65910->regmap, reg, mask, mask); -} - -static inline int tps65910_reg_clear_bits(struct tps65910 *tps65910, u8 reg, - u8 mask) -{ - return regmap_update_bits(tps65910->regmap, reg, mask, 0); -} - -static inline int 
tps65910_reg_update_bits(struct tps65910 *tps65910, u8 reg, - u8 mask, u8 val) -{ - return regmap_update_bits(tps65910->regmap, reg, mask, val); -} - -static inline int tps65910_irq_get_virq(struct tps65910 *tps65910, int irq) -{ - return regmap_irq_get_virq(tps65910->irq_data, irq); -} - #endif /* __LINUX_MFD_TPS65910_H */ -- cgit From 9f5b98f3f4149a10c315ddb4d0bed033f398e8ec Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Sun, 27 Sep 2020 01:59:18 +0200 Subject: mfd: tps65910: Remove unused pointers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Client pointers in tps65910 data are not used in the drivers. Remove those fields. Signed-off-by: Michał Mirosław Signed-off-by: Lee Jones --- include/linux/mfd/tps65910.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h index f7398d982f23..701925db75b3 100644 --- a/include/linux/mfd/tps65910.h +++ b/include/linux/mfd/tps65910.h @@ -890,11 +890,6 @@ struct tps65910 { struct regmap *regmap; unsigned long id; - /* Client devices */ - struct tps65910_pmic *pmic; - struct tps65910_rtc *rtc; - struct tps65910_power *power; - /* Device node parsed board data */ struct tps65910_board *of_plat_data; -- cgit From f594d01bb4aff35dc582f5418e6823f79e28834b Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 27 Oct 2020 09:41:32 +0000 Subject: mfd: madera: Add special errata reset handling for cs47l15 An erratum exists for cs47l15 where the reset must be handled differently and released before DCVDD is applied. A soft reset is used instead for situations where a reset is required to return the device to a known state. This does, however, make this part unsuitable for DCVDD supplies with a rise time greater than 2 ms.
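The ordering change can be sketched as follows (condensed from the diff below; error handling and regcache handling omitted):

        /* Normal parts: enable DCVDD while reset is asserted, then release reset */
        madera_enable_hard_reset(madera);
        ret = regulator_enable(madera->dcvdd);
        madera_disable_hard_reset(madera);

        /* cs47l15 erratum: release reset first, then enable DCVDD and wait
         * for the supply to settle before touching the device
         */
        madera_disable_hard_reset(madera);
        ret = regulator_enable(madera->dcvdd);
        usleep_range(ERRATA_DCVDD_MIN_US, ERRATA_DCVDD_MAX_US);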
Signed-off-by: Charles Keepax Signed-off-by: Lee Jones --- drivers/mfd/madera-core.c | 25 ++++++++++++++++++++----- include/linux/mfd/madera/core.h | 1 + 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/madera-core.c b/drivers/mfd/madera-core.c index a9c6f0833f32..a2abc0094def 100644 --- a/drivers/mfd/madera-core.c +++ b/drivers/mfd/madera-core.c @@ -38,6 +38,9 @@ #define MADERA_RESET_MIN_US 2000 #define MADERA_RESET_MAX_US 3000 +#define ERRATA_DCVDD_MIN_US 10000 +#define ERRATA_DCVDD_MAX_US 15000 + static const char * const madera_core_supplies[] = { "AVDD", "DBVDD1", @@ -291,7 +294,8 @@ static int __maybe_unused madera_runtime_resume(struct device *dev) dev_dbg(dev, "Leaving sleep mode\n"); - madera_enable_hard_reset(madera); + if (!madera->reset_errata) + madera_enable_hard_reset(madera); ret = regulator_enable(madera->dcvdd); if (ret) { @@ -302,9 +306,12 @@ static int __maybe_unused madera_runtime_resume(struct device *dev) regcache_cache_only(madera->regmap, false); regcache_cache_only(madera->regmap_32bit, false); - madera_disable_hard_reset(madera); + if (madera->reset_errata) + usleep_range(ERRATA_DCVDD_MIN_US, ERRATA_DCVDD_MAX_US); + else + madera_disable_hard_reset(madera); - if (!madera->pdata.reset) { + if (!madera->pdata.reset || madera->reset_errata) { ret = madera_wait_for_boot(madera); if (ret) goto err; @@ -503,6 +510,8 @@ int madera_dev_init(struct madera *madera) */ switch (madera->type) { case CS47L15: + madera->reset_errata = true; + break; case CS47L35: case CS47L90: case CS47L91: @@ -553,13 +562,19 @@ int madera_dev_init(struct madera *madera) goto err_dcvdd; } + if (madera->reset_errata) + madera_disable_hard_reset(madera); + ret = regulator_enable(madera->dcvdd); if (ret) { dev_err(dev, "Failed to enable DCVDD: %d\n", ret); goto err_enable; } - madera_disable_hard_reset(madera); + if (madera->reset_errata) + usleep_range(ERRATA_DCVDD_MIN_US, ERRATA_DCVDD_MAX_US); + else + madera_disable_hard_reset(madera); regcache_cache_only(madera->regmap, false); regcache_cache_only(madera->regmap_32bit, false); @@ -667,7 +682,7 @@ int madera_dev_init(struct madera *madera) * It looks like a device we support. If we don't have a hard reset * we can now attempt a soft reset. */ - if (!madera->pdata.reset) { + if (!madera->pdata.reset || madera->reset_errata) { ret = madera_soft_reset(madera); if (ret) goto err_reset; diff --git a/include/linux/mfd/madera/core.h b/include/linux/mfd/madera/core.h index ad2c138105d4..03a8a788424a 100644 --- a/include/linux/mfd/madera/core.h +++ b/include/linux/mfd/madera/core.h @@ -186,6 +186,7 @@ struct madera { struct regulator_bulk_data core_supplies[MADERA_MAX_CORE_SUPPLIES]; struct regulator *dcvdd; bool internal_dcvdd; + bool reset_errata; struct madera_pdata pdata; -- cgit From 4556fe8f16e0225f5df7a57d123e0d55717bf2aa Mon Sep 17 00:00:00 2001 From: Michael Srba Date: Tue, 10 Nov 2020 14:00:47 +0100 Subject: mfd: rt5033: Fix erroneous defines Fix regulators on rt5033 by converting some values to bitmasks which were erroneously not defined as such in the header file.
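To see why the old values were wrong, note that a bit position used as a mask selects entirely different bits; BIT(n) from <linux/bits.h> yields the single-bit mask the hardware expects. A minimal sketch, with illustrative values:

        #include <linux/bits.h>

        /* 0x05 is 0b101: as a mask it selects bits 0 and 2, not bit 5 */
        #define WRONG_EN_LDO_MASK       0x05
        /* BIT(5) is 0x20: exactly bit 5 */
        #define RT5033_CTRL_EN_LDO_MASK BIT(5)

        /* regmap_update_bits(map, reg, mask, val) only touches bits set in
         * the mask, so a mis-defined mask silently updates the wrong bits.
         */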
Cc: Beomho Seo Fixes: 0b271258544b ("mfd: rt5033: Add Richtek RT5033 driver core.") Signed-off-by: Michael Srba Signed-off-by: Lee Jones --- include/linux/mfd/rt5033-private.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/rt5033-private.h b/include/linux/mfd/rt5033-private.h index f812105c538c..2d1895c3efbf 100644 --- a/include/linux/mfd/rt5033-private.h +++ b/include/linux/mfd/rt5033-private.h @@ -91,14 +91,14 @@ enum rt5033_reg { #define RT5033_RT_HZ_MASK 0x01 /* RT5033 control register */ -#define RT5033_CTRL_FCCM_BUCK_MASK 0x00 -#define RT5033_CTRL_BUCKOMS_MASK 0x01 -#define RT5033_CTRL_LDOOMS_MASK 0x02 -#define RT5033_CTRL_SLDOOMS_MASK 0x03 -#define RT5033_CTRL_EN_BUCK_MASK 0x04 -#define RT5033_CTRL_EN_LDO_MASK 0x05 -#define RT5033_CTRL_EN_SAFE_LDO_MASK 0x06 -#define RT5033_CTRL_LDO_SLEEP_MASK 0x07 +#define RT5033_CTRL_FCCM_BUCK_MASK BIT(0) +#define RT5033_CTRL_BUCKOMS_MASK BIT(1) +#define RT5033_CTRL_LDOOMS_MASK BIT(2) +#define RT5033_CTRL_SLDOOMS_MASK BIT(3) +#define RT5033_CTRL_EN_BUCK_MASK BIT(4) +#define RT5033_CTRL_EN_LDO_MASK BIT(5) +#define RT5033_CTRL_EN_SAFE_LDO_MASK BIT(6) +#define RT5033_CTRL_LDO_SLEEP_MASK BIT(7) /* RT5033 BUCK control register */ #define RT5033_BUCK_CTRL_MASK 0x1f @@ -247,11 +247,11 @@ enum rt5033_fuel_reg { #define RT5033_FUEL_BAT_PRESENT 0x02 /* RT5033 PMIC interrupts */ -#define RT5033_PMIC_IRQ_BUCKOCP 2 -#define RT5033_PMIC_IRQ_BUCKLV 3 -#define RT5033_PMIC_IRQ_SAFELDOLV 4 -#define RT5033_PMIC_IRQ_LDOLV 5 -#define RT5033_PMIC_IRQ_OT 6 -#define RT5033_PMIC_IRQ_VDDA_UV 7 +#define RT5033_PMIC_IRQ_BUCKOCP BIT(2) +#define RT5033_PMIC_IRQ_BUCKLV BIT(3) +#define RT5033_PMIC_IRQ_SAFELDOLV BIT(4) +#define RT5033_PMIC_IRQ_LDOLV BIT(5) +#define RT5033_PMIC_IRQ_OT BIT(6) +#define RT5033_PMIC_IRQ_VDDA_UV BIT(7) #endif /* __RT5033_PRIVATE_H__ */ -- cgit From 295992fb815e791d14b18ef7cdbbaf1a76211a31 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 14 Sep 2020 15:09:33 +0200 Subject: mm: introduce vma_set_file function v5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the new vma_set_file() function to allow changing vma->vm_file with the necessary refcount dance. v2: add more users of this. v3: add missing EXPORT_SYMBOL, rebase on mmap cleanup, add comments why we drop the reference on two occasions. v4: make it clear that changing an anonymous vma is illegal. 
v5: move vma_set_file to mm/util.c Signed-off-by: Christian König Reviewed-by: Daniel Vetter (v2) Reviewed-by: Jason Gunthorpe Acked-by: Andrew Morton Link: https://patchwork.freedesktop.org/patch/399360/ --- drivers/dma-buf/dma-buf.c | 3 +-- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 4 +--- drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 3 +-- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 5 +++-- drivers/gpu/drm/msm/msm_gem.c | 4 +--- drivers/gpu/drm/omapdrm/omap_gem.c | 3 +-- drivers/gpu/drm/vgem/vgem_drv.c | 3 +-- drivers/staging/android/ashmem.c | 6 +++--- include/linux/mm.h | 2 ++ mm/util.c | 12 ++++++++++++ 10 files changed, 26 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 282bd8b84170..e63684d4cd90 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1183,8 +1183,7 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma, return -EINVAL; /* readjust the vma */ - fput(vma->vm_file); - vma->vm_file = get_file(dmabuf->file); + vma_set_file(vma, dmabuf->file); vma->vm_pgoff = pgoff; return dmabuf->ops->mmap(dmabuf, vma); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index bbd235473645..6d38c5c17f23 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -145,10 +145,8 @@ static int etnaviv_gem_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ - fput(vma->vm_file); - get_file(etnaviv_obj->base.filp); vma->vm_pgoff = 0; - vma->vm_file = etnaviv_obj->base.filp; + vma_set_file(vma, etnaviv_obj->base.filp); vma->vm_page_prot = vm_page_prot; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index 0dd477e56573..04e9c04545ad 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -114,8 +114,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct * if (ret) return ret; - fput(vma->vm_file); - vma->vm_file = get_file(obj->base.filp); + vma_set_file(vma, obj->base.filp); return 0; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 3d69e51f3e4d..ec28a6cde49b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -893,8 +893,9 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) * requires avoiding extraneous references to their filp, hence why * we prefer to use an anonymous file for their mmaps. */ - fput(vma->vm_file); - vma->vm_file = anon; + vma_set_file(vma, anon); + /* Drop the initial creation reference, the vma is now holding one. 
*/ + fput(anon); switch (mmo->mmap_type) { case I915_MMAP_TYPE_WC: diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 2e1bce7c0b19..311721ceee50 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -212,10 +212,8 @@ int msm_gem_mmap_obj(struct drm_gem_object *obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ - fput(vma->vm_file); - get_file(obj->filp); vma->vm_pgoff = 0; - vma->vm_file = obj->filp; + vma_set_file(vma, obj->filp); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index 68c271f4250b..30d299ca8795 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -564,9 +564,8 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ - fput(vma->vm_file); vma->vm_pgoff = 0; - vma->vm_file = get_file(obj->filp); + vma_set_file(vma, obj->filp); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index 9a413091abb6..f8635ccaf9a1 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -403,8 +403,7 @@ static int vgem_prime_mmap(struct drm_gem_object *obj, if (ret) return ret; - fput(vma->vm_file); - vma->vm_file = get_file(obj->filp); + vma_set_file(vma, obj->filp); vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 10b4be1f3e78..4789d36ddfd3 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -450,9 +450,9 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) vma_set_anonymous(vma); } - if (vma->vm_file) - fput(vma->vm_file); - vma->vm_file = asma->file; + vma_set_file(vma, asma->file); + /* XXX: merge this with the get_file() above if possible */ + fput(asma->file); out: mutex_unlock(&ashmem_mutex); diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..47bff16c182d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2719,6 +2719,8 @@ static inline void vma_set_page_prot(struct vm_area_struct *vma) } #endif +void vma_set_file(struct vm_area_struct *vma, struct file *file); + #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/mm/util.c b/mm/util.c index 4ddb6e186dd5..8c9b7d1e7c49 100644 --- a/mm/util.c +++ b/mm/util.c @@ -311,6 +311,18 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } +/* + * Change backing file, only valid to use during initial VMA setup. 
+ */ +void vma_set_file(struct vm_area_struct *vma, struct file *file) +{ + /* Changing an anonymous vma with this is illegal */ + get_file(file); + swap(vma->vm_file, file); + fput(file); +} +EXPORT_SYMBOL(vma_set_file); + #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif -- cgit From 179a9cf79212bb3b96fb69a314583189cd863c5b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Nov 2020 16:16:34 +0100 Subject: context_tracking: Don't implement exception_enter/exit() on CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK The typical steps with context tracking are:

1) Task runs in userspace
2) Task enters the kernel (syscall/exception/IRQ)
3) Task switches from context tracking state CONTEXT_USER to CONTEXT_KERNEL (user_exit())
4) Task does stuff in kernel
5) Task switches from context tracking state CONTEXT_KERNEL to CONTEXT_USER (user_enter())
6) Task exits the kernel

If an exception fires between 5) and 6), the pt_regs and the context tracking disagree on the context of the faulted/trapped instruction. CONTEXT_KERNEL must be set before the exception handler runs; that's unconditional for those handlers that want to be able to call into schedule(), but CONTEXT_USER must be restored when the exception exits, whereas pt_regs indicates that we are resuming to kernel space. This can't be fixed by storing the context tracking state in a per-cpu or per-task variable since another exception may fire onto the current one and overwrite the saved state. Also the task can schedule. So it has to be stored on each task's kernel stack. This is how exception_enter()/exception_exit() paper over the problem:

5) Task switches from context tracking state CONTEXT_KERNEL to CONTEXT_USER (user_enter())
5.1) Exception fires
5.2) prev_state = exception_enter() // save CONTEXT_USER to prev_state and set CONTEXT_KERNEL
5.3) Exception handler
5.4) exception_exit(prev_state) // restore CONTEXT_USER
5.5) Exception resumes
6) Task exits the kernel

The condition to live without exception_enter()/exception_exit() is to forbid exceptions and IRQs between 2) and 3) and between 5) and 6), or, if any is allowed to trigger, to guarantee that it won't call into context tracking (e.g. NMIs) and won't schedule. These requirements are met by architectures supporting CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK, which can therefore afford not to implement this hack.
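For illustration, a hypothetical arch trap handler using these helpers would look like the sketch below (arch_do_trap() and handle_trap() are made-up names, not from the patch). Because prev_state is a local variable, each nested exception keeps its own saved state on the kernel stack:

        void arch_do_trap(struct pt_regs *regs)
        {
                /* Save the previous tracking state, switch to CONTEXT_KERNEL */
                enum ctx_state prev_state = exception_enter();

                handle_trap(regs);      /* may call into schedule() */

                /* Restore the state saved above (CONTEXT_USER or CONTEXT_KERNEL) */
                exception_exit(prev_state);
        }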
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117151637.259084-3-frederic@kernel.org --- include/linux/context_tracking.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d53cd331c4dd..bceb06498521 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -51,7 +51,8 @@ static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; - if (!context_tracking_enabled()) + if (IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) || + !context_tracking_enabled()) return 0; prev_ctx = this_cpu_read(context_tracking.state); @@ -63,7 +64,8 @@ static inline enum ctx_state exception_enter(void) static inline void exception_exit(enum ctx_state prev_ctx) { - if (context_tracking_enabled()) { + if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) && + context_tracking_enabled()) { if (prev_ctx != CONTEXT_KERNEL) context_tracking_enter(prev_ctx); } -- cgit From 31f6a8c0a471be7d7d05c93eac50fcb729e79b9d Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Tue, 27 Oct 2020 18:07:11 +0000 Subject: sched/topology,schedutil: Wrap sched domains rebuild Add the rebuild_sched_domains_energy() function to wrap the functionality that rebuilds the scheduling domains if any of the Energy Aware Scheduling (EAS) initialisation conditions change. This functionality is used when schedutil is added or removed or when EAS is enabled or disabled through the sched_energy_aware sysctl. Therefore, create a single function that is used in both these cases and that can be later reused. Signed-off-by: Ionela Voinescu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Quentin Perret Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20201027180713.7642-2-ionela.voinescu@arm.com --- include/linux/sched/topology.h | 8 ++++++++ kernel/sched/cpufreq_schedutil.c | 9 +-------- kernel/sched/topology.c | 18 +++++++++++------- 3 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 9ef7bf686a9f..8f0f778b7c91 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -225,6 +225,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) #endif /* !CONFIG_SMP */ +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern void rebuild_sched_domains_energy(void); +#else +static inline void rebuild_sched_domains_energy(void) +{ +} +#endif + #ifndef arch_scale_cpu_capacity /** * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU. 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e254745a82cb..37b303890336 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -899,16 +899,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL -extern bool sched_energy_update; -extern struct mutex sched_energy_mutex; - static void rebuild_sd_workfn(struct work_struct *work) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = true; - rebuild_sched_domains(); - sched_energy_update = false; - mutex_unlock(&sched_energy_mutex); + rebuild_sched_domains_energy(); } static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b296c1c6b961..04d9ebfaedd6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -211,6 +211,15 @@ unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +void rebuild_sched_domains_energy(void) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} + #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -223,13 +232,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = 1; - rebuild_sched_domains(); - sched_energy_update = 0; - mutex_unlock(&sched_energy_mutex); - } + if (state != sysctl_sched_energy_aware) + rebuild_sched_domains_energy(); } return ret; -- cgit From 25ece30561d247b2931b0d11d92e9c976a668771 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 9 Nov 2020 17:34:05 +0100 Subject: rtc: nvmem: remove nvram ABI The nvram sysfs attributes have been deprecated since at least v4.13, more than three years ago, and nobody has ever complained about the deprecation warning. Remove the sysfs attributes now.
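The surviving path is the nvmem one: a driver describes its nvram with a struct nvmem_config and registers it against the rtc_device. A minimal sketch, assuming hypothetical foo_* accessors and a device-specific size:

        static struct nvmem_config foo_nvmem_cfg = {
                .name = "foo_nvram",
                .word_size = 1,
                .stride = 1,
                .size = 64,                     /* device-specific */
                .reg_read = foo_nvram_read,     /* hypothetical accessors */
                .reg_write = foo_nvram_write,
        };

        ret = rtc_nvmem_register(rtc, &foo_nvmem_cfg);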
[Bartosz: remove the declaration of rtc_nvmem_unregister()] Signed-off-by: Alexandre Belloni Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20201109163409.24301-5-brgl@bgdev.pl --- drivers/rtc/class.c | 2 -- drivers/rtc/nvmem.c | 82 +--------------------------------------------- drivers/rtc/rtc-cmos.c | 1 - drivers/rtc/rtc-ds1305.c | 1 - drivers/rtc/rtc-ds1307.c | 1 - drivers/rtc/rtc-ds1343.c | 1 - drivers/rtc/rtc-ds1511.c | 2 -- drivers/rtc/rtc-ds1553.c | 1 - drivers/rtc/rtc-ds1685.c | 1 - drivers/rtc/rtc-ds1742.c | 1 - drivers/rtc/rtc-m48t59.c | 1 - drivers/rtc/rtc-m48t86.c | 1 - drivers/rtc/rtc-rp5c01.c | 1 - drivers/rtc/rtc-rv8803.c | 1 - drivers/rtc/rtc-stk17ta8.c | 1 - drivers/rtc/rtc-tx4939.c | 1 - include/linux/rtc.h | 6 ---- 17 files changed, 1 insertion(+), 104 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 7c88d190c51f..a99b7d24b77c 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -339,8 +339,6 @@ static void devm_rtc_release_device(struct device *dev, void *res) { struct rtc_device *rtc = *(struct rtc_device **)res; - rtc_nvmem_unregister(rtc); - if (rtc->registered) rtc_device_unregister(rtc); else diff --git a/drivers/rtc/nvmem.c b/drivers/rtc/nvmem.c index 4312096c7738..5e0b178a3b65 100644 --- a/drivers/rtc/nvmem.c +++ b/drivers/rtc/nvmem.c @@ -9,74 +9,7 @@ #include #include #include -#include -#include -/* - * Deprecated ABI compatibility, this should be removed at some point - */ - -static const char nvram_warning[] = "Deprecated ABI, please use nvmem"; - -static ssize_t -rtc_nvram_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - dev_warn_once(kobj_to_dev(kobj), nvram_warning); - - return nvmem_device_read(attr->private, off, count, buf); -} - -static ssize_t -rtc_nvram_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - dev_warn_once(kobj_to_dev(kobj), nvram_warning); - - return nvmem_device_write(attr->private, off, count, buf); -} - -static int rtc_nvram_register(struct rtc_device *rtc, - struct nvmem_device *nvmem, size_t size) -{ - int err; - - rtc->nvram = kzalloc(sizeof(*rtc->nvram), GFP_KERNEL); - if (!rtc->nvram) - return -ENOMEM; - - rtc->nvram->attr.name = "nvram"; - rtc->nvram->attr.mode = 0644; - rtc->nvram->private = nvmem; - - sysfs_bin_attr_init(rtc->nvram); - - rtc->nvram->read = rtc_nvram_read; - rtc->nvram->write = rtc_nvram_write; - rtc->nvram->size = size; - - err = sysfs_create_bin_file(&rtc->dev.parent->kobj, - rtc->nvram); - if (err) { - kfree(rtc->nvram); - rtc->nvram = NULL; - } - - return err; -} - -static void rtc_nvram_unregister(struct rtc_device *rtc) -{ - sysfs_remove_bin_file(&rtc->dev.parent->kobj, rtc->nvram); - kfree(rtc->nvram); - rtc->nvram = NULL; -} - -/* - * New ABI, uses nvmem - */ int rtc_nvmem_register(struct rtc_device *rtc, struct nvmem_config *nvmem_config) { @@ -88,20 +21,7 @@ int rtc_nvmem_register(struct rtc_device *rtc, nvmem_config->dev = rtc->dev.parent; nvmem_config->owner = rtc->owner; nvmem = devm_nvmem_register(rtc->dev.parent, nvmem_config); - if (IS_ERR(nvmem)) - return PTR_ERR(nvmem); - - /* Register the old ABI */ - if (rtc->nvram_old_abi) - rtc_nvram_register(rtc, nvmem, nvmem_config->size); - return 0; + return PTR_ERR_OR_ZERO(nvmem); } EXPORT_SYMBOL_GPL(rtc_nvmem_register); - -void rtc_nvmem_unregister(struct rtc_device *rtc) -{ - /* unregister the old ABI */ - if (rtc->nvram) - 
rtc_nvram_unregister(rtc); -} diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index c633319cdb91..adca0de76e53 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -863,7 +863,6 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) cmos_rtc.rtc->ops = &cmos_rtc_ops_no_alarm; } - cmos_rtc.rtc->nvram_old_abi = true; retval = rtc_register_device(cmos_rtc.rtc); if (retval) goto cleanup2; diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index a3d790889eea..a1ed539d41b4 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -694,7 +694,6 @@ static int ds1305_probe(struct spi_device *spi) ds1305->rtc->range_max = RTC_TIMESTAMP_END_2099; ds1305_nvmem_cfg.priv = ds1305; - ds1305->rtc->nvram_old_abi = true; status = rtc_register_device(ds1305->rtc); if (status) return status; diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index fdf25db1b1b3..e359cbf7882b 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -2016,7 +2016,6 @@ static int ds1307_probe(struct i2c_client *client, .priv = ds1307, }; - ds1307->rtc->nvram_old_abi = true; rtc_nvmem_register(ds1307->rtc, &nvmem_cfg); } diff --git a/drivers/rtc/rtc-ds1343.c b/drivers/rtc/rtc-ds1343.c index ba143423875b..e7604e844cbd 100644 --- a/drivers/rtc/rtc-ds1343.c +++ b/drivers/rtc/rtc-ds1343.c @@ -399,7 +399,6 @@ static int ds1343_probe(struct spi_device *spi) if (IS_ERR(priv->rtc)) return PTR_ERR(priv->rtc); - priv->rtc->nvram_old_abi = true; priv->rtc->ops = &ds1343_rtc_ops; priv->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; priv->rtc->range_max = RTC_TIMESTAMP_END_2099; diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index a63872c4c76d..33c483d759c8 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -466,8 +466,6 @@ static int ds1511_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &ds1511_rtc_ops; - pdata->rtc->nvram_old_abi = true; - ret = rtc_register_device(pdata->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index cdf5e05b9489..c6a5563504e5 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -294,7 +294,6 @@ static int ds1553_rtc_probe(struct platform_device *pdev) return PTR_ERR(pdata->rtc); pdata->rtc->ops = &ds1553_rtc_ops; - pdata->rtc->nvram_old_abi = true; ret = rtc_register_device(pdata->rtc); if (ret) diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index dfbd7b88b2b9..9043c96e8845 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -1316,7 +1316,6 @@ ds1685_rtc_probe(struct platform_device *pdev) if (ret) return ret; - rtc_dev->nvram_old_abi = true; nvmem_cfg.priv = rtc; ret = rtc_nvmem_register(rtc_dev, &nvmem_cfg); if (ret) diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 2b949f0dbaa9..291bbed90ef8 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -190,7 +190,6 @@ static int ds1742_rtc_probe(struct platform_device *pdev) return PTR_ERR(rtc); rtc->ops = &ds1742_rtc_ops; - rtc->nvram_old_abi = true; ret = rtc_register_device(rtc); if (ret) diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index 67e218758a8b..ee1d8f0146fd 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -463,7 +463,6 @@ static int m48t59_rtc_probe(struct platform_device *pdev) if (IS_ERR(m48t59->rtc)) return PTR_ERR(m48t59->rtc); - m48t59->rtc->nvram_old_abi = true; m48t59->rtc->ops = ops; nvmem_cfg.size = pdata->offset; diff 
--git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index 75a0e73071d8..2b1135590dd5 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -254,7 +254,6 @@ static int m48t86_rtc_probe(struct platform_device *pdev) return PTR_ERR(info->rtc); info->rtc->ops = &m48t86_rtc_ops; - info->rtc->nvram_old_abi = true; err = rtc_register_device(info->rtc); if (err) diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c index 8776eadbdd3a..a69e8adcc4a1 100644 --- a/drivers/rtc/rtc-rp5c01.c +++ b/drivers/rtc/rtc-rp5c01.c @@ -251,7 +251,6 @@ static int __init rp5c01_rtc_probe(struct platform_device *dev) return PTR_ERR(rtc); rtc->ops = &rp5c01_rtc_ops; - rtc->nvram_old_abi = true; priv->rtc = rtc; diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c index c6d8e3425688..1d888da48c7c 100644 --- a/drivers/rtc/rtc-rv8803.c +++ b/drivers/rtc/rtc-rv8803.c @@ -585,7 +585,6 @@ static int rv8803_probe(struct i2c_client *client, } rv8803->rtc->ops = &rv8803_rtc_ops; - rv8803->rtc->nvram_old_abi = true; rv8803->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rv8803->rtc->range_max = RTC_TIMESTAMP_END_2099; err = rtc_register_device(rv8803->rtc); diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index 01a45044f468..1ccf0d5d05b4 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -311,7 +311,6 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev) return PTR_ERR(pdata->rtc); pdata->rtc->ops = &stk17ta8_rtc_ops; - pdata->rtc->nvram_old_abi = true; nvmem_cfg.priv = pdata; ret = rtc_nvmem_register(pdata->rtc, &nvmem_cfg); diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index 715b82981279..abbb62b14d7a 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -266,7 +266,6 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) return PTR_ERR(rtc); rtc->ops = &tx4939_rtc_ops; - rtc->nvram_old_abi = true; rtc->range_max = U32_MAX; pdata->rtc = rtc; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 22d1575e4991..0983ab9faffb 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -120,10 +120,6 @@ struct rtc_device { bool registered; - /* Old ABI support */ - bool nvram_old_abi; - struct bin_attribute *nvram; - time64_t range_min; timeu64_t range_max; time64_t start_secs; @@ -250,14 +246,12 @@ extern int rtc_hctosys_ret; #ifdef CONFIG_RTC_NVMEM int rtc_nvmem_register(struct rtc_device *rtc, struct nvmem_config *nvmem_config); -void rtc_nvmem_unregister(struct rtc_device *rtc); #else static inline int rtc_nvmem_register(struct rtc_device *rtc, struct nvmem_config *nvmem_config) { return 0; } -static inline void rtc_nvmem_unregister(struct rtc_device *rtc) {} #endif #ifdef CONFIG_RTC_INTF_SYSFS -- cgit From 3a905c2d9544a418953d6c18668f0f853fbd9be9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 9 Nov 2020 17:34:06 +0100 Subject: rtc: add devm_ prefix to rtc_nvmem_register() rtc_nvmem_register() is a managed interface. It doesn't require any release function to be called at driver detach. To avoid confusing driver authors, let's rename it to devm_rtc_nvmem_register() and add it to the list of managed interfaces in Documentation/. 
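A minimal sketch of the intended use after the rename (the foo_* driver is hypothetical); since the interface is devres-managed, the driver needs no matching unregister call:

        static int foo_rtc_probe(struct platform_device *pdev)
        {
                struct nvmem_config nvmem_cfg = {
                        .name = "foo_nvram",
                        .size = 64,
                        /* .reg_read/.reg_write accessors omitted for brevity */
                };
                struct rtc_device *rtc;

                rtc = devm_rtc_allocate_device(&pdev->dev);
                if (IS_ERR(rtc))
                        return PTR_ERR(rtc);

                /* Managed: devres drops the nvmem device at driver detach */
                return devm_rtc_nvmem_register(rtc, &nvmem_cfg);
        }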
Signed-off-by: Bartosz Golaszewski Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201109163409.24301-6-brgl@bgdev.pl --- Documentation/driver-api/driver-model/devres.rst | 1 + drivers/rtc/nvmem.c | 4 ++-- drivers/rtc/rtc-cmos.c | 2 +- drivers/rtc/rtc-ds1305.c | 2 +- drivers/rtc/rtc-ds1307.c | 2 +- drivers/rtc/rtc-ds1343.c | 2 +- drivers/rtc/rtc-ds1511.c | 2 +- drivers/rtc/rtc-ds1553.c | 2 +- drivers/rtc/rtc-ds1685.c | 2 +- drivers/rtc/rtc-ds1742.c | 2 +- drivers/rtc/rtc-ds3232.c | 2 +- drivers/rtc/rtc-isl12026.c | 2 +- drivers/rtc/rtc-isl1208.c | 2 +- drivers/rtc/rtc-m48t59.c | 2 +- drivers/rtc/rtc-m48t86.c | 2 +- drivers/rtc/rtc-meson.c | 2 +- drivers/rtc/rtc-omap.c | 2 +- drivers/rtc/rtc-pcf2127.c | 2 +- drivers/rtc/rtc-pcf85063.c | 2 +- drivers/rtc/rtc-pcf85363.c | 2 +- drivers/rtc/rtc-rp5c01.c | 2 +- drivers/rtc/rtc-rv3028.c | 4 ++-- drivers/rtc/rtc-rv3029c2.c | 2 +- drivers/rtc/rtc-rv3032.c | 4 ++-- drivers/rtc/rtc-rv8803.c | 2 +- drivers/rtc/rtc-rx8581.c | 2 +- drivers/rtc/rtc-stk17ta8.c | 2 +- drivers/rtc/rtc-tx4939.c | 2 +- include/linux/rtc.h | 8 ++++---- 29 files changed, 35 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 6ffc0f07404f..5df7ba54a4ba 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -414,6 +414,7 @@ RESET RTC devm_rtc_device_register() devm_rtc_allocate_device() + devm_rtc_nvmem_register() SERDEV devm_serdev_device_open() diff --git a/drivers/rtc/nvmem.c b/drivers/rtc/nvmem.c index 5e0b178a3b65..7502deb6390e 100644 --- a/drivers/rtc/nvmem.c +++ b/drivers/rtc/nvmem.c @@ -10,7 +10,7 @@ #include #include -int rtc_nvmem_register(struct rtc_device *rtc, +int devm_rtc_nvmem_register(struct rtc_device *rtc, struct nvmem_config *nvmem_config) { struct nvmem_device *nvmem; @@ -24,4 +24,4 @@ int rtc_nvmem_register(struct rtc_device *rtc, return PTR_ERR_OR_ZERO(nvmem); } -EXPORT_SYMBOL_GPL(rtc_nvmem_register); +EXPORT_SYMBOL_GPL(devm_rtc_nvmem_register); diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index adca0de76e53..eea91c1538aa 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -869,7 +869,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) /* export at least the first block of NVRAM */ nvmem_cfg.size = address_space - NVRAM_OFFSET; - if (rtc_nvmem_register(cmos_rtc.rtc, &nvmem_cfg)) + if (devm_rtc_nvmem_register(cmos_rtc.rtc, &nvmem_cfg)) dev_err(dev, "nvmem registration failed\n"); dev_info(dev, "%s%s, %d bytes nvram%s\n", diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index a1ed539d41b4..a4e768261b43 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -698,7 +698,7 @@ static int ds1305_probe(struct spi_device *spi) if (status) return status; - rtc_nvmem_register(ds1305->rtc, &ds1305_nvmem_cfg); + devm_rtc_nvmem_register(ds1305->rtc, &ds1305_nvmem_cfg); /* Maybe set up alarm IRQ; be ready to handle it triggering right * away. NOTE that we don't share this. 
The signal is active low, diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index e359cbf7882b..216bc5d9b716 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -2016,7 +2016,7 @@ static int ds1307_probe(struct i2c_client *client, .priv = ds1307, }; - rtc_nvmem_register(ds1307->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(ds1307->rtc, &nvmem_cfg); } ds1307_hwmon_register(ds1307); diff --git a/drivers/rtc/rtc-ds1343.c b/drivers/rtc/rtc-ds1343.c index e7604e844cbd..ea663e24a34c 100644 --- a/drivers/rtc/rtc-ds1343.c +++ b/drivers/rtc/rtc-ds1343.c @@ -413,7 +413,7 @@ static int ds1343_probe(struct spi_device *spi) return res; nvmem_cfg.priv = priv; - rtc_nvmem_register(priv->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(priv->rtc, &nvmem_cfg); priv->irq = spi->irq; diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 33c483d759c8..d5f48216e851 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -470,7 +470,7 @@ static int ds1511_rtc_probe(struct platform_device *pdev) if (ret) return ret; - rtc_nvmem_register(pdata->rtc, &ds1511_nvmem_cfg); + devm_rtc_nvmem_register(pdata->rtc, &ds1511_nvmem_cfg); /* * if the platform has an interrupt in mind for this device, diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index c6a5563504e5..2d2eb739d92b 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -309,7 +309,7 @@ static int ds1553_rtc_probe(struct platform_device *pdev) } } - if (rtc_nvmem_register(pdata->rtc, &nvmem_cfg)) + if (devm_rtc_nvmem_register(pdata->rtc, &nvmem_cfg)) dev_err(&pdev->dev, "unable to register nvmem\n"); return 0; diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index 9043c96e8845..bef588fce266 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -1317,7 +1317,7 @@ ds1685_rtc_probe(struct platform_device *pdev) return ret; nvmem_cfg.priv = rtc; - ret = rtc_nvmem_register(rtc_dev, &nvmem_cfg); + ret = devm_rtc_nvmem_register(rtc_dev, &nvmem_cfg); if (ret) return ret; diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 291bbed90ef8..29792a8cce97 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -195,7 +195,7 @@ static int ds1742_rtc_probe(struct platform_device *pdev) if (ret) return ret; - if (rtc_nvmem_register(rtc, &nvmem_cfg)) + if (devm_rtc_nvmem_register(rtc, &nvmem_cfg)) dev_err(&pdev->dev, "Unable to register nvmem\n"); return 0; diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c index 69c37ab64352..16b89035d135 100644 --- a/drivers/rtc/rtc-ds3232.c +++ b/drivers/rtc/rtc-ds3232.c @@ -518,7 +518,7 @@ static int ds3232_probe(struct device *dev, struct regmap *regmap, int irq, if (IS_ERR(ds3232->rtc)) return PTR_ERR(ds3232->rtc); - ret = rtc_nvmem_register(ds3232->rtc, &nvmem_cfg); + ret = devm_rtc_nvmem_register(ds3232->rtc, &nvmem_cfg); if(ret) return ret; diff --git a/drivers/rtc/rtc-isl12026.c b/drivers/rtc/rtc-isl12026.c index 5b6b17fb6d62..fff8d8253669 100644 --- a/drivers/rtc/rtc-isl12026.c +++ b/drivers/rtc/rtc-isl12026.c @@ -465,7 +465,7 @@ static int isl12026_probe_new(struct i2c_client *client) priv->rtc->ops = &isl12026_rtc_ops; nvm_cfg.priv = priv; - ret = rtc_nvmem_register(priv->rtc, &nvm_cfg); + ret = devm_rtc_nvmem_register(priv->rtc, &nvm_cfg); if (ret) return ret; diff --git a/drivers/rtc/rtc-isl1208.c b/drivers/rtc/rtc-isl1208.c index ebb691fa48a6..08d778b10e9e 100644 --- a/drivers/rtc/rtc-isl1208.c +++ b/drivers/rtc/rtc-isl1208.c @@ -890,7 +890,7 @@ 
isl1208_probe(struct i2c_client *client, const struct i2c_device_id *id) if (rc) return rc; - rc = rtc_nvmem_register(isl1208->rtc, &isl1208->nvmem_config); + rc = devm_rtc_nvmem_register(isl1208->rtc, &isl1208->nvmem_config); if (rc) return rc; diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index ee1d8f0146fd..e966a66ab2d3 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -466,7 +466,7 @@ static int m48t59_rtc_probe(struct platform_device *pdev) m48t59->rtc->ops = ops; nvmem_cfg.size = pdata->offset; - ret = rtc_nvmem_register(m48t59->rtc, &nvmem_cfg); + ret = devm_rtc_nvmem_register(m48t59->rtc, &nvmem_cfg); if (ret) return ret; diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index 2b1135590dd5..182cfe59e4e0 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -259,7 +259,7 @@ static int m48t86_rtc_probe(struct platform_device *pdev) if (err) return err; - rtc_nvmem_register(info->rtc, &m48t86_nvmem_cfg); + devm_rtc_nvmem_register(info->rtc, &m48t86_nvmem_cfg); /* read battery status */ reg = m48t86_readb(&pdev->dev, M48T86_D); diff --git a/drivers/rtc/rtc-meson.c b/drivers/rtc/rtc-meson.c index 47ebcf834cc2..938267713a4d 100644 --- a/drivers/rtc/rtc-meson.c +++ b/drivers/rtc/rtc-meson.c @@ -365,7 +365,7 @@ static int meson_rtc_probe(struct platform_device *pdev) } meson_rtc_nvmem_config.priv = rtc; - ret = rtc_nvmem_register(rtc->rtc, &meson_rtc_nvmem_config); + ret = devm_rtc_nvmem_register(rtc->rtc, &meson_rtc_nvmem_config); if (ret) goto out_disable_vdd; diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index 606fa80ad6e0..e65f79fc7718 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -890,7 +890,7 @@ static int omap_rtc_probe(struct platform_device *pdev) if (ret) goto err; - rtc_nvmem_register(rtc->rtc, &omap_rtc_nvmem_config); + devm_rtc_nvmem_register(rtc->rtc, &omap_rtc_nvmem_config); if (rtc->is_pmic_controller) { if (!pm_power_off) { diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c index fd46860152e1..432cd627359b 100644 --- a/drivers/rtc/rtc-pcf2127.c +++ b/drivers/rtc/rtc-pcf2127.c @@ -608,7 +608,7 @@ static int pcf2127_probe(struct device *dev, struct regmap *regmap, .size = 512, }; - ret = rtc_nvmem_register(pcf2127->rtc, &nvmem_cfg); + ret = devm_rtc_nvmem_register(pcf2127->rtc, &nvmem_cfg); } /* diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c index f8b99cb72959..c19f139e9b8d 100644 --- a/drivers/rtc/rtc-pcf85063.c +++ b/drivers/rtc/rtc-pcf85063.c @@ -607,7 +607,7 @@ static int pcf85063_probe(struct i2c_client *client) } nvmem_cfg.priv = pcf85063->regmap; - rtc_nvmem_register(pcf85063->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(pcf85063->rtc, &nvmem_cfg); #ifdef CONFIG_COMMON_CLK /* register clk in common clk framework */ diff --git a/drivers/rtc/rtc-pcf85363.c b/drivers/rtc/rtc-pcf85363.c index 3450d615974d..23cf14ca2c96 100644 --- a/drivers/rtc/rtc-pcf85363.c +++ b/drivers/rtc/rtc-pcf85363.c @@ -422,7 +422,7 @@ static int pcf85363_probe(struct i2c_client *client, for (i = 0; i < config->num_nvram; i++) { nvmem_cfg[i].priv = pcf85363; - rtc_nvmem_register(pcf85363->rtc, &nvmem_cfg[i]); + devm_rtc_nvmem_register(pcf85363->rtc, &nvmem_cfg[i]); } return ret; diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c index a69e8adcc4a1..8bc476c0905f 100644 --- a/drivers/rtc/rtc-rp5c01.c +++ b/drivers/rtc/rtc-rp5c01.c @@ -255,7 +255,7 @@ static int __init rp5c01_rtc_probe(struct platform_device *dev) priv->rtc = rtc; 
nvmem_cfg.priv = priv; - error = rtc_nvmem_register(rtc, &nvmem_cfg); + error = devm_rtc_nvmem_register(rtc, &nvmem_cfg); if (error) return error; diff --git a/drivers/rtc/rtc-rv3028.c b/drivers/rtc/rtc-rv3028.c index fa226f0fe67d..f788df979750 100644 --- a/drivers/rtc/rtc-rv3028.c +++ b/drivers/rtc/rtc-rv3028.c @@ -891,9 +891,9 @@ static int rv3028_probe(struct i2c_client *client) return ret; nvmem_cfg.priv = rv3028->regmap; - rtc_nvmem_register(rv3028->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(rv3028->rtc, &nvmem_cfg); eeprom_cfg.priv = rv3028; - rtc_nvmem_register(rv3028->rtc, &eeprom_cfg); + devm_rtc_nvmem_register(rv3028->rtc, &eeprom_cfg); rv3028->rtc->max_user_freq = 1; diff --git a/drivers/rtc/rtc-rv3029c2.c b/drivers/rtc/rtc-rv3029c2.c index 62718231731b..ad359b3b74b2 100644 --- a/drivers/rtc/rtc-rv3029c2.c +++ b/drivers/rtc/rtc-rv3029c2.c @@ -755,7 +755,7 @@ static int rv3029_probe(struct device *dev, struct regmap *regmap, int irq, return rc; nvmem_cfg.priv = rv3029->regmap; - rtc_nvmem_register(rv3029->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(rv3029->rtc, &nvmem_cfg); return 0; } diff --git a/drivers/rtc/rtc-rv3032.c b/drivers/rtc/rtc-rv3032.c index 14e931d6f9c6..ed9cba3292e6 100644 --- a/drivers/rtc/rtc-rv3032.c +++ b/drivers/rtc/rtc-rv3032.c @@ -890,9 +890,9 @@ static int rv3032_probe(struct i2c_client *client) return ret; nvmem_cfg.priv = rv3032->regmap; - rtc_nvmem_register(rv3032->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(rv3032->rtc, &nvmem_cfg); eeprom_cfg.priv = rv3032; - rtc_nvmem_register(rv3032->rtc, &eeprom_cfg); + devm_rtc_nvmem_register(rv3032->rtc, &eeprom_cfg); rv3032->rtc->max_user_freq = 1; diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c index 1d888da48c7c..44e1818a751c 100644 --- a/drivers/rtc/rtc-rv8803.c +++ b/drivers/rtc/rtc-rv8803.c @@ -591,7 +591,7 @@ static int rv8803_probe(struct i2c_client *client, if (err) return err; - rtc_nvmem_register(rv8803->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(rv8803->rtc, &nvmem_cfg); rv8803->rtc->max_user_freq = 1; diff --git a/drivers/rtc/rtc-rx8581.c b/drivers/rtc/rtc-rx8581.c index 490f70f57636..017f74721cc0 100644 --- a/drivers/rtc/rtc-rx8581.c +++ b/drivers/rtc/rtc-rx8581.c @@ -302,7 +302,7 @@ static int rx8581_probe(struct i2c_client *client, for (i = 0; i < config->num_nvram; i++) { nvmem_cfg[i].priv = rx8581; - rtc_nvmem_register(rx8581->rtc, &nvmem_cfg[i]); + devm_rtc_nvmem_register(rx8581->rtc, &nvmem_cfg[i]); } return ret; diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index 1ccf0d5d05b4..ad616bce7bca 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -313,7 +313,7 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &stk17ta8_rtc_ops; nvmem_cfg.priv = pdata; - ret = rtc_nvmem_register(pdata->rtc, &nvmem_cfg); + ret = devm_rtc_nvmem_register(pdata->rtc, &nvmem_cfg); if (ret) return ret; diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index abbb62b14d7a..11f46272bad3 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -271,7 +271,7 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) pdata->rtc = rtc; nvmem_cfg.priv = pdata; - ret = rtc_nvmem_register(rtc, &nvmem_cfg); + ret = devm_rtc_nvmem_register(rtc, &nvmem_cfg); if (ret) return ret; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 0983ab9faffb..cbca651d8ca4 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -244,11 +244,11 @@ extern int rtc_hctosys_ret; #endif #ifdef 
CONFIG_RTC_NVMEM -int rtc_nvmem_register(struct rtc_device *rtc, - struct nvmem_config *nvmem_config); +int devm_rtc_nvmem_register(struct rtc_device *rtc, + struct nvmem_config *nvmem_config); #else -static inline int rtc_nvmem_register(struct rtc_device *rtc, - struct nvmem_config *nvmem_config) +static inline int devm_rtc_nvmem_register(struct rtc_device *rtc, + struct nvmem_config *nvmem_config) { return 0; } -- cgit From fdcfd854333be5b30377dc5daa9cd0fa1643a979 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 9 Nov 2020 17:34:08 +0100 Subject: rtc: rework rtc_register_device() resource management rtc_register_device() is a managed interface but it doesn't use devres by itself - instead it marks an rtc_device as "registered" and the devres callback for devm_rtc_allocate_device() takes care of resource release. This doesn't correspond with the design behind devres where managed structures should not be aware of being managed. The correct solution here is to register a separate devres callback for unregistering the device. While at it: rename rtc_register_device() to devm_rtc_register_device() and add it to the list of managed interfaces in devres.rst. This way we can avoid any potential confusion of driver developers who may expect there to exist a corresponding unregister function. Signed-off-by: Bartosz Golaszewski Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201109163409.24301-8-brgl@bgdev.pl --- Documentation/driver-api/driver-model/devres.rst | 1 + arch/alpha/kernel/rtc.c | 2 +- drivers/mfd/menelaus.c | 2 +- drivers/rtc/class.c | 19 +++++++++---------- drivers/rtc/rtc-88pm80x.c | 2 +- drivers/rtc/rtc-88pm860x.c | 2 +- drivers/rtc/rtc-ab-b5ze-s3.c | 2 +- drivers/rtc/rtc-ab-eoz9.c | 2 +- drivers/rtc/rtc-ab3100.c | 2 +- drivers/rtc/rtc-ab8500.c | 2 +- drivers/rtc/rtc-abx80x.c | 2 +- drivers/rtc/rtc-ac100.c | 2 +- drivers/rtc/rtc-armada38x.c | 2 +- drivers/rtc/rtc-aspeed.c | 2 +- drivers/rtc/rtc-at91rm9200.c | 2 +- drivers/rtc/rtc-at91sam9.c | 2 +- drivers/rtc/rtc-au1xxx.c | 2 +- drivers/rtc/rtc-bd70528.c | 2 +- drivers/rtc/rtc-brcmstb-waketimer.c | 2 +- drivers/rtc/rtc-cadence.c | 2 +- drivers/rtc/rtc-cmos.c | 2 +- drivers/rtc/rtc-coh901331.c | 2 +- drivers/rtc/rtc-cpcap.c | 2 +- drivers/rtc/rtc-cros-ec.c | 2 +- drivers/rtc/rtc-da9052.c | 2 +- drivers/rtc/rtc-da9063.c | 2 +- drivers/rtc/rtc-davinci.c | 2 +- drivers/rtc/rtc-digicolor.c | 2 +- drivers/rtc/rtc-dm355evm.c | 2 +- drivers/rtc/rtc-ds1305.c | 2 +- drivers/rtc/rtc-ds1307.c | 2 +- drivers/rtc/rtc-ds1343.c | 2 +- drivers/rtc/rtc-ds1347.c | 2 +- drivers/rtc/rtc-ds1374.c | 2 +- drivers/rtc/rtc-ds1511.c | 2 +- drivers/rtc/rtc-ds1553.c | 2 +- drivers/rtc/rtc-ds1672.c | 2 +- drivers/rtc/rtc-ds1685.c | 2 +- drivers/rtc/rtc-ds1742.c | 2 +- drivers/rtc/rtc-ds2404.c | 2 +- drivers/rtc/rtc-ep93xx.c | 2 +- drivers/rtc/rtc-fsl-ftm-alarm.c | 2 +- drivers/rtc/rtc-ftrtc010.c | 2 +- drivers/rtc/rtc-goldfish.c | 2 +- drivers/rtc/rtc-imx-sc.c | 2 +- drivers/rtc/rtc-imxdi.c | 2 +- drivers/rtc/rtc-isl12026.c | 2 +- drivers/rtc/rtc-isl1208.c | 2 +- drivers/rtc/rtc-jz4740.c | 2 +- drivers/rtc/rtc-lpc32xx.c | 2 +- drivers/rtc/rtc-ls1x.c | 2 +- drivers/rtc/rtc-m41t80.c | 2 +- drivers/rtc/rtc-m48t59.c | 2 +- drivers/rtc/rtc-m48t86.c | 2 +- drivers/rtc/rtc-mc13xxx.c | 2 +- drivers/rtc/rtc-meson-vrtc.c | 2 +- drivers/rtc/rtc-meson.c | 2 +- drivers/rtc/rtc-mpc5121.c | 2 +- drivers/rtc/rtc-mrst.c | 2 +- drivers/rtc/rtc-mt2712.c | 2 +- drivers/rtc/rtc-mt6397.c | 2 +- drivers/rtc/rtc-mv.c | 2 +- drivers/rtc/rtc-mxc.c | 2 +- 
drivers/rtc/rtc-mxc_v2.c | 2 +- drivers/rtc/rtc-omap.c | 2 +- drivers/rtc/rtc-pcap.c | 2 +- drivers/rtc/rtc-pcf2123.c | 2 +- drivers/rtc/rtc-pcf2127.c | 2 +- drivers/rtc/rtc-pcf85063.c | 2 +- drivers/rtc/rtc-pcf85363.c | 2 +- drivers/rtc/rtc-pcf8563.c | 2 +- drivers/rtc/rtc-pic32.c | 2 +- drivers/rtc/rtc-pl030.c | 2 +- drivers/rtc/rtc-pl031.c | 2 +- drivers/rtc/rtc-pm8xxx.c | 2 +- drivers/rtc/rtc-ps3.c | 2 +- drivers/rtc/rtc-r9701.c | 2 +- drivers/rtc/rtc-rc5t619.c | 2 +- drivers/rtc/rtc-rk808.c | 2 +- drivers/rtc/rtc-rp5c01.c | 2 +- drivers/rtc/rtc-rs5c348.c | 2 +- drivers/rtc/rtc-rv3028.c | 2 +- drivers/rtc/rtc-rv3029c2.c | 2 +- drivers/rtc/rtc-rv3032.c | 2 +- drivers/rtc/rtc-rv8803.c | 2 +- drivers/rtc/rtc-rx8010.c | 2 +- drivers/rtc/rtc-rx8581.c | 2 +- drivers/rtc/rtc-s35390a.c | 2 +- drivers/rtc/rtc-sa1100.c | 2 +- drivers/rtc/rtc-sc27xx.c | 2 +- drivers/rtc/rtc-sd3078.c | 2 +- drivers/rtc/rtc-sh.c | 2 +- drivers/rtc/rtc-sirfsoc.c | 2 +- drivers/rtc/rtc-snvs.c | 2 +- drivers/rtc/rtc-st-lpc.c | 2 +- drivers/rtc/rtc-starfire.c | 2 +- drivers/rtc/rtc-stk17ta8.c | 2 +- drivers/rtc/rtc-stmp3xxx.c | 2 +- drivers/rtc/rtc-sun4v.c | 2 +- drivers/rtc/rtc-sun6i.c | 2 +- drivers/rtc/rtc-sunxi.c | 2 +- drivers/rtc/rtc-tegra.c | 2 +- drivers/rtc/rtc-test.c | 2 +- drivers/rtc/rtc-tps6586x.c | 2 +- drivers/rtc/rtc-tps65910.c | 2 +- drivers/rtc/rtc-tx4939.c | 2 +- drivers/rtc/rtc-vr41xx.c | 2 +- drivers/rtc/rtc-vt8500.c | 2 +- drivers/rtc/rtc-wilco-ec.c | 2 +- drivers/rtc/rtc-wm831x.c | 2 +- drivers/rtc/rtc-xgene.c | 2 +- drivers/rtc/rtc-zynqmp.c | 2 +- drivers/rtc/sysfs.c | 2 -- include/linux/rtc.h | 8 +++----- 114 files changed, 123 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 5df7ba54a4ba..cd8b6e657b94 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -414,6 +414,7 @@ RESET RTC devm_rtc_device_register() devm_rtc_allocate_device() + devm_rtc_register_device() devm_rtc_nvmem_register() SERDEV diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c index 1b1d5963ac55..ce3077946e1d 100644 --- a/arch/alpha/kernel/rtc.c +++ b/arch/alpha/kernel/rtc.c @@ -216,6 +216,6 @@ alpha_rtc_init(void) rtc->ops = &remote_rtc_ops; #endif - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } device_initcall(alpha_rtc_init); diff --git a/drivers/mfd/menelaus.c b/drivers/mfd/menelaus.c index b64d3315a5e1..07e0ca2e467c 100644 --- a/drivers/mfd/menelaus.c +++ b/drivers/mfd/menelaus.c @@ -1119,7 +1119,7 @@ static inline void menelaus_rtc_init(struct menelaus_chip *m) menelaus_write_reg(MENELAUS_RTC_CTRL, m->rtc_control); } - err = rtc_register_device(m->rtc); + err = devm_rtc_register_device(m->rtc); if (err) { if (alarm) { menelaus_remove_irq_work(MENELAUS_RTCALM_IRQ); diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index a99b7d24b77c..b8a34ee039ad 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -321,8 +321,10 @@ static void rtc_device_get_offset(struct rtc_device *rtc) * * @rtc: the RTC class device to destroy */ -static void rtc_device_unregister(struct rtc_device *rtc) +static void devm_rtc_unregister_device(void *data) { + struct rtc_device *rtc = data; + mutex_lock(&rtc->ops_lock); /* * Remove innards of this RTC, then disable it, before @@ -339,10 +341,7 @@ static void devm_rtc_release_device(struct device *dev, void *res) { struct rtc_device *rtc = *(struct 
rtc_device **)res; - if (rtc->registered) - rtc_device_unregister(rtc); - else - put_device(&rtc->dev); + put_device(&rtc->dev); } struct rtc_device *devm_rtc_allocate_device(struct device *dev) @@ -383,7 +382,7 @@ exit_ida: } EXPORT_SYMBOL_GPL(devm_rtc_allocate_device); -int __rtc_register_device(struct module *owner, struct rtc_device *rtc) +int __devm_rtc_register_device(struct module *owner, struct rtc_device *rtc) { struct rtc_wkalrm alrm; int err; @@ -413,7 +412,6 @@ int __rtc_register_device(struct module *owner, struct rtc_device *rtc) rtc_proc_add_device(rtc); - rtc->registered = true; dev_info(rtc->dev.parent, "registered as %s\n", dev_name(&rtc->dev)); @@ -422,9 +420,10 @@ int __rtc_register_device(struct module *owner, struct rtc_device *rtc) rtc_hctosys(rtc); #endif - return 0; + return devm_add_action_or_reset(rtc->dev.parent, + devm_rtc_unregister_device, rtc); } -EXPORT_SYMBOL_GPL(__rtc_register_device); +EXPORT_SYMBOL_GPL(__devm_rtc_register_device); /** * devm_rtc_device_register - resource managed rtc_device_register() @@ -454,7 +453,7 @@ struct rtc_device *devm_rtc_device_register(struct device *dev, rtc->ops = ops; - err = __rtc_register_device(owner, rtc); + err = __devm_rtc_register_device(owner, rtc); if (err) return ERR_PTR(err); diff --git a/drivers/rtc/rtc-88pm80x.c b/drivers/rtc/rtc-88pm80x.c index 75779e8501a3..6a3f44cf6ebe 100644 --- a/drivers/rtc/rtc-88pm80x.c +++ b/drivers/rtc/rtc-88pm80x.c @@ -294,7 +294,7 @@ static int pm80x_rtc_probe(struct platform_device *pdev) info->rtc_dev->ops = &pm80x_rtc_ops; info->rtc_dev->range_max = U32_MAX; - ret = rtc_register_device(info->rtc_dev); + ret = devm_rtc_register_device(info->rtc_dev); if (ret) goto out_rtc; diff --git a/drivers/rtc/rtc-88pm860x.c b/drivers/rtc/rtc-88pm860x.c index c90457d001e9..2c809a1a445e 100644 --- a/drivers/rtc/rtc-88pm860x.c +++ b/drivers/rtc/rtc-88pm860x.c @@ -307,7 +307,7 @@ static int pm860x_rtc_probe(struct platform_device *pdev) info->rtc_dev->ops = &pm860x_rtc_ops; info->rtc_dev->range_max = U32_MAX; - ret = rtc_register_device(info->rtc_dev); + ret = devm_rtc_register_device(info->rtc_dev); if (ret) return ret; diff --git a/drivers/rtc/rtc-ab-b5ze-s3.c b/drivers/rtc/rtc-ab-b5ze-s3.c index 2370ac0cdb5f..6e3e320dc727 100644 --- a/drivers/rtc/rtc-ab-b5ze-s3.c +++ b/drivers/rtc/rtc-ab-b5ze-s3.c @@ -892,7 +892,7 @@ static int abb5zes3_probe(struct i2c_client *client, } } - ret = rtc_register_device(data->rtc); + ret = devm_rtc_register_device(data->rtc); err: if (ret && data->irq) diff --git a/drivers/rtc/rtc-ab-eoz9.c b/drivers/rtc/rtc-ab-eoz9.c index d690985caa4c..b20d8f26dcdb 100644 --- a/drivers/rtc/rtc-ab-eoz9.c +++ b/drivers/rtc/rtc-ab-eoz9.c @@ -420,7 +420,7 @@ static int abeoz9_probe(struct i2c_client *client, data->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; data->rtc->range_max = RTC_TIMESTAMP_END_2099; - ret = rtc_register_device(data->rtc); + ret = devm_rtc_register_device(data->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-ab3100.c b/drivers/rtc/rtc-ab3100.c index 2ed6def90975..e4fd961e8bf6 100644 --- a/drivers/rtc/rtc-ab3100.c +++ b/drivers/rtc/rtc-ab3100.c @@ -238,7 +238,7 @@ static int __init ab3100_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc); - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver ab3100_rtc_driver = { diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index 3d60f3283f11..b40048871295 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ 
-404,7 +404,7 @@ static int ab8500_rtc_probe(struct platform_device *pdev) if (err) return err; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static int ab8500_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-abx80x.c b/drivers/rtc/rtc-abx80x.c index 803725b3a02c..6733bb0df674 100644 --- a/drivers/rtc/rtc-abx80x.c +++ b/drivers/rtc/rtc-abx80x.c @@ -851,7 +851,7 @@ static int abx80x_probe(struct i2c_client *client, return err; } - return rtc_register_device(priv->rtc); + return devm_rtc_register_device(priv->rtc); } static const struct i2c_device_id abx80x_id[] = { diff --git a/drivers/rtc/rtc-ac100.c b/drivers/rtc/rtc-ac100.c index 29223931aba7..1ddbef99e38f 100644 --- a/drivers/rtc/rtc-ac100.c +++ b/drivers/rtc/rtc-ac100.c @@ -610,7 +610,7 @@ static int ac100_rtc_probe(struct platform_device *pdev) if (ret) return ret; - return rtc_register_device(chip->rtc); + return devm_rtc_register_device(chip->rtc); } static int ac100_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-armada38x.c b/drivers/rtc/rtc-armada38x.c index 94d7c22fc4f3..807a79c07f08 100644 --- a/drivers/rtc/rtc-armada38x.c +++ b/drivers/rtc/rtc-armada38x.c @@ -556,7 +556,7 @@ static __init int armada38x_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->range_max = U32_MAX; - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-aspeed.c b/drivers/rtc/rtc-aspeed.c index eacdd0637cce..a93352ed3aec 100644 --- a/drivers/rtc/rtc-aspeed.c +++ b/drivers/rtc/rtc-aspeed.c @@ -104,7 +104,7 @@ static int aspeed_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->range_min = RTC_TIMESTAMP_BEGIN_1900; rtc->rtc_dev->range_max = 38814989399LL; /* 3199-12-31 23:59:59 */ - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } static const struct of_device_id aspeed_rtc_match[] = { diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index da24e68adcca..fe396d27ebb7 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -538,7 +538,7 @@ static int __init at91_rtc_probe(struct platform_device *pdev) rtc->range_min = RTC_TIMESTAMP_BEGIN_1900; rtc->range_max = RTC_TIMESTAMP_END_2099; - ret = rtc_register_device(rtc); + ret = devm_rtc_register_device(rtc); if (ret) goto err_clk; diff --git a/drivers/rtc/rtc-at91sam9.c b/drivers/rtc/rtc-at91sam9.c index e39e89867d29..2216be429ab7 100644 --- a/drivers/rtc/rtc-at91sam9.c +++ b/drivers/rtc/rtc-at91sam9.c @@ -431,7 +431,7 @@ static int at91_rtc_probe(struct platform_device *pdev) dev_warn(&pdev->dev, "%s: SET TIME!\n", dev_name(&rtc->rtcdev->dev)); - return rtc_register_device(rtc->rtcdev); + return devm_rtc_register_device(rtc->rtcdev); err_clk: clk_disable_unprepare(rtc->sclk); diff --git a/drivers/rtc/rtc-au1xxx.c b/drivers/rtc/rtc-au1xxx.c index 791bebcb6f47..e6428b27b5d4 100644 --- a/drivers/rtc/rtc-au1xxx.c +++ b/drivers/rtc/rtc-au1xxx.c @@ -104,7 +104,7 @@ static int au1xtoy_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtcdev); - return rtc_register_device(rtcdev); + return devm_rtc_register_device(rtcdev); } static struct platform_driver au1xrtc_driver = { diff --git a/drivers/rtc/rtc-bd70528.c b/drivers/rtc/rtc-bd70528.c index 4492b770422c..17cb67f5bf6e 100644 --- a/drivers/rtc/rtc-bd70528.c +++ b/drivers/rtc/rtc-bd70528.c @@ -604,7 +604,7 @@ static int bd70528_probe(struct platform_device *pdev) } } - return 
rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static const struct platform_device_id bd718x7_rtc_id[] = { diff --git a/drivers/rtc/rtc-brcmstb-waketimer.c b/drivers/rtc/rtc-brcmstb-waketimer.c index 375a9987a1d6..0366e2ff04ae 100644 --- a/drivers/rtc/rtc-brcmstb-waketimer.c +++ b/drivers/rtc/rtc-brcmstb-waketimer.c @@ -252,7 +252,7 @@ static int brcmstb_waketmr_probe(struct platform_device *pdev) timer->rtc->ops = &brcmstb_waketmr_ops; timer->rtc->range_max = U32_MAX; - ret = rtc_register_device(timer->rtc); + ret = devm_rtc_register_device(timer->rtc); if (ret) goto err_notifier; diff --git a/drivers/rtc/rtc-cadence.c b/drivers/rtc/rtc-cadence.c index 595d5d252850..1edf7f16d73a 100644 --- a/drivers/rtc/rtc-cadence.c +++ b/drivers/rtc/rtc-cadence.c @@ -336,7 +336,7 @@ static int cdns_rtc_probe(struct platform_device *pdev) writel(0, crtc->regs + CDNS_RTC_HMR); writel(CDNS_RTC_KRTCR_KRTC, crtc->regs + CDNS_RTC_KRTCR); - ret = rtc_register_device(crtc->rtc_dev); + ret = devm_rtc_register_device(crtc->rtc_dev); if (ret) goto err_disable_wakeup; diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 766074c04b53..83415600185c 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -863,7 +863,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) cmos_rtc.rtc->ops = &cmos_rtc_ops_no_alarm; } - retval = rtc_register_device(cmos_rtc.rtc); + retval = devm_rtc_register_device(cmos_rtc.rtc); if (retval) goto cleanup2; diff --git a/drivers/rtc/rtc-coh901331.c b/drivers/rtc/rtc-coh901331.c index da59917c9ee8..168ced87d93a 100644 --- a/drivers/rtc/rtc-coh901331.c +++ b/drivers/rtc/rtc-coh901331.c @@ -203,7 +203,7 @@ static int __init coh901331_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtap); - ret = rtc_register_device(rtap->rtc); + ret = devm_rtc_register_device(rtap->rtc); if (ret) goto out_no_rtc; diff --git a/drivers/rtc/rtc-cpcap.c b/drivers/rtc/rtc-cpcap.c index 38d576b0c4fa..afc8fcba8f88 100644 --- a/drivers/rtc/rtc-cpcap.c +++ b/drivers/rtc/rtc-cpcap.c @@ -301,7 +301,7 @@ static int cpcap_rtc_probe(struct platform_device *pdev) /* ignore error and continue without wakeup support */ } - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } static const struct of_device_id cpcap_rtc_of_match[] = { diff --git a/drivers/rtc/rtc-cros-ec.c b/drivers/rtc/rtc-cros-ec.c index f7343c289cab..70626793ca69 100644 --- a/drivers/rtc/rtc-cros-ec.c +++ b/drivers/rtc/rtc-cros-ec.c @@ -350,7 +350,7 @@ static int cros_ec_rtc_probe(struct platform_device *pdev) cros_ec_rtc->rtc->ops = &cros_ec_rtc_ops; cros_ec_rtc->rtc->range_max = U32_MAX; - ret = rtc_register_device(cros_ec_rtc->rtc); + ret = devm_rtc_register_device(cros_ec_rtc->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c index 58de10da37b1..9ca99bd35702 100644 --- a/drivers/rtc/rtc-da9052.c +++ b/drivers/rtc/rtc-da9052.c @@ -304,7 +304,7 @@ static int da9052_rtc_probe(struct platform_device *pdev) rtc->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rtc->rtc->range_max = RTC_TIMESTAMP_END_2063; - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-da9063.c b/drivers/rtc/rtc-da9063.c index 6f0a3a711135..d4b72a9fa2ba 100644 --- a/drivers/rtc/rtc-da9063.c +++ b/drivers/rtc/rtc-da9063.c @@ -494,7 +494,7 @@ static int da9063_rtc_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Failed to request ALARM IRQ %d: 
%d\n", irq_alarm, ret); - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } static struct platform_driver da9063_rtc_driver = { diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index 73f87a17cdf3..6bef0f2353da 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -484,7 +484,7 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 0); - return rtc_register_device(davinci_rtc->rtc); + return devm_rtc_register_device(davinci_rtc->rtc); } static int __exit davinci_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-digicolor.c b/drivers/rtc/rtc-digicolor.c index 200d85b01e8b..4fdfa5b6feb2 100644 --- a/drivers/rtc/rtc-digicolor.c +++ b/drivers/rtc/rtc-digicolor.c @@ -202,7 +202,7 @@ static int __init dc_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->ops = &dc_rtc_ops; rtc->rtc_dev->range_max = U32_MAX; - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } static const struct of_device_id dc_dt_ids[] = { diff --git a/drivers/rtc/rtc-dm355evm.c b/drivers/rtc/rtc-dm355evm.c index cd947a20843b..94fb16ac3e0f 100644 --- a/drivers/rtc/rtc-dm355evm.c +++ b/drivers/rtc/rtc-dm355evm.c @@ -132,7 +132,7 @@ static int dm355evm_rtc_probe(struct platform_device *pdev) rtc->ops = &dm355evm_rtc_ops; rtc->range_max = U32_MAX; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } /* diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index a4e768261b43..8c2ab29c3d91 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -694,7 +694,7 @@ static int ds1305_probe(struct spi_device *spi) ds1305->rtc->range_max = RTC_TIMESTAMP_END_2099; ds1305_nvmem_cfg.priv = ds1305; - status = rtc_register_device(ds1305->rtc); + status = devm_rtc_register_device(ds1305->rtc); if (status) return status; diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 216bc5d9b716..183cf7c01364 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -2001,7 +2001,7 @@ static int ds1307_probe(struct i2c_client *client, if (err) return err; - err = rtc_register_device(ds1307->rtc); + err = devm_rtc_register_device(ds1307->rtc); if (err) return err; diff --git a/drivers/rtc/rtc-ds1343.c b/drivers/rtc/rtc-ds1343.c index ea663e24a34c..f14ed6c96437 100644 --- a/drivers/rtc/rtc-ds1343.c +++ b/drivers/rtc/rtc-ds1343.c @@ -408,7 +408,7 @@ static int ds1343_probe(struct spi_device *spi) dev_err(&spi->dev, "unable to create sysfs entries for rtc ds1343\n"); - res = rtc_register_device(priv->rtc); + res = devm_rtc_register_device(priv->rtc); if (res) return res; diff --git a/drivers/rtc/rtc-ds1347.c b/drivers/rtc/rtc-ds1347.c index 7025cf3fb9f8..157bf5209ac4 100644 --- a/drivers/rtc/rtc-ds1347.c +++ b/drivers/rtc/rtc-ds1347.c @@ -166,7 +166,7 @@ static int ds1347_probe(struct spi_device *spi) rtc->range_min = RTC_TIMESTAMP_BEGIN_0000; rtc->range_max = RTC_TIMESTAMP_END_9999; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct spi_driver ds1347_driver = { diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c index 177d870bda0d..fab79921a712 100644 --- a/drivers/rtc/rtc-ds1374.c +++ b/drivers/rtc/rtc-ds1374.c @@ -508,7 +508,7 @@ static int ds1374_probe(struct i2c_client *client, ds1374->rtc->ops = &ds1374_rtc_ops; ds1374->rtc->range_max = U32_MAX; - ret = rtc_register_device(ds1374->rtc); + ret = devm_rtc_register_device(ds1374->rtc); 
if (ret) return ret; diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index d5f48216e851..bda884333082 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -466,7 +466,7 @@ static int ds1511_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &ds1511_rtc_ops; - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index bb40ea8b6373..dbff5b621ef5 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -295,7 +295,7 @@ static int ds1553_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &ds1553_rtc_ops; - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-ds1672.c b/drivers/rtc/rtc-ds1672.c index 9da84df9f152..630493759d15 100644 --- a/drivers/rtc/rtc-ds1672.c +++ b/drivers/rtc/rtc-ds1672.c @@ -124,7 +124,7 @@ static int ds1672_probe(struct i2c_client *client, rtc->ops = &ds1672_rtc_ops; rtc->range_max = U32_MAX; - err = rtc_register_device(rtc); + err = devm_rtc_register_device(rtc); if (err) return err; diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index bef588fce266..d69c807af29b 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -1321,7 +1321,7 @@ ds1685_rtc_probe(struct platform_device *pdev) if (ret) return ret; - return rtc_register_device(rtc_dev); + return devm_rtc_register_device(rtc_dev); } /** diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 39c6c3a85b34..13d45c697da6 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -191,7 +191,7 @@ static int ds1742_rtc_probe(struct platform_device *pdev) rtc->ops = &ds1742_rtc_ops; - ret = rtc_register_device(rtc); + ret = devm_rtc_register_device(rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-ds2404.c b/drivers/rtc/rtc-ds2404.c index 9df0c44512b8..0480f592307e 100644 --- a/drivers/rtc/rtc-ds2404.c +++ b/drivers/rtc/rtc-ds2404.c @@ -234,7 +234,7 @@ static int rtc_probe(struct platform_device *pdev) chip->rtc->ops = &ds2404_rtc_ops; chip->rtc->range_max = U32_MAX; - retval = rtc_register_device(chip->rtc); + retval = devm_rtc_register_device(chip->rtc); if (retval) return retval; diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c index 8ec9ea1ca72e..9a5a15cbcd9b 100644 --- a/drivers/rtc/rtc-ep93xx.c +++ b/drivers/rtc/rtc-ep93xx.c @@ -145,7 +145,7 @@ static int ep93xx_rtc_probe(struct platform_device *pdev) if (err) return err; - return rtc_register_device(ep93xx_rtc->rtc); + return devm_rtc_register_device(ep93xx_rtc->rtc); } static struct platform_driver ep93xx_rtc_driver = { diff --git a/drivers/rtc/rtc-fsl-ftm-alarm.c b/drivers/rtc/rtc-fsl-ftm-alarm.c index 48d3b38ea348..57cc09d0a806 100644 --- a/drivers/rtc/rtc-fsl-ftm-alarm.c +++ b/drivers/rtc/rtc-fsl-ftm-alarm.c @@ -290,7 +290,7 @@ static int ftm_rtc_probe(struct platform_device *pdev) if (ret) dev_err(&pdev->dev, "failed to enable irq wake\n"); - ret = rtc_register_device(rtc->rtc_dev); + ret = devm_rtc_register_device(rtc->rtc_dev); if (ret) { dev_err(&pdev->dev, "can't register rtc device\n"); return ret; diff --git a/drivers/rtc/rtc-ftrtc010.c b/drivers/rtc/rtc-ftrtc010.c index 0919f7dc94a3..ad3add5db4c8 100644 --- a/drivers/rtc/rtc-ftrtc010.c +++ b/drivers/rtc/rtc-ftrtc010.c @@ -176,7 +176,7 @@ static int ftrtc010_rtc_probe(struct platform_device *pdev) if (unlikely(ret)) return ret; - return 
rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } static int ftrtc010_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-goldfish.c b/drivers/rtc/rtc-goldfish.c index 6349d2cd3680..7ab95d052644 100644 --- a/drivers/rtc/rtc-goldfish.c +++ b/drivers/rtc/rtc-goldfish.c @@ -194,7 +194,7 @@ static int goldfish_rtc_probe(struct platform_device *pdev) if (err) return err; - return rtc_register_device(rtcdrv->rtc); + return devm_rtc_register_device(rtcdrv->rtc); } static const struct of_device_id goldfish_rtc_of_match[] = { diff --git a/drivers/rtc/rtc-imx-sc.c b/drivers/rtc/rtc-imx-sc.c index a5f59e6f862e..cc9fbab49999 100644 --- a/drivers/rtc/rtc-imx-sc.c +++ b/drivers/rtc/rtc-imx-sc.c @@ -166,7 +166,7 @@ static int imx_sc_rtc_probe(struct platform_device *pdev) imx_sc_rtc->range_min = 0; imx_sc_rtc->range_max = U32_MAX; - ret = rtc_register_device(imx_sc_rtc); + ret = devm_rtc_register_device(imx_sc_rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index 8d141d8a5490..c2692da74e09 100644 --- a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -814,7 +814,7 @@ static int __init dryice_rtc_probe(struct platform_device *pdev) imxdi->rtc->ops = &dryice_rtc_ops; imxdi->rtc->range_max = U32_MAX; - rc = rtc_register_device(imxdi->rtc); + rc = devm_rtc_register_device(imxdi->rtc); if (rc) goto err; diff --git a/drivers/rtc/rtc-isl12026.c b/drivers/rtc/rtc-isl12026.c index fff8d8253669..1fc6627d854d 100644 --- a/drivers/rtc/rtc-isl12026.c +++ b/drivers/rtc/rtc-isl12026.c @@ -469,7 +469,7 @@ static int isl12026_probe_new(struct i2c_client *client) if (ret) return ret; - return rtc_register_device(priv->rtc); + return devm_rtc_register_device(priv->rtc); } static int isl12026_remove(struct i2c_client *client) diff --git a/drivers/rtc/rtc-isl1208.c b/drivers/rtc/rtc-isl1208.c index 08d778b10e9e..563a6d9c9fcf 100644 --- a/drivers/rtc/rtc-isl1208.c +++ b/drivers/rtc/rtc-isl1208.c @@ -894,7 +894,7 @@ isl1208_probe(struct i2c_client *client, const struct i2c_device_id *id) if (rc) return rc; - return rtc_register_device(isl1208->rtc); + return devm_rtc_register_device(isl1208->rtc); } static struct i2c_driver isl1208_driver = { diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index 9607e6b6e0b3..6e51df72fd65 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -375,7 +375,7 @@ static int jz4740_rtc_probe(struct platform_device *pdev) /* Each 1 Hz pulse should happen after (rate) ticks */ jz4740_rtc_reg_write(rtc, JZ_REG_RTC_REGULATOR, rate - 1); - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index 15d8abda81fe..76ad7031a13d 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -239,7 +239,7 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) rtc->rtc->ops = &lpc32xx_rtc_ops; rtc->rtc->range_max = U32_MAX; - err = rtc_register_device(rtc->rtc); + err = devm_rtc_register_device(rtc->rtc); if (err) return err; diff --git a/drivers/rtc/rtc-ls1x.c b/drivers/rtc/rtc-ls1x.c index 8bd34056fea0..5af26dc5c2a3 100644 --- a/drivers/rtc/rtc-ls1x.c +++ b/drivers/rtc/rtc-ls1x.c @@ -176,7 +176,7 @@ static int ls1x_rtc_probe(struct platform_device *pdev) rtcdev->range_min = RTC_TIMESTAMP_BEGIN_1900; rtcdev->range_max = RTC_TIMESTAMP_END_2099; - return rtc_register_device(rtcdev); + return devm_rtc_register_device(rtcdev); } static struct 
platform_driver ls1x_rtc_driver = { diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index 8a89bc52b0d4..160dcf68e64e 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -977,7 +977,7 @@ static int m41t80_probe(struct i2c_client *client, m41t80_sqw_register_clk(m41t80_data); #endif - rc = rtc_register_device(m41t80_data->rtc); + rc = devm_rtc_register_device(m41t80_data->rtc); if (rc) return rc; diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index e966a66ab2d3..5f5898d3b055 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -470,7 +470,7 @@ static int m48t59_rtc_probe(struct platform_device *pdev) if (ret) return ret; - ret = rtc_register_device(m48t59->rtc); + ret = devm_rtc_register_device(m48t59->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index 182cfe59e4e0..481c9525b1dd 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -255,7 +255,7 @@ static int m48t86_rtc_probe(struct platform_device *pdev) info->rtc->ops = &m48t86_rtc_ops; - err = rtc_register_device(info->rtc); + err = devm_rtc_register_device(info->rtc); if (err) return err; diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index d6802e6191cb..d4234e78497e 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -307,7 +307,7 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) mc13xxx_unlock(mc13xxx); - ret = rtc_register_device(priv->rtc); + ret = devm_rtc_register_device(priv->rtc); if (ret) { mc13xxx_lock(mc13xxx); goto err_irq_request; diff --git a/drivers/rtc/rtc-meson-vrtc.c b/drivers/rtc/rtc-meson-vrtc.c index e6bd0808a092..1463c8621561 100644 --- a/drivers/rtc/rtc-meson-vrtc.c +++ b/drivers/rtc/rtc-meson-vrtc.c @@ -83,7 +83,7 @@ static int meson_vrtc_probe(struct platform_device *pdev) return PTR_ERR(vrtc->rtc); vrtc->rtc->ops = &meson_vrtc_ops; - return rtc_register_device(vrtc->rtc); + return devm_rtc_register_device(vrtc->rtc); } static int __maybe_unused meson_vrtc_suspend(struct device *dev) diff --git a/drivers/rtc/rtc-meson.c b/drivers/rtc/rtc-meson.c index 938267713a4d..8642c06565ea 100644 --- a/drivers/rtc/rtc-meson.c +++ b/drivers/rtc/rtc-meson.c @@ -369,7 +369,7 @@ static int meson_rtc_probe(struct platform_device *pdev) if (ret) goto out_disable_vdd; - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) goto out_disable_vdd; diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c index 5c2ce71aa044..bb2ea9bc56f2 100644 --- a/drivers/rtc/rtc-mpc5121.c +++ b/drivers/rtc/rtc-mpc5121.c @@ -371,7 +371,7 @@ static int mpc5121_rtc_probe(struct platform_device *op) rtc->rtc->range_max = U32_MAX; } - err = rtc_register_device(rtc->rtc); + err = devm_rtc_register_device(rtc->rtc); if (err) goto out_dispose2; diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index 17bf5394e1e5..421b3b6071b6 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -361,7 +361,7 @@ static int vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, } } - retval = rtc_register_device(mrst_rtc.rtc); + retval = devm_rtc_register_device(mrst_rtc.rtc); if (retval) goto cleanup0; diff --git a/drivers/rtc/rtc-mt2712.c b/drivers/rtc/rtc-mt2712.c index d5f691c8a035..cd92a9788351 100644 --- a/drivers/rtc/rtc-mt2712.c +++ b/drivers/rtc/rtc-mt2712.c @@ -352,7 +352,7 @@ static int mt2712_rtc_probe(struct platform_device *pdev) mt2712_rtc->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; 
mt2712_rtc->rtc->range_max = MT2712_RTC_TIMESTAMP_END_2127; - return rtc_register_device(mt2712_rtc->rtc); + return devm_rtc_register_device(mt2712_rtc->rtc); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-mt6397.c b/drivers/rtc/rtc-mt6397.c index 1894aded4c85..6655035e5164 100644 --- a/drivers/rtc/rtc-mt6397.c +++ b/drivers/rtc/rtc-mt6397.c @@ -301,7 +301,7 @@ static int mtk_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->ops = &mtk_rtc_ops; - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-mv.c b/drivers/rtc/rtc-mv.c index d5f190e578e4..f8e2ecea1d8d 100644 --- a/drivers/rtc/rtc-mv.c +++ b/drivers/rtc/rtc-mv.c @@ -278,7 +278,7 @@ static int __init mv_rtc_probe(struct platform_device *pdev) pdata->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; pdata->rtc->range_max = RTC_TIMESTAMP_END_2099; - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (!ret) return 0; out: diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 0d253ce3a8f5..65b29b0fa548 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -408,7 +408,7 @@ static int mxc_rtc_probe(struct platform_device *pdev) dev_err(&pdev->dev, "failed to enable irq wake\n"); } - ret = rtc_register_device(rtc); + ret = devm_rtc_register_device(rtc); return ret; } diff --git a/drivers/rtc/rtc-mxc_v2.c b/drivers/rtc/rtc-mxc_v2.c index 91534560fe2a..0d73f6f0cf9e 100644 --- a/drivers/rtc/rtc-mxc_v2.c +++ b/drivers/rtc/rtc-mxc_v2.c @@ -354,7 +354,7 @@ static int mxc_rtc_probe(struct platform_device *pdev) return ret; } - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (ret < 0) clk_unprepare(pdata->clk); diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index e65f79fc7718..dc7db2477f88 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -886,7 +886,7 @@ static int omap_rtc_probe(struct platform_device *pdev) goto err; } - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) goto err; diff --git a/drivers/rtc/rtc-pcap.c b/drivers/rtc/rtc-pcap.c index 178bfb1dea21..8c7a98a5452c 100644 --- a/drivers/rtc/rtc-pcap.c +++ b/drivers/rtc/rtc-pcap.c @@ -163,7 +163,7 @@ static int __init pcap_rtc_probe(struct platform_device *pdev) if (err) return err; - return rtc_register_device(pcap_rtc->rtc); + return devm_rtc_register_device(pcap_rtc->rtc); } static int __exit pcap_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-pcf2123.c b/drivers/rtc/rtc-pcf2123.c index c3691fa4210e..534ffc91eec1 100644 --- a/drivers/rtc/rtc-pcf2123.c +++ b/drivers/rtc/rtc-pcf2123.c @@ -434,7 +434,7 @@ static int pcf2123_probe(struct spi_device *spi) rtc->range_max = RTC_TIMESTAMP_END_2099; rtc->set_start_time = true; - ret = rtc_register_device(rtc); + ret = devm_rtc_register_device(rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c index 432cd627359b..33fa8b17b79c 100644 --- a/drivers/rtc/rtc-pcf2127.c +++ b/drivers/rtc/rtc-pcf2127.c @@ -682,7 +682,7 @@ static int pcf2127_probe(struct device *dev, struct regmap *regmap, return ret; } - return rtc_register_device(pcf2127->rtc); + return devm_rtc_register_device(pcf2127->rtc); } #ifdef CONFIG_OF diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c index c19f139e9b8d..e19cf2adbc35 100644 --- a/drivers/rtc/rtc-pcf85063.c +++ b/drivers/rtc/rtc-pcf85063.c @@ -614,7 +614,7 @@ static int 
pcf85063_probe(struct i2c_client *client) pcf85063_clkout_register_clk(pcf85063); #endif - return rtc_register_device(pcf85063->rtc); + return devm_rtc_register_device(pcf85063->rtc); } #ifdef CONFIG_OF diff --git a/drivers/rtc/rtc-pcf85363.c b/drivers/rtc/rtc-pcf85363.c index 23cf14ca2c96..a574c8d15a5c 100644 --- a/drivers/rtc/rtc-pcf85363.c +++ b/drivers/rtc/rtc-pcf85363.c @@ -418,7 +418,7 @@ static int pcf85363_probe(struct i2c_client *client, pcf85363->rtc->ops = &rtc_ops_alarm; } - ret = rtc_register_device(pcf85363->rtc); + ret = devm_rtc_register_device(pcf85363->rtc); for (i = 0; i < config->num_nvram; i++) { nvmem_cfg[i].priv = pcf85363; diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c index 2dc30eafa639..de3e6c355f2e 100644 --- a/drivers/rtc/rtc-pcf8563.c +++ b/drivers/rtc/rtc-pcf8563.c @@ -582,7 +582,7 @@ static int pcf8563_probe(struct i2c_client *client, } } - err = rtc_register_device(pcf8563->rtc); + err = devm_rtc_register_device(pcf8563->rtc); if (err) return err; diff --git a/drivers/rtc/rtc-pic32.c b/drivers/rtc/rtc-pic32.c index 2b6946744654..7fb9145c43bd 100644 --- a/drivers/rtc/rtc-pic32.c +++ b/drivers/rtc/rtc-pic32.c @@ -338,7 +338,7 @@ static int pic32_rtc_probe(struct platform_device *pdev) pdata->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; pdata->rtc->range_max = RTC_TIMESTAMP_END_2099; - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (ret) goto err_nortc; diff --git a/drivers/rtc/rtc-pl030.c b/drivers/rtc/rtc-pl030.c index ebe03eba8f5f..5a880516f3e8 100644 --- a/drivers/rtc/rtc-pl030.c +++ b/drivers/rtc/rtc-pl030.c @@ -121,7 +121,7 @@ static int pl030_probe(struct amba_device *dev, const struct amba_id *id) if (ret) goto err_irq; - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) goto err_reg; diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c index d4b2ab786126..224bbf096262 100644 --- a/drivers/rtc/rtc-pl031.c +++ b/drivers/rtc/rtc-pl031.c @@ -370,7 +370,7 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id) ldata->rtc->range_min = vendor->range_min; ldata->rtc->range_max = vendor->range_max; - ret = rtc_register_device(ldata->rtc); + ret = devm_rtc_register_device(ldata->rtc); if (ret) goto out; diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index b45ee2cb2c04..0d9dd6faabba 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -508,7 +508,7 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) return rc; } - return rtc_register_device(rtc_dd->rtc); + return devm_rtc_register_device(rtc_dd->rtc); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-ps3.c b/drivers/rtc/rtc-ps3.c index f0336d691e6c..6b098734c715 100644 --- a/drivers/rtc/rtc-ps3.c +++ b/drivers/rtc/rtc-ps3.c @@ -56,7 +56,7 @@ static int __init ps3_rtc_probe(struct platform_device *dev) platform_set_drvdata(dev, rtc); - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver ps3_rtc_driver = { diff --git a/drivers/rtc/rtc-r9701.c b/drivers/rtc/rtc-r9701.c index 7ceb968f0e44..60a3c3d7499b 100644 --- a/drivers/rtc/rtc-r9701.c +++ b/drivers/rtc/rtc-r9701.c @@ -127,7 +127,7 @@ static int r9701_probe(struct spi_device *spi) rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rtc->range_max = RTC_TIMESTAMP_END_2099; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct spi_driver r9701_driver = { diff --git a/drivers/rtc/rtc-rc5t619.c 
b/drivers/rtc/rtc-rc5t619.c index dd1a20977478..e73102a39f1b 100644 --- a/drivers/rtc/rtc-rc5t619.c +++ b/drivers/rtc/rtc-rc5t619.c @@ -426,7 +426,7 @@ static int rc5t619_rtc_probe(struct platform_device *pdev) dev_warn(&pdev->dev, "rc5t619 interrupt is disabled\n"); } - return rtc_register_device(rtc->rtc); + return devm_rtc_register_device(rtc->rtc); } static struct platform_driver rc5t619_rtc_driver = { diff --git a/drivers/rtc/rtc-rk808.c b/drivers/rtc/rtc-rk808.c index c0334c602e88..e920da8c08da 100644 --- a/drivers/rtc/rtc-rk808.c +++ b/drivers/rtc/rtc-rk808.c @@ -447,7 +447,7 @@ static int rk808_rtc_probe(struct platform_device *pdev) return ret; } - return rtc_register_device(rk808_rtc->rtc); + return devm_rtc_register_device(rk808_rtc->rtc); } static struct platform_driver rk808_rtc_driver = { diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c index 8bc476c0905f..44afa6d996e7 100644 --- a/drivers/rtc/rtc-rp5c01.c +++ b/drivers/rtc/rtc-rp5c01.c @@ -259,7 +259,7 @@ static int __init rp5c01_rtc_probe(struct platform_device *dev) if (error) return error; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver rp5c01_rtc_driver = { diff --git a/drivers/rtc/rtc-rs5c348.c b/drivers/rtc/rtc-rs5c348.c index 47c13678449e..fec633f80789 100644 --- a/drivers/rtc/rtc-rs5c348.c +++ b/drivers/rtc/rtc-rs5c348.c @@ -197,7 +197,7 @@ static int rs5c348_probe(struct spi_device *spi) rtc->ops = &rs5c348_rtc_ops; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct spi_driver rs5c348_driver = { diff --git a/drivers/rtc/rtc-rv3028.c b/drivers/rtc/rtc-rv3028.c index f788df979750..979407a51c7a 100644 --- a/drivers/rtc/rtc-rv3028.c +++ b/drivers/rtc/rtc-rv3028.c @@ -886,7 +886,7 @@ static int rv3028_probe(struct i2c_client *client) rv3028->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rv3028->rtc->range_max = RTC_TIMESTAMP_END_2099; rv3028->rtc->ops = &rv3028_rtc_ops; - ret = rtc_register_device(rv3028->rtc); + ret = devm_rtc_register_device(rv3028->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-rv3029c2.c b/drivers/rtc/rtc-rv3029c2.c index ad359b3b74b2..dc1bda62095e 100644 --- a/drivers/rtc/rtc-rv3029c2.c +++ b/drivers/rtc/rtc-rv3029c2.c @@ -750,7 +750,7 @@ static int rv3029_probe(struct device *dev, struct regmap *regmap, int irq, rv3029->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rv3029->rtc->range_max = RTC_TIMESTAMP_END_2079; - rc = rtc_register_device(rv3029->rtc); + rc = devm_rtc_register_device(rv3029->rtc); if (rc) return rc; diff --git a/drivers/rtc/rtc-rv3032.c b/drivers/rtc/rtc-rv3032.c index ed9cba3292e6..c9bcea727757 100644 --- a/drivers/rtc/rtc-rv3032.c +++ b/drivers/rtc/rtc-rv3032.c @@ -885,7 +885,7 @@ static int rv3032_probe(struct i2c_client *client) rv3032->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rv3032->rtc->range_max = RTC_TIMESTAMP_END_2099; rv3032->rtc->ops = &rv3032_rtc_ops; - ret = rtc_register_device(rv3032->rtc); + ret = devm_rtc_register_device(rv3032->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c index 44e1818a751c..d4ea6db51b26 100644 --- a/drivers/rtc/rtc-rv8803.c +++ b/drivers/rtc/rtc-rv8803.c @@ -587,7 +587,7 @@ static int rv8803_probe(struct i2c_client *client, rv8803->rtc->ops = &rv8803_rtc_ops; rv8803->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rv8803->rtc->range_max = RTC_TIMESTAMP_END_2099; - err = rtc_register_device(rv8803->rtc); + err = devm_rtc_register_device(rv8803->rtc); if (err) return err; diff --git 
a/drivers/rtc/rtc-rx8010.c b/drivers/rtc/rtc-rx8010.c index dca41a2a39b2..8340ab47a059 100644 --- a/drivers/rtc/rtc-rx8010.c +++ b/drivers/rtc/rtc-rx8010.c @@ -419,7 +419,7 @@ static int rx8010_probe(struct i2c_client *client) rx8010->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rx8010->rtc->range_max = RTC_TIMESTAMP_END_2099; - return rtc_register_device(rx8010->rtc); + return devm_rtc_register_device(rx8010->rtc); } static struct i2c_driver rx8010_driver = { diff --git a/drivers/rtc/rtc-rx8581.c b/drivers/rtc/rtc-rx8581.c index 017f74721cc0..de109139529b 100644 --- a/drivers/rtc/rtc-rx8581.c +++ b/drivers/rtc/rtc-rx8581.c @@ -298,7 +298,7 @@ static int rx8581_probe(struct i2c_client *client, rx8581->rtc->start_secs = 0; rx8581->rtc->set_start_time = true; - ret = rtc_register_device(rx8581->rtc); + ret = devm_rtc_register_device(rx8581->rtc); for (i = 0; i < config->num_nvram; i++) { nvmem_cfg[i].priv = rx8581; diff --git a/drivers/rtc/rtc-s35390a.c b/drivers/rtc/rtc-s35390a.c index 03672a246356..ea15d0392bb9 100644 --- a/drivers/rtc/rtc-s35390a.c +++ b/drivers/rtc/rtc-s35390a.c @@ -497,7 +497,7 @@ static int s35390a_probe(struct i2c_client *client, if (status1 & S35390A_FLAG_INT2) rtc_update_irq(s35390a->rtc, 1, RTC_AF); - return rtc_register_device(s35390a->rtc); + return devm_rtc_register_device(s35390a->rtc); } static struct i2c_driver s35390a_driver = { diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index 9ccc97cf5e09..1250887e4382 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -205,7 +205,7 @@ int sa1100_rtc_init(struct platform_device *pdev, struct sa1100_rtc *info) info->rtc->max_user_freq = RTC_FREQ; info->rtc->range_max = U32_MAX; - ret = rtc_register_device(info->rtc); + ret = devm_rtc_register_device(info->rtc); if (ret) { clk_disable_unprepare(info->clk); return ret; diff --git a/drivers/rtc/rtc-sc27xx.c b/drivers/rtc/rtc-sc27xx.c index a953bc0a5a5b..187aa955b79c 100644 --- a/drivers/rtc/rtc-sc27xx.c +++ b/drivers/rtc/rtc-sc27xx.c @@ -618,7 +618,7 @@ static int sprd_rtc_probe(struct platform_device *pdev) rtc->rtc->ops = &sprd_rtc_ops; rtc->rtc->range_min = 0; rtc->rtc->range_max = 5662310399LL; - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) { device_init_wakeup(&pdev->dev, 0); return ret; diff --git a/drivers/rtc/rtc-sd3078.c b/drivers/rtc/rtc-sd3078.c index a7aa943c1183..f6bee69ba017 100644 --- a/drivers/rtc/rtc-sd3078.c +++ b/drivers/rtc/rtc-sd3078.c @@ -192,7 +192,7 @@ static int sd3078_probe(struct i2c_client *client, sd3078->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; sd3078->rtc->range_max = RTC_TIMESTAMP_END_2099; - ret = rtc_register_device(sd3078->rtc); + ret = devm_rtc_register_device(sd3078->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index 9167b48014a1..cd146b574143 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -607,7 +607,7 @@ static int __init sh_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->range_max = mktime64(2098, 12, 31, 23, 59, 59); } - ret = rtc_register_device(rtc->rtc_dev); + ret = devm_rtc_register_device(rtc->rtc_dev); if (ret) goto err_unmap; diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c index abf19435dbad..03a6cca23201 100644 --- a/drivers/rtc/rtc-sirfsoc.c +++ b/drivers/rtc/rtc-sirfsoc.c @@ -356,7 +356,7 @@ static int sirfsoc_rtc_probe(struct platform_device *pdev) return err; } - return rtc_register_device(rtcdrv->rtc); + return devm_rtc_register_device(rtcdrv->rtc); } #ifdef 
CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c index a7d39a49b748..bd929b0e7d7d 100644 --- a/drivers/rtc/rtc-snvs.c +++ b/drivers/rtc/rtc-snvs.c @@ -387,7 +387,7 @@ static int snvs_rtc_probe(struct platform_device *pdev) data->rtc->ops = &snvs_rtc_ops; data->rtc->range_max = U32_MAX; - return rtc_register_device(data->rtc); + return devm_rtc_register_device(data->rtc); } static int __maybe_unused snvs_rtc_suspend_noirq(struct device *dev) diff --git a/drivers/rtc/rtc-st-lpc.c b/drivers/rtc/rtc-st-lpc.c index 0c65448b85ee..bdb20f63254e 100644 --- a/drivers/rtc/rtc-st-lpc.c +++ b/drivers/rtc/rtc-st-lpc.c @@ -250,7 +250,7 @@ static int st_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->range_max = U64_MAX; do_div(rtc->rtc_dev->range_max, rtc->clkrate); - ret = rtc_register_device(rtc->rtc_dev); + ret = devm_rtc_register_device(rtc->rtc_dev); if (ret) { clk_disable_unprepare(rtc->clk); return ret; diff --git a/drivers/rtc/rtc-starfire.c b/drivers/rtc/rtc-starfire.c index 37a26279e107..fbd1ed41cbf1 100644 --- a/drivers/rtc/rtc-starfire.c +++ b/drivers/rtc/rtc-starfire.c @@ -48,7 +48,7 @@ static int __init starfire_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc); - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver starfire_rtc_driver = { diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index ad616bce7bca..7cb6be1b7815 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -317,7 +317,7 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev) if (ret) return ret; - return rtc_register_device(pdata->rtc); + return devm_rtc_register_device(pdata->rtc); } /* work with hotplug and coldplug */ diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 0a969af80af7..40c0f7ed36e0 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -366,7 +366,7 @@ static int stmp3xxx_rtc_probe(struct platform_device *pdev) rtc_data->rtc->ops = &stmp3xxx_rtc_ops; rtc_data->rtc->range_max = U32_MAX; - err = rtc_register_device(rtc_data->rtc); + err = devm_rtc_register_device(rtc_data->rtc); if (err) return err; diff --git a/drivers/rtc/rtc-sun4v.c b/drivers/rtc/rtc-sun4v.c index 036463dfa103..a86e27de8c06 100644 --- a/drivers/rtc/rtc-sun4v.c +++ b/drivers/rtc/rtc-sun4v.c @@ -86,7 +86,7 @@ static int __init sun4v_rtc_probe(struct platform_device *pdev) rtc->range_max = U64_MAX; platform_set_drvdata(pdev, rtc); - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver sun4v_rtc_driver = { diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c index f2818cdd11d8..adec1b14a8de 100644 --- a/drivers/rtc/rtc-sun6i.c +++ b/drivers/rtc/rtc-sun6i.c @@ -726,7 +726,7 @@ static int sun6i_rtc_probe(struct platform_device *pdev) chip->rtc->ops = &sun6i_rtc_ops; chip->rtc->range_max = 2019686399LL; /* 2033-12-31 23:59:59 */ - ret = rtc_register_device(chip->rtc); + ret = devm_rtc_register_device(chip->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-sunxi.c b/drivers/rtc/rtc-sunxi.c index f5d7f44550ce..5d019e3a835a 100644 --- a/drivers/rtc/rtc-sunxi.c +++ b/drivers/rtc/rtc-sunxi.c @@ -470,7 +470,7 @@ static int sunxi_rtc_probe(struct platform_device *pdev) chip->rtc->ops = &sunxi_rtc_ops; - return rtc_register_device(chip->rtc); + return devm_rtc_register_device(chip->rtc); } static struct platform_driver sunxi_rtc_driver = { diff --git a/drivers/rtc/rtc-tegra.c 
b/drivers/rtc/rtc-tegra.c index 7fbb1741692f..8925015cc698 100644 --- a/drivers/rtc/rtc-tegra.c +++ b/drivers/rtc/rtc-tegra.c @@ -329,7 +329,7 @@ static int tegra_rtc_probe(struct platform_device *pdev) goto disable_clk; } - ret = rtc_register_device(info->rtc); + ret = devm_rtc_register_device(info->rtc); if (ret) goto disable_clk; diff --git a/drivers/rtc/rtc-test.c b/drivers/rtc/rtc-test.c index 74b3a0603b73..b092a1648513 100644 --- a/drivers/rtc/rtc-test.c +++ b/drivers/rtc/rtc-test.c @@ -139,7 +139,7 @@ static int test_probe(struct platform_device *plat_dev) timer_setup(&rtd->alarm, test_rtc_alarm_handler, 0); rtd->alarm.expires = 0; - return rtc_register_device(rtd->rtc); + return devm_rtc_register_device(rtd->rtc); } static struct platform_driver test_driver = { diff --git a/drivers/rtc/rtc-tps6586x.c b/drivers/rtc/rtc-tps6586x.c index e39af2d67051..a980337c3065 100644 --- a/drivers/rtc/rtc-tps6586x.c +++ b/drivers/rtc/rtc-tps6586x.c @@ -280,7 +280,7 @@ static int tps6586x_rtc_probe(struct platform_device *pdev) goto fail_rtc_register; } - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) goto fail_rtc_register; diff --git a/drivers/rtc/rtc-tps65910.c b/drivers/rtc/rtc-tps65910.c index e3840386f430..2d87b62826a8 100644 --- a/drivers/rtc/rtc-tps65910.c +++ b/drivers/rtc/rtc-tps65910.c @@ -434,7 +434,7 @@ static int tps65910_rtc_probe(struct platform_device *pdev) tps_rtc->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; tps_rtc->rtc->range_max = RTC_TIMESTAMP_END_2099; - return rtc_register_device(tps_rtc->rtc); + return devm_rtc_register_device(tps_rtc->rtc); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index 11f46272bad3..c3309db5448d 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -275,7 +275,7 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) if (ret) return ret; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static int __exit tx4939_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index c3671043ace7..5a9f9ad86d32 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -335,7 +335,7 @@ static int rtc_probe(struct platform_device *pdev) dev_info(&pdev->dev, "Real Time Clock of NEC VR4100 series\n"); - retval = rtc_register_device(rtc); + retval = devm_rtc_register_device(rtc); if (retval) goto err_iounmap_all; diff --git a/drivers/rtc/rtc-vt8500.c b/drivers/rtc/rtc-vt8500.c index e2588625025f..197b649cd629 100644 --- a/drivers/rtc/rtc-vt8500.c +++ b/drivers/rtc/rtc-vt8500.c @@ -232,7 +232,7 @@ static int vt8500_rtc_probe(struct platform_device *pdev) return ret; } - return rtc_register_device(vt8500_rtc->rtc); + return devm_rtc_register_device(vt8500_rtc->rtc); } static int vt8500_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-wilco-ec.c b/drivers/rtc/rtc-wilco-ec.c index ff46066a68a4..2a205a646452 100644 --- a/drivers/rtc/rtc-wilco-ec.c +++ b/drivers/rtc/rtc-wilco-ec.c @@ -176,7 +176,7 @@ static int wilco_ec_rtc_probe(struct platform_device *pdev) rtc->range_max = RTC_TIMESTAMP_END_2099; rtc->owner = THIS_MODULE; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver wilco_ec_rtc_driver = { diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c index ccef887d2690..640833e21057 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -429,7 +429,7 @@ static int 
wm831x_rtc_probe(struct platform_device *pdev) wm831x_rtc->rtc->ops = &wm831x_rtc_ops; wm831x_rtc->rtc->range_max = U32_MAX; - ret = rtc_register_device(wm831x_rtc->rtc); + ret = devm_rtc_register_device(wm831x_rtc->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-xgene.c b/drivers/rtc/rtc-xgene.c index 96db441f92b3..cf68a9b1c9eb 100644 --- a/drivers/rtc/rtc-xgene.c +++ b/drivers/rtc/rtc-xgene.c @@ -185,7 +185,7 @@ static int xgene_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &xgene_rtc_ops; pdata->rtc->range_max = U32_MAX; - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (ret) { clk_disable_unprepare(pdata->clk); return ret; diff --git a/drivers/rtc/rtc-zynqmp.c b/drivers/rtc/rtc-zynqmp.c index 4b1077e2f826..f440bb52be92 100644 --- a/drivers/rtc/rtc-zynqmp.c +++ b/drivers/rtc/rtc-zynqmp.c @@ -264,7 +264,7 @@ static int xlnx_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 1); - return rtc_register_device(xrtcdev->rtc); + return devm_rtc_register_device(xrtcdev->rtc); } static int xlnx_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/sysfs.c b/drivers/rtc/sysfs.c index 950fac0d41ff..8a957d31a1a4 100644 --- a/drivers/rtc/sysfs.c +++ b/drivers/rtc/sysfs.c @@ -317,8 +317,6 @@ int rtc_add_groups(struct rtc_device *rtc, const struct attribute_group **grps) size_t old_cnt = 0, add_cnt = 0, new_cnt; const struct attribute_group **groups, **old; - if (rtc->registered) - return -EINVAL; if (!grps) return -EINVAL; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index cbca651d8ca4..55e7beed066c 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -118,8 +118,6 @@ struct rtc_device { */ long set_offset_nsec; - bool registered; - time64_t range_min; timeu64_t range_max; time64_t start_secs; @@ -157,7 +155,7 @@ extern struct rtc_device *devm_rtc_device_register(struct device *dev, const struct rtc_class_ops *ops, struct module *owner); struct rtc_device *devm_rtc_allocate_device(struct device *dev); -int __rtc_register_device(struct module *owner, struct rtc_device *rtc); +int __devm_rtc_register_device(struct module *owner, struct rtc_device *rtc); extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); @@ -234,8 +232,8 @@ static inline bool rtc_tv_nsec_ok(s64 set_offset_nsec, return false; } -#define rtc_register_device(device) \ - __rtc_register_device(THIS_MODULE, device) +#define devm_rtc_register_device(device) \ + __devm_rtc_register_device(THIS_MODULE, device) #ifdef CONFIG_RTC_HCTOSYS_DEVICE extern int rtc_hctosys_ret; -- cgit From e1cef2d4c379b2aab43a7dc9601f645048209090 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 19 Nov 2020 14:53:40 +0900 Subject: tools/bootconfig: Align the bootconfig applied initrd image size to 4 Align the bootconfig-applied initrd image size to 4 bytes. To fill the gap, the bootconfig command uses null characters between the bootconfig data and the footer. This expands the footer size but doesn't change the checksum. Thus the block image of the initrd file with bootconfig is as follows.
[initrd][bootconfig][(pad)][size][csum]["#BOOTCONFIG\n"] Link: https://lkml.kernel.org/r/160576522046.320071.8550680670010950634.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 3 ++ tools/bootconfig/main.c | 57 ++++++++++++++++++++++--------------- tools/bootconfig/test-bootconfig.sh | 6 +++- 3 files changed, 42 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 9903088891fa..2696eb0fc149 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -12,6 +12,9 @@ #define BOOTCONFIG_MAGIC "#BOOTCONFIG\n" #define BOOTCONFIG_MAGIC_LEN 12 +#define BOOTCONFIG_ALIGN_SHIFT 2 +#define BOOTCONFIG_ALIGN (1 << BOOTCONFIG_ALIGN_SHIFT) +#define BOOTCONFIG_ALIGN_MASK (BOOTCONFIG_ALIGN - 1) /* XBC tree node */ struct xbc_node { diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index a0733cbb3c49..4a445b6304bb 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -337,12 +337,13 @@ static int delete_xbc(const char *path) static int apply_xbc(const char *path, const char *xbc_path) { + char *buf, *data, *p; + size_t total_size; struct stat stat; + const char *msg; u32 size, csum; - char *buf, *data; + int pos, pad; int ret, fd; - const char *msg; - int pos; ret = load_xbc_file(xbc_path, &buf); if (ret < 0) { @@ -352,13 +353,12 @@ static int apply_xbc(const char *path, const char *xbc_path) size = strlen(buf) + 1; csum = checksum((unsigned char *)buf, size); - /* Prepare xbc_path data */ - data = malloc(size + 8); + /* Backup the bootconfig data */ + data = calloc(size + BOOTCONFIG_ALIGN + + sizeof(u32) + sizeof(u32) + BOOTCONFIG_MAGIC_LEN, 1); if (!data) return -ENOMEM; - strcpy(data, buf); - *(u32 *)(data + size) = size; - *(u32 *)(data + size + 4) = csum; + memcpy(data, buf, size); /* Check the data format */ ret = xbc_init(buf, &msg, &pos); @@ -399,24 +399,35 @@ static int apply_xbc(const char *path, const char *xbc_path) pr_err("Failed to get the size of %s\n", path); goto out; } - ret = write(fd, data, size + 8); - if (ret < size + 8) { + + /* To align up the total size to BOOTCONFIG_ALIGN, get padding size */ + total_size = stat.st_size + size + sizeof(u32) * 2 + BOOTCONFIG_MAGIC_LEN; + pad = ((total_size + BOOTCONFIG_ALIGN - 1) & (~BOOTCONFIG_ALIGN_MASK)) - total_size; + size += pad; + + /* Add a footer */ + p = data + size; + *(u32 *)p = size; + p += sizeof(u32); + + *(u32 *)p = csum; + p += sizeof(u32); + + memcpy(p, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN); + p += BOOTCONFIG_MAGIC_LEN; + + total_size = p - data; + + ret = write(fd, data, total_size); + if (ret < total_size) { if (ret < 0) ret = -errno; pr_err("Failed to apply a boot config: %d\n", ret); - if (ret < 0) - goto out; - goto out_rollback; - } - /* Write a magic word of the bootconfig */ - ret = write(fd, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN); - if (ret < BOOTCONFIG_MAGIC_LEN) { - if (ret < 0) - ret = -errno; - pr_err("Failed to apply a boot config magic: %d\n", ret); - goto out_rollback; - } - ret = 0; + if (ret >= 0) + goto out_rollback; + } else + ret = 0; + out: close(fd); free(data); diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh index d295e406a756..baed891d0ba4 100755 --- a/tools/bootconfig/test-bootconfig.sh +++ b/tools/bootconfig/test-bootconfig.sh @@ -9,6 +9,7 @@ else TESTDIR=. 
fi BOOTCONF=${TESTDIR}/bootconfig +ALIGN=4 INITRD=`mktemp ${TESTDIR}/initrd-XXXX` TEMPCONF=`mktemp ${TESTDIR}/temp-XXXX.bconf` @@ -59,7 +60,10 @@ echo "Show command test" xpass $BOOTCONF $INITRD echo "File size check" -xpass test $new_size -eq $(expr $bconf_size + $initrd_size + 9 + 12) +total_size=$(expr $bconf_size + $initrd_size + 9 + 12 + $ALIGN - 1 ) +total_size=$(expr $total_size / $ALIGN) +total_size=$(expr $total_size \* $ALIGN) +xpass test $new_size -eq $total_size echo "Apply command repeat test" xpass $BOOTCONF -a $TEMPCONF $INITRD -- cgit From bc1c99a5971aa7571e8b9731c28fa32abe12cab8 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 17 Nov 2020 20:49:52 +0800 Subject: EDAC: Add DDR5 new memory type Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]' for the new DDR5 memory type. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index eef8724faae0..f6d462d0be2d 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -163,6 +163,7 @@ const char * const edac_mem_types[] = { [MEM_RDDR4] = "Registered-DDR4", [MEM_LPDDR4] = "Low-Power-DDR4-RAM", [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", + [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", [MEM_WIO2] = "Wide-IO-2", }; diff --git a/include/linux/edac.h b/include/linux/edac.h index 8f63245f7f7c..e64b73b556eb 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -181,6 +181,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * This is a variant of the DDR4 memories. * @MEM_LRDDR4: Load-Reduced DDR4 memory. * @MEM_LPDDR4: Low-Power DDR4 memory. + * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM * @MEM_WIO2: Wide I/O 2. */ @@ -208,6 +209,7 @@ enum mem_type { MEM_RDDR4, MEM_LRDDR4, MEM_LPDDR4, + MEM_DDR5, MEM_NVDIMM, MEM_WIO2, }; @@ -234,6 +236,7 @@ enum mem_type { #define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) #define MEM_FLAG_LPDDR4 BIT(MEM_LPDDR4) +#define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) #define MEM_FLAG_WIO2 BIT(MEM_WIO2) -- cgit From dfe564045c653d9e6969ccca57a8a04771d333f7 Mon Sep 17 00:00:00 2001 From: chao Date: Sun, 30 Aug 2020 23:41:17 -0700 Subject: rcu: Panic after fixed number of stalls Some stalls are transient, so that the system fully recovers. This commit therefore allows users to configure the number of stalls that must happen in order to trigger a kernel panic. Signed-off-by: chao Signed-off-by: Paul E. McKenney --- include/linux/kernel.h | 1 + kernel/rcu/tree_stall.h | 6 ++++++ kernel/sysctl.c | 11 +++++++++++ 3 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2f05e9128201..4b5fd3da5fe8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -536,6 +536,7 @@ extern int panic_on_warn; extern unsigned long panic_on_taint; extern bool panic_on_taint_nousertaint; extern int sysctl_panic_on_rcu_stall; +extern int sysctl_max_rcu_stall_to_panic; extern int sysctl_panic_on_stackoverflow; extern bool crash_kexec_post_notifiers; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index ca21d28a0f98..70d48c52fabc 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -13,6 +13,7 @@ /* panic() on RCU Stall sysctl.
*/ int sysctl_panic_on_rcu_stall __read_mostly; +int sysctl_max_rcu_stall_to_panic __read_mostly; #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) @@ -106,6 +107,11 @@ early_initcall(check_cpu_stall_init); /* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ static void panic_on_rcu_stall(void) { + static int cpu_stall; + + if (++cpu_stall < sysctl_max_rcu_stall_to_panic) + return; + if (sysctl_panic_on_rcu_stall) panic("RCU Stall\n"); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afad085960b8..c9fbdd848138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2650,6 +2650,17 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif +#if defined(CONFIG_TREE_RCU) + { + .procname = "max_rcu_stall_to_panic", + .data = &sysctl_max_rcu_stall_to_panic, + .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, + }, +#endif #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE { .procname = "stack_erasing", -- cgit From 1eafe075bf9cb4db575be4ddf1b1c8256758714a Mon Sep 17 00:00:00 2001 From: Asif Rasheed Date: Sun, 20 Sep 2020 17:31:54 +0400 Subject: list.h: Update comment to explicitly note circular lists The students in the Operating System Lecture Section at the American University of Sharjah were confused by the header comment in include/linux/list.h, which says "Simple doubly linked list implementation". This comment means "simple" as in "not complex", but "simple" is often used in this context to mean "not circular". This commit therefore avoids this ambiguity by explicitly calling out "circular". Signed-off-by: Asif Rasheed Signed-off-by: Paul E. McKenney --- include/linux/list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index a18c87b63376..89bdc92e75c3 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -9,7 +9,7 @@ #include /* - * Simple doubly linked list implementation. + * Circular doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as -- cgit From 2bf31d94423c8ae3ff58e38a115b177df6940399 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:18:08 +0100 Subject: jbd2: fix kernel-doc markups Kernel-doc markups should use this format: identifier - description. They should not have any type before that, as otherwise the parser won't do the right thing. Also, some identifiers have different names between their prototypes and the kernel-doc markup. Reviewed-by: Jan Kara Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/72f5c6628f5f278d67625f60893ffbc2ca28d46e.1605521731.git.mchehab+huawei@kernel.org Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 34 ++++++++++++++++++---------------- fs/jbd2/transaction.c | 31 ++++++++++++++++--------------- include/linux/jbd2.h | 2 +- 3 files changed, 35 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0c3d5e3b24b2..188f79d76988 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -566,12 +566,14 @@ static int __jbd2_journal_force_commit(journal_t *journal) } /** - * Force and wait upon a commit if the calling process is not within - * transaction. This is used for forcing out undo-protected data which contains - * bitmaps, when the fs is running out of space.
+ * jbd2_journal_force_commit_nested - Force and wait upon a commit if the + * calling process is not within transaction. * * @journal: journal to force * Returns true if progress was made. + * + * This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. */ int jbd2_journal_force_commit_nested(journal_t *journal) { @@ -582,7 +584,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal) } /** - * int journal_force_commit() - force any uncommitted transactions + * jbd2_journal_force_commit() - force any uncommitted transactions * @journal: journal to force * * Caller want unconditional commit. We can only force the running transaction @@ -1881,7 +1883,7 @@ static int load_superblock(journal_t *journal) /** - * int jbd2_journal_load() - Read journal from disk. + * jbd2_journal_load() - Read journal from disk. * @journal: Journal to act on. * * Given a journal_t structure which tells us which disk blocks contain @@ -1951,7 +1953,7 @@ recovery_error: } /** - * void jbd2_journal_destroy() - Release a journal_t structure. + * jbd2_journal_destroy() - Release a journal_t structure. * @journal: Journal to act on. * * Release a journal_t structure once it is no longer in use by the @@ -2028,7 +2030,7 @@ int jbd2_journal_destroy(journal_t *journal) /** - *int jbd2_journal_check_used_features() - Check if features specified are used. + * jbd2_journal_check_used_features() - Check if features specified are used. * @journal: Journal to check. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -2063,7 +2065,7 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, } /** - * int jbd2_journal_check_available_features() - Check feature set in journalling layer + * jbd2_journal_check_available_features() - Check feature set in journalling layer * @journal: Journal to check. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -2126,7 +2128,7 @@ jbd2_journal_initialize_fast_commit(journal_t *journal) } /** - * int jbd2_journal_set_features() - Mark a given journal feature in the superblock + * jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -2217,7 +2219,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, } /* - * jbd2_journal_clear_features () - Clear a given journal feature in the + * jbd2_journal_clear_features() - Clear a given journal feature in the * superblock * @journal: Journal to act on. * @compat: bitmask of compatible features @@ -2246,7 +2248,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, EXPORT_SYMBOL(jbd2_journal_clear_features); /** - * int jbd2_journal_flush () - Flush journal + * jbd2_journal_flush() - Flush journal * @journal: Journal to act on. * * Flush all data for a given journal to disk and empty the journal. @@ -2321,7 +2323,7 @@ out: } /** - * int jbd2_journal_wipe() - Wipe journal contents + * jbd2_journal_wipe() - Wipe journal contents * @journal: Journal to act on. * @write: flag (see below) * @@ -2362,7 +2364,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) } /** - * void jbd2_journal_abort () - Shutdown the journal immediately. + * jbd2_journal_abort () - Shutdown the journal immediately. * @journal: the journal to shutdown. 
* @errno: an error number to record in the journal indicating * the reason for the shutdown. @@ -2453,7 +2455,7 @@ void jbd2_journal_abort(journal_t *journal, int errno) } /** - * int jbd2_journal_errno () - returns the journal's error state. + * jbd2_journal_errno() - returns the journal's error state. * @journal: journal to examine. * * This is the errno number set with jbd2_journal_abort(), the last @@ -2477,7 +2479,7 @@ int jbd2_journal_errno(journal_t *journal) } /** - * int jbd2_journal_clear_err () - clears the journal's error state + * jbd2_journal_clear_err() - clears the journal's error state * @journal: journal to act on. * * An error must be cleared or acked to take a FS out of readonly @@ -2497,7 +2499,7 @@ int jbd2_journal_clear_err(journal_t *journal) } /** - * void jbd2_journal_ack_err() - Ack journal err. + * jbd2_journal_ack_err() - Ack journal err. * @journal: journal to act on. * * An error must be cleared or acked to take a FS out of readonly diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d54f04674e8e..9396666b7314 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -519,7 +519,7 @@ EXPORT_SYMBOL(jbd2__journal_start); /** - * handle_t *jbd2_journal_start() - Obtain a new handle. + * jbd2_journal_start() - Obtain a new handle. * @journal: Journal to start transaction on. * @nblocks: number of block buffer we might modify * @@ -566,7 +566,7 @@ void jbd2_journal_free_reserved(handle_t *handle) EXPORT_SYMBOL(jbd2_journal_free_reserved); /** - * int jbd2_journal_start_reserved() - start reserved handle + * jbd2_journal_start_reserved() - start reserved handle * @handle: handle to start * @type: for handle statistics * @line_no: for handle statistics @@ -620,7 +620,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, EXPORT_SYMBOL(jbd2_journal_start_reserved); /** - * int jbd2_journal_extend() - extend buffer credits. + * jbd2_journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. * @revoke_records: number of revoke records to try to extend by. @@ -745,7 +745,7 @@ static void stop_this_handle(handle_t *handle) } /** - * int jbd2_journal_restart() - restart a handle . + * jbd2__journal_restart() - restart a handle . * @handle: handle to restart * @nblocks: nr credits requested * @revoke_records: number of revoke record credits requested @@ -815,7 +815,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) EXPORT_SYMBOL(jbd2_journal_restart); /** - * void jbd2_journal_lock_updates () - establish a transaction barrier. + * jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. * * This locks out any further updates from being started, and blocks @@ -874,7 +874,7 @@ void jbd2_journal_lock_updates(journal_t *journal) } /** - * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier + * jbd2_journal_unlock_updates () - release barrier * @journal: Journal to release the barrier on. * * Release a transaction barrier obtained with jbd2_journal_lock_updates(). @@ -1182,7 +1182,8 @@ out: } /** - * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * jbd2_journal_get_write_access() - notify intent to modify a buffer + * for metadata (not data) update. 
* @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes * * @@ -1226,7 +1227,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) * unlocked buffer beforehand. */ /** - * int jbd2_journal_get_create_access () - notify intent to use newly created bh + * jbd2_journal_get_create_access () - notify intent to use newly created bh * @handle: transaction to new buffer to * @bh: new buffer. * @@ -1306,7 +1307,7 @@ out: } /** - * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with + * jbd2_journal_get_undo_access() - Notify intent to modify metadata with * non-rewindable consequences * @handle: transaction * @bh: buffer to undo @@ -1383,7 +1384,7 @@ out: } /** - * void jbd2_journal_set_triggers() - Add triggers for commit writeout + * jbd2_journal_set_triggers() - Add triggers for commit writeout * @bh: buffer to trigger on * @type: struct jbd2_buffer_trigger_type containing the trigger(s). * @@ -1425,7 +1426,7 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, } /** - * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata + * jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark * @@ -1593,7 +1594,7 @@ out: } /** - * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. + * jbd2_journal_forget() - bforget() for potentially-journaled buffers. * @handle: transaction handle * @bh: bh to 'forget' * @@ -1762,7 +1763,7 @@ drop: } /** - * int jbd2_journal_stop() - complete a transaction + * jbd2_journal_stop() - complete a transaction * @handle: transaction to complete. * * All done for a particular handle. @@ -2080,7 +2081,7 @@ out: } /** - * int jbd2_journal_try_to_free_buffers() - try to free page buffers. + * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free * @@ -2411,7 +2412,7 @@ zap_buffer_unlocked: } /** - * void jbd2_journal_invalidatepage() + * jbd2_journal_invalidatepage() * @journal: journal to use for flush... * @page: page to flush * @offset: start of the range to invalidate diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1c49fd62ff2e..578ff196b3ce 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -401,7 +401,7 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** - * struct jbd_inode - The jbd_inode type is the structure linking inodes in + * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. */ struct jbd2_inode { -- cgit From a24d22b225ce158651378869a6b88105c4bdb887 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Nov 2020 21:20:21 -0800 Subject: crypto: sha - split sha.h into sha1.h and sha2.h Currently <crypto/sha.h> contains declarations for both SHA-1 and SHA-2, and <crypto/sha3.h> contains declarations for SHA-3. This organization is inconsistent, but more importantly SHA-1 is no longer considered to be cryptographically secure. So to the extent possible, SHA-1 shouldn't be grouped together with any of the other SHA versions, and usage of it should be phased out. Therefore, split <crypto/sha.h> into two headers <crypto/sha1.h> and <crypto/sha2.h>, and make everyone explicitly specify whether they want the declarations for SHA-1, SHA-2, or both. This avoids making the SHA-1 declarations visible to files that don't want anything to do with SHA-1.
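As a before/after sketch of the conversion this forces on users (distilled from the per-file hunks below; only the header names introduced by this patch are assumed):

/* before: one catch-all header served every SHA-1 and SHA-2 user */
#include <crypto/sha.h>

/* after: each file states exactly which family it needs */
#include <crypto/sha1.h>	/* only where SHA-1 is unavoidable */
#include <crypto/sha2.h>	/* SHA-224/256/384/512 */
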
It also prepares for potentially moving sha1.h into a new insecure/ or dangerous/ directory. Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Acked-by: Jason A. Donenfeld Signed-off-by: Herbert Xu --- arch/arm/crypto/sha1-ce-glue.c | 2 +- arch/arm/crypto/sha1.h | 2 +- arch/arm/crypto/sha1_glue.c | 2 +- arch/arm/crypto/sha1_neon_glue.c | 2 +- arch/arm/crypto/sha2-ce-glue.c | 2 +- arch/arm/crypto/sha256_glue.c | 2 +- arch/arm/crypto/sha256_neon_glue.c | 2 +- arch/arm/crypto/sha512-glue.c | 2 +- arch/arm/crypto/sha512-neon-glue.c | 2 +- arch/arm64/crypto/aes-glue.c | 2 +- arch/arm64/crypto/sha1-ce-glue.c | 2 +- arch/arm64/crypto/sha2-ce-glue.c | 2 +- arch/arm64/crypto/sha256-glue.c | 2 +- arch/arm64/crypto/sha512-ce-glue.c | 2 +- arch/arm64/crypto/sha512-glue.c | 2 +- arch/mips/cavium-octeon/crypto/octeon-sha1.c | 2 +- arch/mips/cavium-octeon/crypto/octeon-sha256.c | 2 +- arch/mips/cavium-octeon/crypto/octeon-sha512.c | 2 +- arch/powerpc/crypto/sha1-spe-glue.c | 2 +- arch/powerpc/crypto/sha1.c | 2 +- arch/powerpc/crypto/sha256-spe-glue.c | 2 +- arch/s390/crypto/sha.h | 3 +- arch/s390/crypto/sha1_s390.c | 2 +- arch/s390/crypto/sha256_s390.c | 2 +- arch/s390/crypto/sha3_256_s390.c | 1 - arch/s390/crypto/sha3_512_s390.c | 1 - arch/s390/crypto/sha512_s390.c | 2 +- arch/s390/purgatory/purgatory.c | 2 +- arch/sparc/crypto/sha1_glue.c | 2 +- arch/sparc/crypto/sha256_glue.c | 2 +- arch/sparc/crypto/sha512_glue.c | 2 +- arch/x86/crypto/sha1_ssse3_glue.c | 2 +- arch/x86/crypto/sha256_ssse3_glue.c | 2 +- arch/x86/crypto/sha512_ssse3_glue.c | 2 +- arch/x86/purgatory/purgatory.c | 2 +- crypto/asymmetric_keys/asym_tpm.c | 2 +- crypto/sha1_generic.c | 2 +- crypto/sha256_generic.c | 2 +- crypto/sha512_generic.c | 2 +- drivers/char/random.c | 2 +- drivers/crypto/allwinner/sun4i-ss/sun4i-ss.h | 2 +- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c | 3 +- drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h | 3 +- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c | 3 +- drivers/crypto/allwinner/sun8i-ss/sun8i-ss.h | 3 +- drivers/crypto/amcc/crypto4xx_alg.c | 2 +- drivers/crypto/amcc/crypto4xx_core.c | 2 +- drivers/crypto/atmel-authenc.h | 3 +- drivers/crypto/atmel-sha.c | 3 +- drivers/crypto/axis/artpec6_crypto.c | 3 +- drivers/crypto/bcm/cipher.c | 3 +- drivers/crypto/bcm/cipher.h | 3 +- drivers/crypto/bcm/spu.h | 3 +- drivers/crypto/caam/compat.h | 3 +- drivers/crypto/cavium/nitrox/nitrox_aead.c | 1 - drivers/crypto/ccp/ccp-crypto-sha.c | 3 +- drivers/crypto/ccp/ccp-crypto.h | 3 +- drivers/crypto/ccree/cc_driver.h | 3 +- drivers/crypto/chelsio/chcr_algo.c | 3 +- drivers/crypto/hisilicon/sec2/sec_crypto.c | 3 +- drivers/crypto/img-hash.c | 3 +- drivers/crypto/inside-secure/safexcel.h | 3 +- drivers/crypto/inside-secure/safexcel_cipher.c | 3 +- drivers/crypto/inside-secure/safexcel_hash.c | 3 +- drivers/crypto/ixp4xx_crypto.c | 2 +- drivers/crypto/marvell/cesa/hash.c | 3 +- drivers/crypto/marvell/octeontx/otx_cptvf_algs.c | 3 +- drivers/crypto/mediatek/mtk-sha.c | 3 +- drivers/crypto/mxs-dcp.c | 3 +- drivers/crypto/n2_core.c | 3 +- drivers/crypto/nx/nx-sha256.c | 2 +- drivers/crypto/nx/nx-sha512.c | 2 +- drivers/crypto/nx/nx.c | 2 +- drivers/crypto/omap-sham.c | 3 +- drivers/crypto/padlock-sha.c | 3 +- drivers/crypto/picoxcell_crypto.c | 3 +- drivers/crypto/qat/qat_common/qat_algs.c | 3 +- drivers/crypto/qce/common.c | 3 +- drivers/crypto/qce/core.c | 1 - drivers/crypto/qce/sha.h | 3 +- drivers/crypto/rockchip/rk3288_crypto.h | 3 +- drivers/crypto/s5p-sss.c | 3 +- drivers/crypto/sa2ul.c | 3 +- 
drivers/crypto/sa2ul.h | 3 +- drivers/crypto/sahara.c | 3 +- drivers/crypto/stm32/stm32-hash.c | 3 +- drivers/crypto/talitos.c | 3 +- drivers/crypto/ux500/hash/hash_core.c | 3 +- drivers/firmware/efi/embedded-firmware.c | 2 +- .../chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c | 3 +- .../ethernet/chelsio/inline_crypto/chtls/chtls.h | 3 +- drivers/nfc/s3fwrn5/firmware.c | 2 +- drivers/tee/tee_core.c | 2 +- fs/crypto/fname.c | 2 +- fs/crypto/hkdf.c | 2 +- fs/ubifs/auth.c | 1 - fs/verity/fsverity_private.h | 2 +- include/crypto/hash_info.h | 3 +- include/crypto/sha.h | 167 --------------------- include/crypto/sha1.h | 46 ++++++ include/crypto/sha1_base.h | 2 +- include/crypto/sha2.h | 134 +++++++++++++++++ include/crypto/sha256_base.h | 2 +- include/crypto/sha512_base.h | 2 +- include/linux/ccp.h | 3 +- include/linux/filter.h | 2 +- include/linux/purgatory.h | 2 +- kernel/crash_core.c | 2 +- kernel/kexec_core.c | 1 - kernel/kexec_file.c | 2 +- lib/crypto/sha256.c | 2 +- lib/digsig.c | 2 +- lib/sha1.c | 2 +- net/ipv6/seg6_hmac.c | 1 - net/mptcp/crypto.c | 2 +- net/mptcp/options.c | 2 +- net/mptcp/subflow.c | 2 +- security/integrity/integrity.h | 2 +- security/keys/encrypted-keys/encrypted.c | 2 +- security/keys/trusted-keys/trusted_tpm1.c | 2 +- sound/soc/codecs/cros_ec_codec.c | 2 +- 121 files changed, 335 insertions(+), 285 deletions(-) delete mode 100644 include/crypto/sha.h create mode 100644 include/crypto/sha1.h create mode 100644 include/crypto/sha2.h (limited to 'include/linux') diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c index e79b1fb4b4dc..de9100c67b37 100644 --- a/arch/arm/crypto/sha1-ce-glue.c +++ b/arch/arm/crypto/sha1-ce-glue.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha1.h b/arch/arm/crypto/sha1.h index 758db3e9ff0a..b1b7e21da2c3 100644 --- a/arch/arm/crypto/sha1.h +++ b/arch/arm/crypto/sha1.h @@ -3,7 +3,7 @@ #define ASM_ARM_CRYPTO_SHA1_H #include -#include +#include extern int sha1_update_arm(struct shash_desc *desc, const u8 *data, unsigned int len); diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c index 4e954b3f7ecd..6c2b849e459d 100644 --- a/arch/arm/crypto/sha1_glue.c +++ b/arch/arm/crypto/sha1_glue.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c index 0071e5e4411a..cfe36ae0f3f5 100644 --- a/arch/arm/crypto/sha1_neon_glue.c +++ b/arch/arm/crypto/sha1_neon_glue.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c index 87f0b62386c6..c62ce89dd3e0 100644 --- a/arch/arm/crypto/sha2-ce-glue.c +++ b/arch/arm/crypto/sha2-ce-glue.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c index b8a4f79020cf..433ee4ddce6c 100644 --- a/arch/arm/crypto/sha256_glue.c +++ b/arch/arm/crypto/sha256_glue.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c index 79820b9e2541..701706262ef3 100644 --- a/arch/arm/crypto/sha256_neon_glue.c +++ b/arch/arm/crypto/sha256_neon_glue.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha512-glue.c 
b/arch/arm/crypto/sha512-glue.c index 8775aa42bbbe..0635a65aa488 100644 --- a/arch/arm/crypto/sha512-glue.c +++ b/arch/arm/crypto/sha512-glue.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c index 96cb94403540..c879ad32db51 100644 --- a/arch/arm/crypto/sha512-neon-glue.c +++ b/arch/arm/crypto/sha512-neon-glue.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 395bbf64b2ab..34b8a89197be 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index c63b99211db3..c93121bcfdeb 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 5e956d7582a5..31ba3da5e61b 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index 77bc6e72abae..9462f6088b3f 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index dc890a719f54..faa83f6cf376 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index 370ccb29602f..2acff1c7df5d 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha1.c b/arch/mips/cavium-octeon/crypto/octeon-sha1.c index 75e79b47abfe..30f1d75208a5 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-sha1.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha1.c @@ -14,7 +14,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c index a682ce76716a..36cb92895d72 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-sha256.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha256.c @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha512.c b/arch/mips/cavium-octeon/crypto/octeon-sha512.c index 50722a0cfb53..359f039820d8 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-sha512.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha512.c @@ -14,7 +14,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c index cb57be4ada61..b1e577cbf00c 100644 --- a/arch/powerpc/crypto/sha1-spe-glue.c +++ b/arch/powerpc/crypto/sha1-spe-glue.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git 
a/arch/powerpc/crypto/sha1.c b/arch/powerpc/crypto/sha1.c index b40dc50a6908..7a55d790cdb1 100644 --- a/arch/powerpc/crypto/sha1.c +++ b/arch/powerpc/crypto/sha1.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include void powerpc_sha_transform(u32 *state, const u8 *src); diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c index ceb0b6c980b3..88530ae0791f 100644 --- a/arch/powerpc/crypto/sha256-spe-glue.c +++ b/arch/powerpc/crypto/sha256-spe-glue.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h index ada2f98c27b7..65ea12fc87a1 100644 --- a/arch/s390/crypto/sha.h +++ b/arch/s390/crypto/sha.h @@ -11,7 +11,8 @@ #define _CRYPTO_ARCH_S390_SHA_H #include -#include +#include +#include #include /* must be big enough for the largest SHA variant */ diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index 698b1e6d3c14..a3fabf310a38 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include "sha.h" diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c index b52c87e44939..24983f175676 100644 --- a/arch/s390/crypto/sha256_s390.c +++ b/arch/s390/crypto/sha256_s390.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include "sha.h" diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c index 460cbbbaa44a..30ac49b635bf 100644 --- a/arch/s390/crypto/sha3_256_s390.c +++ b/arch/s390/crypto/sha3_256_s390.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c index 72cf460a53e5..e70d50f7620f 100644 --- a/arch/s390/crypto/sha3_512_s390.c +++ b/arch/s390/crypto/sha3_512_s390.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index ad29db085a18..29a6bd404c59 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -8,7 +8,7 @@ * Author(s): Jan Glauber (jang@de.ibm.com) */ #include -#include +#include #include #include #include diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c index 0a423bcf6746..030efda05dbe 100644 --- a/arch/s390/purgatory/purgatory.c +++ b/arch/s390/purgatory/purgatory.c @@ -9,7 +9,7 @@ #include #include -#include +#include #include int verify_sha256_digest(void) diff --git a/arch/sparc/crypto/sha1_glue.c b/arch/sparc/crypto/sha1_glue.c index dc017782be52..86a654cce5ab 100644 --- a/arch/sparc/crypto/sha1_glue.c +++ b/arch/sparc/crypto/sha1_glue.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sparc/crypto/sha256_glue.c b/arch/sparc/crypto/sha256_glue.c index ca2547df9652..60ec524cf9ca 100644 --- a/arch/sparc/crypto/sha256_glue.c +++ b/arch/sparc/crypto/sha256_glue.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sparc/crypto/sha512_glue.c b/arch/sparc/crypto/sha512_glue.c index 3b2ca732ff7a..273ce21918c1 100644 --- a/arch/sparc/crypto/sha512_glue.c +++ b/arch/sparc/crypto/sha512_glue.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 18200135603f..44340a1139e0 
100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index dd06249229e1..3a5f6be7dbba 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index b0b05c93409e..30e70f4fe2f7 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 7b37a412f829..f03b64d9cb51 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include "../boot/string.h" diff --git a/crypto/asymmetric_keys/asym_tpm.c b/crypto/asymmetric_keys/asym_tpm.c index 378b18b9bc34..511932aa94a6 100644 --- a/crypto/asymmetric_keys/asym_tpm.c +++ b/crypto/asymmetric_keys/asym_tpm.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/crypto/sha1_generic.c b/crypto/sha1_generic.c index 1d43472fecbd..325b57fe28dc 100644 --- a/crypto/sha1_generic.c +++ b/crypto/sha1_generic.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/crypto/sha256_generic.c b/crypto/sha256_generic.c index 88156e3e2a33..3b377197236e 100644 --- a/crypto/sha256_generic.c +++ b/crypto/sha256_generic.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c index e34d09dd9971..c72d72ad828e 100644 --- a/crypto/sha512_generic.c +++ b/crypto/sha512_generic.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/char/random.c b/drivers/char/random.c index 2a41b21623ae..5f3b8ac9d97b 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -336,7 +336,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/crypto/allwinner/sun4i-ss/sun4i-ss.h b/drivers/crypto/allwinner/sun4i-ss/sun4i-ss.h index 163962f9e284..5c291e4a6857 100644 --- a/drivers/crypto/allwinner/sun4i-ss/sun4i-ss.h +++ b/drivers/crypto/allwinner/sun4i-ss/sun4i-ss.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c index fa2f1b4fad7b..4927a6c82d32 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include #include "sun8i-ce.h" diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h index 558027516aed..cec781d5063c 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include /* CE Registers */ #define CE_TDQ 0x00 diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c index b6ab2054f217..11cbcbc83a7b 100644 
--- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include #include "sun8i-ss.h" diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss.h b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss.h index 1a66457f4a20..28188685b910 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss.h +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss.h @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #define SS_START 1 diff --git a/drivers/crypto/amcc/crypto4xx_alg.c b/drivers/crypto/amcc/crypto4xx_alg.c index 7729a637fb02..a3fa849b139a 100644 --- a/drivers/crypto/amcc/crypto4xx_alg.c +++ b/drivers/crypto/amcc/crypto4xx_alg.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include "crypto4xx_reg_def.h" diff --git a/drivers/crypto/amcc/crypto4xx_core.c b/drivers/crypto/amcc/crypto4xx_core.c index 2e3690f65786..8d1b918a0533 100644 --- a/drivers/crypto/amcc/crypto4xx_core.c +++ b/drivers/crypto/amcc/crypto4xx_core.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/crypto/atmel-authenc.h b/drivers/crypto/atmel-authenc.h index c6530a1c8c20..45171e89a7d2 100644 --- a/drivers/crypto/atmel-authenc.h +++ b/drivers/crypto/atmel-authenc.h @@ -16,7 +16,8 @@ #include #include -#include +#include +#include #include "atmel-sha-regs.h" struct atmel_aes_dev; diff --git a/drivers/crypto/atmel-sha.c b/drivers/crypto/atmel-sha.c index 0eb6f54e3b66..352d80cb5ae9 100644 --- a/drivers/crypto/atmel-sha.c +++ b/drivers/crypto/atmel-sha.c @@ -33,7 +33,8 @@ #include #include #include -#include +#include +#include #include #include #include "atmel-sha-regs.h" diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c index 809c3033ca74..9ad188cffd0d 100644 --- a/drivers/crypto/axis/artpec6_crypto.c +++ b/drivers/crypto/axis/artpec6_crypto.c @@ -28,7 +28,8 @@ #include #include #include -#include +#include +#include #include /* Max length of a line in all cache levels for Artpec SoCs. 
*/ diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index 50d169e61b41..30390a7324b2 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -26,11 +26,12 @@ #include #include #include -#include #include #include #include #include +#include +#include #include #include "util.h" diff --git a/drivers/crypto/bcm/cipher.h b/drivers/crypto/bcm/cipher.h index 035c8389cb3d..0ad5892b445d 100644 --- a/drivers/crypto/bcm/cipher.h +++ b/drivers/crypto/bcm/cipher.h @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include #include #include "spu.h" diff --git a/drivers/crypto/bcm/spu.h b/drivers/crypto/bcm/spu.h index dd132389bcaa..1c386a2d5506 100644 --- a/drivers/crypto/bcm/spu.h +++ b/drivers/crypto/bcm/spu.h @@ -17,7 +17,8 @@ #include #include -#include +#include +#include enum spu_cipher_alg { CIPHER_ALG_NONE = 0x0, diff --git a/drivers/crypto/caam/compat.h b/drivers/crypto/caam/compat.h index c3c22a8de4c0..c4f79764172b 100644 --- a/drivers/crypto/caam/compat.h +++ b/drivers/crypto/caam/compat.h @@ -34,7 +34,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/cavium/nitrox/nitrox_aead.c b/drivers/crypto/cavium/nitrox/nitrox_aead.c index e5d8607ecb1d..c93c4e41d267 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_aead.c +++ b/drivers/crypto/cavium/nitrox/nitrox_aead.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/crypto/ccp/ccp-crypto-sha.c b/drivers/crypto/ccp/ccp-crypto-sha.c index 8fbfdb9e8cd3..74fa5360e722 100644 --- a/drivers/crypto/ccp/ccp-crypto-sha.c +++ b/drivers/crypto/ccp/ccp-crypto-sha.c @@ -17,7 +17,8 @@ #include #include #include -#include +#include +#include #include #include diff --git a/drivers/crypto/ccp/ccp-crypto.h b/drivers/crypto/ccp/ccp-crypto.h index aed3d2192d01..e42450d07168 100644 --- a/drivers/crypto/ccp/ccp-crypto.h +++ b/drivers/crypto/ccp/ccp-crypto.h @@ -19,7 +19,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/ccree/cc_driver.h b/drivers/crypto/ccree/cc_driver.h index af77b2020350..ed2b2f13a256 100644 --- a/drivers/crypto/ccree/cc_driver.h +++ b/drivers/crypto/ccree/cc_driver.h @@ -17,7 +17,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 13b908ea4873..f5a336634daa 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -53,7 +53,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index 891e04914615..2eaa516b3231 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -7,7 +7,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c index 91f555ccbb31..e813115d5432 100644 --- a/drivers/crypto/img-hash.c +++ b/drivers/crypto/img-hash.c @@ -19,7 +19,8 @@ #include #include -#include +#include +#include #define CR_RESET 0 #define CR_RESET_SET 1 diff --git a/drivers/crypto/inside-secure/safexcel.h b/drivers/crypto/inside-secure/safexcel.h index 9045f2d7f4c6..ce1e611a163e 100644 --- a/drivers/crypto/inside-secure/safexcel.h +++ b/drivers/crypto/inside-secure/safexcel.h @@ -11,7 +11,8 
@@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c index 9bcfb79a030f..d68ef16650d4 100644 --- a/drivers/crypto/inside-secure/safexcel_cipher.c +++ b/drivers/crypto/inside-secure/safexcel_cipher.c @@ -18,7 +18,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c index 56d5ccb5cc00..50fb6d90a2e0 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -8,7 +8,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index 276012e7c482..8b0f17fc09fb 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/crypto/marvell/cesa/hash.c b/drivers/crypto/marvell/cesa/hash.c index add7ea011c98..8cf9fd518d86 100644 --- a/drivers/crypto/marvell/cesa/hash.c +++ b/drivers/crypto/marvell/cesa/hash.c @@ -11,7 +11,8 @@ #include #include -#include +#include +#include #include #include diff --git a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c index 90bb31329d4b..ccbef01888d4 100644 --- a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c +++ b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/mediatek/mtk-sha.c b/drivers/crypto/mediatek/mtk-sha.c index 3d5d7d68b03b..f55aacdafbef 100644 --- a/drivers/crypto/mediatek/mtk-sha.c +++ b/drivers/crypto/mediatek/mtk-sha.c @@ -10,7 +10,8 @@ */ #include -#include +#include +#include #include "mtk-platform.h" #define SHA_ALIGN_MSK (sizeof(u32) - 1) diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c index 909a7eb748e3..d6a7784d2988 100644 --- a/drivers/crypto/mxs-dcp.c +++ b/drivers/crypto/mxs-dcp.c @@ -17,7 +17,8 @@ #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index 3642bf83d809..3b0bf6fea491 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/nx/nx-sha256.c b/drivers/crypto/nx/nx-sha256.c index 02fb53453195..90d9a37a57f6 100644 --- a/drivers/crypto/nx/nx-sha256.c +++ b/drivers/crypto/nx/nx-sha256.c @@ -8,7 +8,7 @@ */ #include -#include +#include #include #include #include diff --git a/drivers/crypto/nx/nx-sha512.c b/drivers/crypto/nx/nx-sha512.c index 4c7a3e3eeebf..eb8627a0f317 100644 --- a/drivers/crypto/nx/nx-sha512.c +++ b/drivers/crypto/nx/nx-sha512.c @@ -8,7 +8,7 @@ */ #include -#include +#include #include #include diff --git a/drivers/crypto/nx/nx.c b/drivers/crypto/nx/nx.c index 40882d6d52c1..0d2dc5be7f19 100644 --- a/drivers/crypto/nx/nx.c +++ b/drivers/crypto/nx/nx.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c index a3b38d2c92e7..ae0d320d3c60 100644 --- a/drivers/crypto/omap-sham.c +++ b/drivers/crypto/omap-sham.c @@ -35,7 +35,8 @@ 
#include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c index a697a4a3f2d0..6865c7f1fc1a 100644 --- a/drivers/crypto/padlock-sha.c +++ b/drivers/crypto/padlock-sha.c @@ -9,7 +9,8 @@ #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c index fb34bf92861d..84f9c16d984c 100644 --- a/drivers/crypto/picoxcell_crypto.c +++ b/drivers/crypto/picoxcell_crypto.c @@ -8,7 +8,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index 0fab8bb8ca59..b3a68d986417 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -6,7 +6,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/qce/common.c b/drivers/crypto/qce/common.c index 5006e74c40cd..a73db2a5637f 100644 --- a/drivers/crypto/qce/common.c +++ b/drivers/crypto/qce/common.c @@ -7,7 +7,8 @@ #include #include #include -#include +#include +#include #include "cipher.h" #include "common.h" diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c index ea616b7259ae..5e6717f9bbda 100644 --- a/drivers/crypto/qce/core.c +++ b/drivers/crypto/qce/core.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "core.h" #include "cipher.h" diff --git a/drivers/crypto/qce/sha.h b/drivers/crypto/qce/sha.h index d63526e3804d..a22695361f16 100644 --- a/drivers/crypto/qce/sha.h +++ b/drivers/crypto/qce/sha.h @@ -7,7 +7,8 @@ #define _SHA_H_ #include -#include +#include +#include #include "common.h" #include "core.h" diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index 3db595570c9c..97278c2574ff 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -12,7 +12,8 @@ #include #include -#include +#include +#include #define _SBF(v, f) ((v) << (f)) diff --git a/drivers/crypto/s5p-sss.c b/drivers/crypto/s5p-sss.c index 88a6c853ffd7..682c8a450a57 100644 --- a/drivers/crypto/s5p-sss.c +++ b/drivers/crypto/s5p-sss.c @@ -30,7 +30,8 @@ #include #include -#include +#include +#include #include #define _SBF(s, v) ((v) << (s)) diff --git a/drivers/crypto/sa2ul.c b/drivers/crypto/sa2ul.c index c357010a159e..f300b0a5958a 100644 --- a/drivers/crypto/sa2ul.c +++ b/drivers/crypto/sa2ul.c @@ -25,7 +25,8 @@ #include #include #include -#include +#include +#include #include "sa2ul.h" diff --git a/drivers/crypto/sa2ul.h b/drivers/crypto/sa2ul.h index bb40df3876e5..f597ddecde34 100644 --- a/drivers/crypto/sa2ul.h +++ b/drivers/crypto/sa2ul.h @@ -13,7 +13,8 @@ #define _K3_SA2UL_ #include -#include +#include +#include #define SA_ENGINE_ENABLE_CONTROL 0x1000 diff --git a/drivers/crypto/sahara.c b/drivers/crypto/sahara.c index d60679c79822..8b5be29cb4dc 100644 --- a/drivers/crypto/sahara.c +++ b/drivers/crypto/sahara.c @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #include #include diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c index e3e25278a970..7ac0573ef663 100644 --- a/drivers/crypto/stm32/stm32-hash.c +++ b/drivers/crypto/stm32/stm32-hash.c @@ -25,7 +25,8 @@ #include #include #include -#include +#include +#include #include #define HASH_CR 0x00 diff --git a/drivers/crypto/talitos.c 
b/drivers/crypto/talitos.c index a713a35dc502..4fd85f31630a 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -31,7 +31,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/ux500/hash/hash_core.c b/drivers/crypto/ux500/hash/hash_core.c index 3d407eebb2ba..da284b0ea1b2 100644 --- a/drivers/crypto/ux500/hash/hash_core.c +++ b/drivers/crypto/ux500/hash/hash_core.c @@ -31,7 +31,8 @@ #include #include -#include +#include +#include #include #include diff --git a/drivers/firmware/efi/embedded-firmware.c b/drivers/firmware/efi/embedded-firmware.c index 21ae0c48232a..f5be8e22305b 100644 --- a/drivers/firmware/efi/embedded-firmware.c +++ b/drivers/firmware/efi/embedded-firmware.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include /* Exported for use by lib/test_firmware.c only */ LIST_HEAD(efi_embedded_fw_list); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c index 072299b14b8d..47d9268a7e3c 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c @@ -51,7 +51,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h index 2d3dfdd2a716..65617752c630 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h @@ -9,7 +9,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/nfc/s3fwrn5/firmware.c b/drivers/nfc/s3fwrn5/firmware.c index ec930ee2c847..5d5ad8307211 100644 --- a/drivers/nfc/s3fwrn5/firmware.c +++ b/drivers/nfc/s3fwrn5/firmware.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include "s3fwrn5.h" #include "firmware.h" diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index f53bf336c0a2..d70d4be91096 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "tee_private.h" #define TEE_NUM_DEVICES 32 diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 1fbe6c24d705..cf06ea3870eb 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include "fscrypt_private.h" diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 0cba7928446d..e0ec21055505 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include "fscrypt_private.h" diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index b93b3cd10bfd..0886d835f597 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index e96d99d5145e..6a8f2e3cce6c 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -14,7 +14,7 @@ #define pr_fmt(fmt) "fs-verity: " fmt -#include +#include #include #include diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h index eb9d2e368969..dd4f06785049 100644 --- a/include/crypto/hash_info.h +++ b/include/crypto/hash_info.h @@ -8,7 +8,8 @@ #ifndef _CRYPTO_HASH_INFO_H #define _CRYPTO_HASH_INFO_H -#include +#include +#include #include #include 
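Before the full header listings below, a minimal usage sketch of the stand-alone SHA-256 interface that this patch moves verbatim from <crypto/sha.h> into <crypto/sha2.h> (function names as declared in the header; the wrapper function itself is illustrative only):

#include <crypto/sha2.h>

static void demo_sha256(const u8 *msg, unsigned int len,
			u8 digest[SHA256_DIGEST_SIZE])
{
	struct sha256_state sctx;

	sha256_init(&sctx);		/* seed state with SHA256_H0..SHA256_H7 */
	sha256_update(&sctx, msg, len);	/* may be called repeatedly to stream data */
	sha256_final(&sctx, digest);	/* pad, process final block, emit 32 bytes */
}

The one-shot sha256(data, len, out) helper declared alongside it covers the common single-buffer case.
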
diff --git a/include/crypto/sha.h b/include/crypto/sha.h deleted file mode 100644 index 4ff3da816630..000000000000 --- a/include/crypto/sha.h +++ /dev/null @@ -1,167 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Common values for SHA algorithms - */ - -#ifndef _CRYPTO_SHA_H -#define _CRYPTO_SHA_H - -#include - -#define SHA1_DIGEST_SIZE 20 -#define SHA1_BLOCK_SIZE 64 - -#define SHA224_DIGEST_SIZE 28 -#define SHA224_BLOCK_SIZE 64 - -#define SHA256_DIGEST_SIZE 32 -#define SHA256_BLOCK_SIZE 64 - -#define SHA384_DIGEST_SIZE 48 -#define SHA384_BLOCK_SIZE 128 - -#define SHA512_DIGEST_SIZE 64 -#define SHA512_BLOCK_SIZE 128 - -#define SHA1_H0 0x67452301UL -#define SHA1_H1 0xefcdab89UL -#define SHA1_H2 0x98badcfeUL -#define SHA1_H3 0x10325476UL -#define SHA1_H4 0xc3d2e1f0UL - -#define SHA224_H0 0xc1059ed8UL -#define SHA224_H1 0x367cd507UL -#define SHA224_H2 0x3070dd17UL -#define SHA224_H3 0xf70e5939UL -#define SHA224_H4 0xffc00b31UL -#define SHA224_H5 0x68581511UL -#define SHA224_H6 0x64f98fa7UL -#define SHA224_H7 0xbefa4fa4UL - -#define SHA256_H0 0x6a09e667UL -#define SHA256_H1 0xbb67ae85UL -#define SHA256_H2 0x3c6ef372UL -#define SHA256_H3 0xa54ff53aUL -#define SHA256_H4 0x510e527fUL -#define SHA256_H5 0x9b05688cUL -#define SHA256_H6 0x1f83d9abUL -#define SHA256_H7 0x5be0cd19UL - -#define SHA384_H0 0xcbbb9d5dc1059ed8ULL -#define SHA384_H1 0x629a292a367cd507ULL -#define SHA384_H2 0x9159015a3070dd17ULL -#define SHA384_H3 0x152fecd8f70e5939ULL -#define SHA384_H4 0x67332667ffc00b31ULL -#define SHA384_H5 0x8eb44a8768581511ULL -#define SHA384_H6 0xdb0c2e0d64f98fa7ULL -#define SHA384_H7 0x47b5481dbefa4fa4ULL - -#define SHA512_H0 0x6a09e667f3bcc908ULL -#define SHA512_H1 0xbb67ae8584caa73bULL -#define SHA512_H2 0x3c6ef372fe94f82bULL -#define SHA512_H3 0xa54ff53a5f1d36f1ULL -#define SHA512_H4 0x510e527fade682d1ULL -#define SHA512_H5 0x9b05688c2b3e6c1fULL -#define SHA512_H6 0x1f83d9abfb41bd6bULL -#define SHA512_H7 0x5be0cd19137e2179ULL - -extern const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE]; - -extern const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE]; - -extern const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE]; - -extern const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE]; - -extern const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE]; - -struct sha1_state { - u32 state[SHA1_DIGEST_SIZE / 4]; - u64 count; - u8 buffer[SHA1_BLOCK_SIZE]; -}; - -struct sha256_state { - u32 state[SHA256_DIGEST_SIZE / 4]; - u64 count; - u8 buf[SHA256_BLOCK_SIZE]; -}; - -struct sha512_state { - u64 state[SHA512_DIGEST_SIZE / 8]; - u64 count[2]; - u8 buf[SHA512_BLOCK_SIZE]; -}; - -struct shash_desc; - -extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash); - -extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash); - -extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash); - -/* - * An implementation of SHA-1's compression function. Don't use in new code! - * You shouldn't be using SHA-1, and even if you *have* to use SHA-1, this isn't - * the correct way to hash something with SHA-1 (use crypto_shash instead). 
- */ -#define SHA1_DIGEST_WORDS (SHA1_DIGEST_SIZE / 4) -#define SHA1_WORKSPACE_WORDS 16 -void sha1_init(__u32 *buf); -void sha1_transform(__u32 *digest, const char *data, __u32 *W); - -/* - * Stand-alone implementation of the SHA256 algorithm. It is designed to - * have as little dependencies as possible so it can be used in the - * kexec_file purgatory. In other cases you should generally use the - * hash APIs from include/crypto/hash.h. Especially when hashing large - * amounts of data as those APIs may be hw-accelerated. - * - * For details see lib/crypto/sha256.c - */ - -static inline void sha256_init(struct sha256_state *sctx) -{ - sctx->state[0] = SHA256_H0; - sctx->state[1] = SHA256_H1; - sctx->state[2] = SHA256_H2; - sctx->state[3] = SHA256_H3; - sctx->state[4] = SHA256_H4; - sctx->state[5] = SHA256_H5; - sctx->state[6] = SHA256_H6; - sctx->state[7] = SHA256_H7; - sctx->count = 0; -} -void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len); -void sha256_final(struct sha256_state *sctx, u8 *out); -void sha256(const u8 *data, unsigned int len, u8 *out); - -static inline void sha224_init(struct sha256_state *sctx) -{ - sctx->state[0] = SHA224_H0; - sctx->state[1] = SHA224_H1; - sctx->state[2] = SHA224_H2; - sctx->state[3] = SHA224_H3; - sctx->state[4] = SHA224_H4; - sctx->state[5] = SHA224_H5; - sctx->state[6] = SHA224_H6; - sctx->state[7] = SHA224_H7; - sctx->count = 0; -} -void sha224_update(struct sha256_state *sctx, const u8 *data, unsigned int len); -void sha224_final(struct sha256_state *sctx, u8 *out); - -#endif diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h new file mode 100644 index 000000000000..044ecea60ac8 --- /dev/null +++ b/include/crypto/sha1.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common values for SHA-1 algorithms + */ + +#ifndef _CRYPTO_SHA1_H +#define _CRYPTO_SHA1_H + +#include + +#define SHA1_DIGEST_SIZE 20 +#define SHA1_BLOCK_SIZE 64 + +#define SHA1_H0 0x67452301UL +#define SHA1_H1 0xefcdab89UL +#define SHA1_H2 0x98badcfeUL +#define SHA1_H3 0x10325476UL +#define SHA1_H4 0xc3d2e1f0UL + +extern const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE]; + +struct sha1_state { + u32 state[SHA1_DIGEST_SIZE / 4]; + u64 count; + u8 buffer[SHA1_BLOCK_SIZE]; +}; + +struct shash_desc; + +extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len); + +extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *hash); + +/* + * An implementation of SHA-1's compression function. Don't use in new code! + * You shouldn't be using SHA-1, and even if you *have* to use SHA-1, this isn't + * the correct way to hash something with SHA-1 (use crypto_shash instead). 
+ */ +#define SHA1_DIGEST_WORDS (SHA1_DIGEST_SIZE / 4) +#define SHA1_WORKSPACE_WORDS 16 +void sha1_init(__u32 *buf); +void sha1_transform(__u32 *digest, const char *data, __u32 *W); + +#endif /* _CRYPTO_SHA1_H */ diff --git a/include/crypto/sha1_base.h b/include/crypto/sha1_base.h index a5d6033efef7..2e0e7c3827d1 100644 --- a/include/crypto/sha1_base.h +++ b/include/crypto/sha1_base.h @@ -9,7 +9,7 @@ #define _CRYPTO_SHA1_BASE_H #include -#include +#include #include #include #include diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h new file mode 100644 index 000000000000..2838f529f31e --- /dev/null +++ b/include/crypto/sha2.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common values for SHA-2 algorithms + */ + +#ifndef _CRYPTO_SHA2_H +#define _CRYPTO_SHA2_H + +#include + +#define SHA224_DIGEST_SIZE 28 +#define SHA224_BLOCK_SIZE 64 + +#define SHA256_DIGEST_SIZE 32 +#define SHA256_BLOCK_SIZE 64 + +#define SHA384_DIGEST_SIZE 48 +#define SHA384_BLOCK_SIZE 128 + +#define SHA512_DIGEST_SIZE 64 +#define SHA512_BLOCK_SIZE 128 + +#define SHA224_H0 0xc1059ed8UL +#define SHA224_H1 0x367cd507UL +#define SHA224_H2 0x3070dd17UL +#define SHA224_H3 0xf70e5939UL +#define SHA224_H4 0xffc00b31UL +#define SHA224_H5 0x68581511UL +#define SHA224_H6 0x64f98fa7UL +#define SHA224_H7 0xbefa4fa4UL + +#define SHA256_H0 0x6a09e667UL +#define SHA256_H1 0xbb67ae85UL +#define SHA256_H2 0x3c6ef372UL +#define SHA256_H3 0xa54ff53aUL +#define SHA256_H4 0x510e527fUL +#define SHA256_H5 0x9b05688cUL +#define SHA256_H6 0x1f83d9abUL +#define SHA256_H7 0x5be0cd19UL + +#define SHA384_H0 0xcbbb9d5dc1059ed8ULL +#define SHA384_H1 0x629a292a367cd507ULL +#define SHA384_H2 0x9159015a3070dd17ULL +#define SHA384_H3 0x152fecd8f70e5939ULL +#define SHA384_H4 0x67332667ffc00b31ULL +#define SHA384_H5 0x8eb44a8768581511ULL +#define SHA384_H6 0xdb0c2e0d64f98fa7ULL +#define SHA384_H7 0x47b5481dbefa4fa4ULL + +#define SHA512_H0 0x6a09e667f3bcc908ULL +#define SHA512_H1 0xbb67ae8584caa73bULL +#define SHA512_H2 0x3c6ef372fe94f82bULL +#define SHA512_H3 0xa54ff53a5f1d36f1ULL +#define SHA512_H4 0x510e527fade682d1ULL +#define SHA512_H5 0x9b05688c2b3e6c1fULL +#define SHA512_H6 0x1f83d9abfb41bd6bULL +#define SHA512_H7 0x5be0cd19137e2179ULL + +extern const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE]; + +extern const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE]; + +extern const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE]; + +extern const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE]; + +struct sha256_state { + u32 state[SHA256_DIGEST_SIZE / 4]; + u64 count; + u8 buf[SHA256_BLOCK_SIZE]; +}; + +struct sha512_state { + u64 state[SHA512_DIGEST_SIZE / 8]; + u64 count[2]; + u8 buf[SHA512_BLOCK_SIZE]; +}; + +struct shash_desc; + +extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len); + +extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *hash); + +extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data, + unsigned int len); + +extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *hash); + +/* + * Stand-alone implementation of the SHA256 algorithm. It is designed to + * have as little dependencies as possible so it can be used in the + * kexec_file purgatory. In other cases you should generally use the + * hash APIs from include/crypto/hash.h. Especially when hashing large + * amounts of data as those APIs may be hw-accelerated. 
+ * + * For details see lib/crypto/sha256.c + */ + +static inline void sha256_init(struct sha256_state *sctx) +{ + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; +} +void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len); +void sha256_final(struct sha256_state *sctx, u8 *out); +void sha256(const u8 *data, unsigned int len, u8 *out); + +static inline void sha224_init(struct sha256_state *sctx) +{ + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; +} +void sha224_update(struct sha256_state *sctx, const u8 *data, unsigned int len); +void sha224_final(struct sha256_state *sctx, u8 *out); + +#endif /* _CRYPTO_SHA2_H */ diff --git a/include/crypto/sha256_base.h b/include/crypto/sha256_base.h index 93f9fd21cc06..76173c613058 100644 --- a/include/crypto/sha256_base.h +++ b/include/crypto/sha256_base.h @@ -9,7 +9,7 @@ #define _CRYPTO_SHA256_BASE_H #include -#include +#include #include #include #include diff --git a/include/crypto/sha512_base.h b/include/crypto/sha512_base.h index 93ab73baa38e..b370b3340b16 100644 --- a/include/crypto/sha512_base.h +++ b/include/crypto/sha512_base.h @@ -9,7 +9,7 @@ #define _CRYPTO_SHA512_BASE_H #include -#include +#include #include #include #include diff --git a/include/linux/ccp.h b/include/linux/ccp.h index a5dfbaf2470d..868924dec5a1 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include struct ccp_device; struct ccp_cmd; diff --git a/include/linux/filter.h b/include/linux/filter.h index 72d62cbc1578..6c00140538b9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include diff --git a/include/linux/purgatory.h b/include/linux/purgatory.h index b950e961cfa8..d7dc1559427f 100644 --- a/include/linux/purgatory.h +++ b/include/linux/purgatory.h @@ -3,7 +3,7 @@ #define _LINUX_PURGATORY_H #include -#include +#include #include struct kexec_sha_region { diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 106e4500fd53..4fcfe0b70c4e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,7 +11,7 @@ #include #include -#include +#include /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8798a8183974..4f8efc278aa7 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -42,7 +42,6 @@ #include #include -#include #include "kexec_internal.h" DEFINE_MUTEX(kexec_mutex); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index e21f6b9234f7..b02086d70492 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index cdef37c05972..72a4b0b1df28 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include static const u32 SHA256_K[] = { diff --git a/lib/digsig.c b/lib/digsig.c index e0627c3e53b2..04b5e55ed95f 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -20,7 +20,7 @@ #include 
#include #include -#include +#include #include #include #include diff --git a/lib/sha1.c b/lib/sha1.c index 49257a915bb6..9bd1935a1472 100644 --- a/lib/sha1.c +++ b/lib/sha1.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include /* diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 85dddfe3a2c6..687d95dce085 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 05d398d3fde4..b472dc149856 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -21,7 +21,7 @@ */ #include -#include +#include #include #include "protocol.h" diff --git a/net/mptcp/options.c b/net/mptcp/options.c index a044dd43411d..90cd52df99a6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -7,7 +7,7 @@ #define pr_fmt(fmt) "MPTCP: " fmt #include -#include +#include #include #include #include "protocol.h" diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ac4a1fe3550b..b229ae914d76 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index 413c803c5208..547425c20e11 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 192e531c146f..87432b35d771 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include "encrypted.h" diff --git a/security/keys/trusted-keys/trusted_tpm1.c b/security/keys/trusted-keys/trusted_tpm1.c index b9fe02e5f84f..74d82093cbaa 100644 --- a/security/keys/trusted-keys/trusted_tpm1.c +++ b/security/keys/trusted-keys/trusted_tpm1.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/sound/soc/codecs/cros_ec_codec.c b/sound/soc/codecs/cros_ec_codec.c index 28f039adfa13..58894bf47514 100644 --- a/sound/soc/codecs/cros_ec_codec.c +++ b/sound/soc/codecs/cros_ec_codec.c @@ -8,7 +8,7 @@ * EC for audio function. */ -#include +#include #include #include #include -- cgit From ded5ed04d85e299770dcb7e82c2127b8054a00c8 Mon Sep 17 00:00:00 2001 From: Souradeep Chowdhury Date: Wed, 30 Sep 2020 13:44:13 +0530 Subject: soc: qcom: llcc: Add configuration data for SM8150 Add LLCC configuration data for SM8150 SoC which controls LLCC behaviour. 
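For context, the table added below only describes the slices; a kernel client claims and enables one through the existing API in include/linux/soc/qcom/llcc-qcom.h (a sketch with error handling abbreviated; LLCC_NPU is one of the usecase IDs this patch introduces):

#include <linux/soc/qcom/llcc-qcom.h>

static struct llcc_slice_desc *npu_slice;

static int demo_claim_npu_slice(void)
{
	npu_slice = llcc_slice_getd(LLCC_NPU);	/* look up slice by usecase ID */
	if (IS_ERR(npu_slice))
		return PTR_ERR(npu_slice);

	return llcc_slice_activate(npu_slice);	/* enable the slice in hardware */
}
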
Signed-off-by: Souradeep Chowdhury Link: https://lore.kernel.org/r/957e3ae50c75720ef6227529d5ce3d4b457802e9.1601452132.git.schowdhu@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/llcc-qcom.c | 30 ++++++++++++++++++++++++++++++ include/linux/soc/qcom/llcc-qcom.h | 6 ++++++ 2 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c index 96c20e673436..16b421608e9c 100644 --- a/drivers/soc/qcom/llcc-qcom.c +++ b/drivers/soc/qcom/llcc-qcom.c @@ -123,6 +123,30 @@ static const struct llcc_slice_config sdm845_data[] = { { LLCC_AUDHW, 22, 1024, 1, 1, 0xffc, 0x2, 0, 0, 1, 1, 0 }, }; +static const struct llcc_slice_config sm8150_data[] = { + { LLCC_CPUSS, 1, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 1 }, + { LLCC_VIDSC0, 2, 512, 2, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_VIDSC1, 3, 512, 2, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_AUDIO, 6, 1024, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MDMHPGRW, 7, 3072, 1, 0, 0xFF, 0xF00, 0, 0, 0, 1, 0 }, + { LLCC_MDM, 8, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MODHW, 9, 1024, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_CMPT, 10, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_GPUHTW, 11, 512, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_GPU, 12, 2560, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MMUHWT, 13, 1024, 1, 1, 0xFFF, 0x0, 0, 0, 0, 0, 1 }, + { LLCC_CMPTDMA, 15, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_DISP, 16, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MDMHPFX, 20, 1024, 2, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MDMPNG, 21, 1024, 0, 1, 0xF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_AUDHW, 22, 1024, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_NPU, 23, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_WLHW, 24, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MODPE, 29, 256, 1, 1, 0xF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_APTCM, 30, 256, 3, 1, 0x0, 0x1, 1, 0, 0, 1, 0 }, + { LLCC_WRCACHE, 31, 128, 1, 1, 0xFFF, 0x0, 0, 0, 0, 0, 0 }, +}; + static const struct qcom_llcc_config sc7180_cfg = { .sct_data = sc7180_data, .size = ARRAY_SIZE(sc7180_data), @@ -135,6 +159,11 @@ static const struct qcom_llcc_config sdm845_cfg = { .need_llcc_cfg = false, }; +static const struct qcom_llcc_config sm8150_cfg = { + .sct_data = sm8150_data, + .size = ARRAY_SIZE(sm8150_data), +}; + static struct llcc_drv_data *drv_data = (void *) -EPROBE_DEFER; /** @@ -529,6 +558,7 @@ err: static const struct of_device_id qcom_llcc_of_match[] = { { .compatible = "qcom,sc7180-llcc", .data = &sc7180_cfg }, { .compatible = "qcom,sdm845-llcc", .data = &sdm845_cfg }, + { .compatible = "qcom,sm8150-llcc", .data = &sm8150_cfg }, { } }; diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 90b864655822..3db6797ba6ff 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -16,6 +16,7 @@ #define LLCC_AUDIO 6 #define LLCC_MDMHPGRW 7 #define LLCC_MDM 8 +#define LLCC_MODHW 9 #define LLCC_CMPT 10 #define LLCC_GPUHTW 11 #define LLCC_GPU 12 @@ -26,6 +27,11 @@ #define LLCC_MDMHPFX 20 #define LLCC_MDMPNG 21 #define LLCC_AUDHW 22 +#define LLCC_NPU 23 +#define LLCC_WLHW 24 +#define LLCC_MODPE 29 +#define LLCC_APTCM 30 +#define LLCC_WRCACHE 31 /** * llcc_slice_desc - Cache slice descriptor -- cgit From 69d98969a0540039fc04e0f22bbe9f41b0a13d66 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 10 Nov 2020 11:18:46 +0100 Subject: can: rename get_can_dlc() macro with can_cc_dlc2len() The get_can_dlc() macro is used to ensure the
payload length information of the Classical CAN frame to be max 8 bytes (the CAN_MAX_DLEN). Rename the macro and use the correct constant in preparation of the len/dlc cleanup for Classical CAN frames. Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20201110101852.1973-3-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- drivers/net/can/at91_can.c | 2 +- drivers/net/can/c_can/c_can.c | 2 +- drivers/net/can/cc770/cc770.c | 2 +- drivers/net/can/flexcan.c | 2 +- drivers/net/can/grcan.c | 2 +- drivers/net/can/ifi_canfd/ifi_canfd.c | 2 +- drivers/net/can/janz-ican3.c | 4 ++-- drivers/net/can/m_can/m_can.c | 2 +- drivers/net/can/mscan/mscan.c | 2 +- drivers/net/can/pch_can.c | 4 ++-- drivers/net/can/peak_canfd/peak_canfd.c | 2 +- drivers/net/can/rcar/rcar_can.c | 2 +- drivers/net/can/rcar/rcar_canfd.c | 4 ++-- drivers/net/can/sja1000/sja1000.c | 2 +- drivers/net/can/softing/softing_main.c | 2 +- drivers/net/can/spi/hi311x.c | 2 +- drivers/net/can/spi/mcp251x.c | 4 ++-- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 2 +- drivers/net/can/sun4i_can.c | 2 +- drivers/net/can/ti_hecc.c | 2 +- drivers/net/can/usb/ems_usb.c | 2 +- drivers/net/can/usb/esd_usb2.c | 2 +- drivers/net/can/usb/gs_usb.c | 2 +- drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c | 4 ++-- drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c | 4 ++-- drivers/net/can/usb/mcba_usb.c | 2 +- drivers/net/can/usb/peak_usb/pcan_usb.c | 2 +- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 2 +- drivers/net/can/usb/ucan.c | 8 ++++---- drivers/net/can/usb/usb_8dev.c | 2 +- drivers/net/can/xilinx_can.c | 4 ++-- include/linux/can/dev.h | 8 ++++---- 32 files changed, 45 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index c14de95d2ca7..db06254f8eb7 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -580,7 +580,7 @@ static void at91_read_mb(struct net_device *dev, unsigned int mb, cf->can_id = (reg_mid >> 18) & CAN_SFF_MASK; reg_msr = at91_read(priv, AT91_MSR(mb)); - cf->can_dlc = get_can_dlc((reg_msr >> 16) & 0xf); + cf->can_dlc = can_cc_dlc2len((reg_msr >> 16) & 0xf); if (reg_msr & AT91_MSR_MRTR) cf->can_id |= CAN_RTR_FLAG; diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c index 1ccdbe89585b..56cc705959ea 100644 --- a/drivers/net/can/c_can/c_can.c +++ b/drivers/net/can/c_can/c_can.c @@ -397,7 +397,7 @@ static int c_can_read_msg_object(struct net_device *dev, int iface, u32 ctrl) return -ENOMEM; } - frame->can_dlc = get_can_dlc(ctrl & 0x0F); + frame->can_dlc = can_cc_dlc2len(ctrl & 0x0F); arb = priv->read_reg32(priv, C_CAN_IFACE(ARB1_REG, iface)); diff --git a/drivers/net/can/cc770/cc770.c b/drivers/net/can/cc770/cc770.c index 07e2b8df5153..3fd2a276dd93 100644 --- a/drivers/net/can/cc770/cc770.c +++ b/drivers/net/can/cc770/cc770.c @@ -486,7 +486,7 @@ static void cc770_rx(struct net_device *dev, unsigned int mo, u8 ctrl1) } cf->can_id = id; - cf->can_dlc = get_can_dlc((config & 0xf0) >> 4); + cf->can_dlc = can_cc_dlc2len((config & 0xf0) >> 4); for (i = 0; i < cf->can_dlc; i++) cf->data[i] = cc770_read_reg(priv, msgobj[mo].data[i]); } diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 99e5f272205d..50ccd18170c8 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -1005,7 +1005,7 @@ static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload, if (reg_ctrl & FLEXCAN_MB_CNT_BRS) cfd->flags |= CANFD_BRS; } else { - cfd->len = 
get_can_dlc((reg_ctrl >> 16) & 0xf); + cfd->len = can_cc_dlc2len((reg_ctrl >> 16) & 0xf); if (reg_ctrl & FLEXCAN_MB_CNT_RTR) cfd->can_id |= CAN_RTR_FLAG; diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c index 39802f107eb1..c71c9b8683d5 100644 --- a/drivers/net/can/grcan.c +++ b/drivers/net/can/grcan.c @@ -1201,7 +1201,7 @@ static int grcan_receive(struct net_device *dev, int budget) cf->can_id = ((slot[0] & GRCAN_MSG_BID) >> GRCAN_MSG_BID_BIT); } - cf->can_dlc = get_can_dlc((slot[1] & GRCAN_MSG_DLC) + cf->can_dlc = can_cc_dlc2len((slot[1] & GRCAN_MSG_DLC) >> GRCAN_MSG_DLC_BIT); if (rtr) { cf->can_id |= CAN_RTR_FLAG; diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c index 74503cacf594..cc790354a8ee 100644 --- a/drivers/net/can/ifi_canfd/ifi_canfd.c +++ b/drivers/net/can/ifi_canfd/ifi_canfd.c @@ -273,7 +273,7 @@ static void ifi_canfd_read_fifo(struct net_device *ndev) if (rxdlc & IFI_CANFD_RXFIFO_DLC_EDL) cf->len = can_dlc2len(dlc); else - cf->len = get_can_dlc(dlc); + cf->len = can_cc_dlc2len(dlc); rxid = readl(priv->base + IFI_CANFD_RXFIFO_ID); id = (rxid >> IFI_CANFD_RXFIFO_ID_ID_OFFSET); diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c index f929db893957..6a21af05ba27 100644 --- a/drivers/net/can/janz-ican3.c +++ b/drivers/net/can/janz-ican3.c @@ -916,10 +916,10 @@ static void ican3_to_can_frame(struct ican3_dev *mod, cf->can_id |= desc->data[0] << 3; cf->can_id |= (desc->data[1] & 0xe0) >> 5; - cf->can_dlc = get_can_dlc(desc->data[1] & ICAN3_CAN_DLC_MASK); + cf->can_dlc = can_cc_dlc2len(desc->data[1] & ICAN3_CAN_DLC_MASK); memcpy(cf->data, &desc->data[2], cf->can_dlc); } else { - cf->can_dlc = get_can_dlc(desc->data[0] & ICAN3_CAN_DLC_MASK); + cf->can_dlc = can_cc_dlc2len(desc->data[0] & ICAN3_CAN_DLC_MASK); if (desc->data[0] & ICAN3_EFF_RTR) cf->can_id |= CAN_RTR_FLAG; diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index f3fc37e96b08..2e46a5916656 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -459,7 +459,7 @@ static void m_can_read_fifo(struct net_device *dev, u32 rxfs) if (dlc & RX_BUF_FDF) cf->len = can_dlc2len((dlc >> 16) & 0x0F); else - cf->len = get_can_dlc((dlc >> 16) & 0x0F); + cf->len = can_cc_dlc2len((dlc >> 16) & 0x0F); id = m_can_fifo_read(cdev, fgi, M_CAN_FIFO_ID); if (id & RX_BUF_XTD) diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index 640ba1b356ec..95bf7338b358 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -312,7 +312,7 @@ static void mscan_get_rx_frame(struct net_device *dev, struct can_frame *frame) if (can_id & 1) frame->can_id |= CAN_RTR_FLAG; - frame->can_dlc = get_can_dlc(in_8(®s->rx.dlr) & 0xf); + frame->can_dlc = can_cc_dlc2len(in_8(®s->rx.dlr) & 0xf); if (!(frame->can_id & CAN_RTR_FLAG)) { void __iomem *data = ®s->rx.dsr1_0; diff --git a/drivers/net/can/pch_can.c b/drivers/net/can/pch_can.c index 5c180d2f3c3c..9509bac8352d 100644 --- a/drivers/net/can/pch_can.c +++ b/drivers/net/can/pch_can.c @@ -683,7 +683,7 @@ static int pch_can_rx_normal(struct net_device *ndev, u32 obj_num, int quota) if (id2 & PCH_ID2_DIR) cf->can_id |= CAN_RTR_FLAG; - cf->can_dlc = get_can_dlc((ioread32(&priv->regs-> + cf->can_dlc = can_cc_dlc2len((ioread32(&priv->regs-> ifregs[0].mcont)) & 0xF); for (i = 0; i < cf->can_dlc; i += 2) { @@ -715,7 +715,7 @@ static void pch_can_tx_complete(struct net_device *ndev, u32 int_stat) iowrite32(PCH_CMASK_RX_TX_GET | PCH_CMASK_CLRINTPND, 
&priv->regs->ifregs[1].cmask); pch_can_rw_msg_obj(&priv->regs->ifregs[1].creq, int_stat); - dlc = get_can_dlc(ioread32(&priv->regs->ifregs[1].mcont) & + dlc = can_cc_dlc2len(ioread32(&priv->regs->ifregs[1].mcont) & PCH_IF_MCONT_DLC); stats->tx_bytes += dlc; stats->tx_packets++; diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index 40c33b8a5fda..9ea2adea3f0f 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -259,7 +259,7 @@ static int pucan_handle_can_rx(struct peak_canfd_priv *priv, if (rx_msg_flags & PUCAN_MSG_EXT_DATA_LEN) cf_len = can_dlc2len(get_canfd_dlc(pucan_msg_get_dlc(msg))); else - cf_len = get_can_dlc(pucan_msg_get_dlc(msg)); + cf_len = can_cc_dlc2len(pucan_msg_get_dlc(msg)); /* if this frame is an echo, */ if (rx_msg_flags & PUCAN_MSG_LOOPED_BACK) { diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index 48575900adb7..711ef4996b48 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -659,7 +659,7 @@ static void rcar_can_rx_pkt(struct rcar_can_priv *priv) cf->can_id = (data >> RCAR_CAN_SID_SHIFT) & CAN_SFF_MASK; dlc = readb(&priv->regs->mb[RCAR_CAN_RX_FIFO_MBX].dlc); - cf->can_dlc = get_can_dlc(dlc); + cf->can_dlc = can_cc_dlc2len(dlc); if (data & RCAR_CAN_RTR) { cf->can_id |= CAN_RTR_FLAG; } else { diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index de59dd6aad29..899a3218ce5e 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1448,7 +1448,7 @@ static void rcar_canfd_rx_pkt(struct rcar_canfd_channel *priv) if (sts & RCANFD_RFFDSTS_RFFDF) cf->len = can_dlc2len(RCANFD_RFPTR_RFDLC(dlc)); else - cf->len = get_can_dlc(RCANFD_RFPTR_RFDLC(dlc)); + cf->len = can_cc_dlc2len(RCANFD_RFPTR_RFDLC(dlc)); if (sts & RCANFD_RFFDSTS_RFESI) { cf->flags |= CANFD_ESI; @@ -1464,7 +1464,7 @@ static void rcar_canfd_rx_pkt(struct rcar_canfd_channel *priv) rcar_canfd_get_data(priv, cf, RCANFD_F_RFDF(ridx, 0)); } } else { - cf->len = get_can_dlc(RCANFD_RFPTR_RFDLC(dlc)); + cf->len = can_cc_dlc2len(RCANFD_RFPTR_RFDLC(dlc)); if (id & RCANFD_RFID_RFRTR) cf->can_id |= CAN_RTR_FLAG; else diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index 9f107798f904..1f188f2d126e 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -367,7 +367,7 @@ static void sja1000_rx(struct net_device *dev) | (priv->read_reg(priv, SJA1000_ID2) >> 5); } - cf->can_dlc = get_can_dlc(fi & 0x0F); + cf->can_dlc = can_cc_dlc2len(fi & 0x0F); if (fi & SJA1000_FI_RTR) { id |= CAN_RTR_FLAG; } else { diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c index 9d2faaa39ce4..39e8275ee7ba 100644 --- a/drivers/net/can/softing/softing_main.c +++ b/drivers/net/can/softing/softing_main.c @@ -261,7 +261,7 @@ static int softing_handle_1(struct softing *card) } else { if (cmd & CMD_RTR) msg.can_id |= CAN_RTR_FLAG; - msg.can_dlc = get_can_dlc(*ptr++); + msg.can_dlc = can_cc_dlc2len(*ptr++); if (cmd & CMD_XTD) { msg.can_id |= CAN_EFF_FLAG; msg.can_id |= le32_to_cpup((void *)ptr); diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c index 73d48c3b8ded..728f34b696a7 100644 --- a/drivers/net/can/spi/hi311x.c +++ b/drivers/net/can/spi/hi311x.c @@ -341,7 +341,7 @@ static void hi3110_hw_rx(struct spi_device *spi) } /* Data length */ - frame->can_dlc = get_can_dlc(buf[HI3110_FIFO_WOTIME_DLC_OFF] & 0x0F); 
+ frame->can_dlc = can_cc_dlc2len(buf[HI3110_FIFO_WOTIME_DLC_OFF] & 0x0F); if (buf[HI3110_FIFO_WOTIME_ID_OFF + 3] & HI3110_FIFO_WOTIME_ID_RTR) frame->can_id |= CAN_RTR_FLAG; diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index 22d814ae4edc..6ddebb1c4a24 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -664,7 +664,7 @@ static void mcp251x_hw_rx_frame(struct spi_device *spi, u8 *buf, for (i = 1; i < RXBDAT_OFF; i++) buf[i] = mcp251x_read_reg(spi, RXBCTRL(buf_idx) + i); - len = get_can_dlc(buf[RXBDLC_OFF] & RXBDLC_LEN_MASK); + len = can_cc_dlc2len(buf[RXBDLC_OFF] & RXBDLC_LEN_MASK); for (; i < (RXBDAT_OFF + len); i++) buf[i] = mcp251x_read_reg(spi, RXBCTRL(buf_idx) + i); } else { @@ -720,7 +720,7 @@ static void mcp251x_hw_rx(struct spi_device *spi, int buf_idx) frame->can_id |= CAN_RTR_FLAG; } /* Data length */ - frame->can_dlc = get_can_dlc(buf[RXBDLC_OFF] & RXBDLC_LEN_MASK); + frame->can_dlc = can_cc_dlc2len(buf[RXBDLC_OFF] & RXBDLC_LEN_MASK); memcpy(frame->data, buf + RXBDAT_OFF, frame->can_dlc); priv->net->stats.rx_packets++; diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 9c215f7c5f81..c0a08400f444 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1410,7 +1410,7 @@ mcp251xfd_hw_rx_obj_to_skb(const struct mcp251xfd_priv *priv, if (hw_rx_obj->flags & MCP251XFD_OBJ_FLAGS_RTR) cfd->can_id |= CAN_RTR_FLAG; - cfd->len = get_can_dlc(FIELD_GET(MCP251XFD_OBJ_FLAGS_DLC, + cfd->len = can_cc_dlc2len(FIELD_GET(MCP251XFD_OBJ_FLAGS_DLC, hw_rx_obj->flags)); } diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index e2c6cf4b2228..0e2569779895 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -475,7 +475,7 @@ static void sun4i_can_rx(struct net_device *dev) return; fi = readl(priv->base + SUN4I_REG_BUF0_ADDR); - cf->can_dlc = get_can_dlc(fi & 0x0F); + cf->can_dlc = can_cc_dlc2len(fi & 0x0F); if (fi & SUN4I_MSG_EFF_FLAG) { dreg = SUN4I_REG_BUF5_ADDR; id = (readl(priv->base + SUN4I_REG_BUF1_ADDR) << 21) | diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index 2c22f40e12bd..0b1fd34f4c83 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -566,7 +566,7 @@ static struct sk_buff *ti_hecc_mailbox_read(struct can_rx_offload *offload, data = hecc_read_mbx(priv, mbxno, HECC_CANMCF); if (data & HECC_CANMCF_RTR) cf->can_id |= CAN_RTR_FLAG; - cf->can_dlc = get_can_dlc(data & 0xF); + cf->can_dlc = can_cc_dlc2len(data & 0xF); data = hecc_read_mbx(priv, mbxno, HECC_CANMDL); *(__be32 *)(cf->data) = cpu_to_be32(data); diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index 4f52810bebf8..288781934149 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -306,7 +306,7 @@ static void ems_usb_rx_can_msg(struct ems_usb *dev, struct ems_cpc_msg *msg) return; cf->can_id = le32_to_cpu(msg->msg.can_msg.id); - cf->can_dlc = get_can_dlc(msg->msg.can_msg.length & 0xF); + cf->can_dlc = can_cc_dlc2len(msg->msg.can_msg.length & 0xF); if (msg->type == CPC_MSG_TYPE_EXT_CAN_FRAME || msg->type == CPC_MSG_TYPE_EXT_RTR_FRAME) diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c index b5d7ed21d7d9..72999de550d1 100644 --- a/drivers/net/can/usb/esd_usb2.c +++ b/drivers/net/can/usb/esd_usb2.c @@ -321,7 +321,7 @@ static void esd_usb2_rx_can_msg(struct esd_usb2_net_priv *priv, } cf->can_id = id 
& ESD_IDMASK; - cf->can_dlc = get_can_dlc(msg->msg.rx.dlc & ~ESD_RTR); + cf->can_dlc = can_cc_dlc2len(msg->msg.rx.dlc & ~ESD_RTR); if (id & ESD_EXTID) cf->can_id |= CAN_EFF_FLAG; diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 3005157059ca..b1729b208788 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -331,7 +331,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) cf->can_id = hf->can_id; - cf->can_dlc = get_can_dlc(hf->can_dlc); + cf->can_dlc = can_cc_dlc2len(hf->can_dlc); memcpy(cf->data, hf->data, 8); /* ERROR frames tell us information about the controller */ diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c index 218fadc91155..493a614bf333 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c @@ -1180,7 +1180,7 @@ static void kvaser_usb_hydra_rx_msg_std(const struct kvaser_usb *dev, if (flags & KVASER_USB_HYDRA_CF_FLAG_OVERRUN) kvaser_usb_can_rx_over_error(priv->netdev); - cf->can_dlc = get_can_dlc(cmd->rx_can.dlc); + cf->can_dlc = can_cc_dlc2len(cmd->rx_can.dlc); if (flags & KVASER_USB_HYDRA_CF_FLAG_REMOTE_FRAME) cf->can_id |= CAN_RTR_FLAG; @@ -1257,7 +1257,7 @@ static void kvaser_usb_hydra_rx_msg_ext(const struct kvaser_usb *dev, if (flags & KVASER_USB_HYDRA_CF_FLAG_ESI) cf->flags |= CANFD_ESI; } else { - cf->len = get_can_dlc(dlc); + cf->len = can_cc_dlc2len(dlc); } if (flags & KVASER_USB_HYDRA_CF_FLAG_REMOTE_FRAME) diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c index 1b9957f12459..916ab994cce4 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c @@ -978,7 +978,7 @@ static void kvaser_usb_leaf_rx_can_msg(const struct kvaser_usb *dev, else cf->can_id &= CAN_SFF_MASK; - cf->can_dlc = get_can_dlc(cmd->u.leaf.log_message.dlc); + cf->can_dlc = can_cc_dlc2len(cmd->u.leaf.log_message.dlc); if (cmd->u.leaf.log_message.flags & MSG_FLAG_REMOTE_FRAME) cf->can_id |= CAN_RTR_FLAG; @@ -996,7 +996,7 @@ static void kvaser_usb_leaf_rx_can_msg(const struct kvaser_usb *dev, cf->can_id |= CAN_EFF_FLAG; } - cf->can_dlc = get_can_dlc(rx_data[5]); + cf->can_dlc = can_cc_dlc2len(rx_data[5]); if (cmd->u.rx_can_header.flag & MSG_FLAG_REMOTE_FRAME) cf->can_id |= CAN_RTR_FLAG; diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index e97f2e0da6b0..295886c6b565 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -451,7 +451,7 @@ static void mcba_usb_process_can(struct mcba_priv *priv, if (msg->dlc & MCBA_DLC_RTR_MASK) cf->can_id |= CAN_RTR_FLAG; - cf->can_dlc = get_can_dlc(msg->dlc & MCBA_DLC_MASK); + cf->can_dlc = can_cc_dlc2len(msg->dlc & MCBA_DLC_MASK); memcpy(cf->data, msg->data, cf->can_dlc); diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c index 63bd2ed96697..f7fc82203489 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb.c @@ -734,7 +734,7 @@ static int pcan_usb_decode_data(struct pcan_usb_msg_context *mc, u8 status_len) cf->can_id = le16_to_cpu(tmp16) >> 5; } - cf->can_dlc = get_can_dlc(rec_len); + cf->can_dlc = can_cc_dlc2len(rec_len); /* Only first packet timestamp is a word */ if (pcan_usb_decode_ts(mc, !mc->rec_ts_idx)) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index 
d29d20525588..1f08dd22b3d5 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -499,7 +499,7 @@ static int pcan_usb_fd_decode_canmsg(struct pcan_usb_fd_if *usb_if, if (!skb) return -ENOMEM; - cfd->len = get_can_dlc(pucan_msg_get_dlc(rm)); + cfd->len = can_cc_dlc2len(pucan_msg_get_dlc(rm)); } cfd->can_id = le32_to_cpu(rm->can_id); diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c index dc5290b36598..072058c6f6e8 100644 --- a/drivers/net/can/usb/ucan.c +++ b/drivers/net/can/usb/ucan.c @@ -303,12 +303,12 @@ struct ucan_priv { struct ucan_urb_context *context_array; }; -static u8 ucan_get_can_dlc(struct ucan_can_msg *msg, u16 len) +static u8 ucan_can_cc_dlc2len(struct ucan_can_msg *msg, u16 len) { if (le32_to_cpu(msg->id) & CAN_RTR_FLAG) - return get_can_dlc(msg->dlc); + return can_cc_dlc2len(msg->dlc); else - return get_can_dlc(len - (UCAN_IN_HDR_SIZE + sizeof(msg->id))); + return can_cc_dlc2len(len - (UCAN_IN_HDR_SIZE + sizeof(msg->id))); } static void ucan_release_context_array(struct ucan_priv *up) @@ -614,7 +614,7 @@ static void ucan_rx_can_msg(struct ucan_priv *up, struct ucan_message_in *m) cf->can_id = canid; /* compute DLC taking RTR_FLAG into account */ - cf->can_dlc = ucan_get_can_dlc(&m->msg.can_msg, len); + cf->can_dlc = ucan_can_cc_dlc2len(&m->msg.can_msg, len); /* copy the payload of non RTR frames */ if (!(cf->can_id & CAN_RTR_FLAG) || (cf->can_id & CAN_ERR_FLAG)) diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 62749c67c959..216c58f8df6e 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -470,7 +470,7 @@ static void usb_8dev_rx_can_msg(struct usb_8dev_priv *priv, return; cf->can_id = be32_to_cpu(msg->id); - cf->can_dlc = get_can_dlc(msg->dlc & 0xF); + cf->can_dlc = can_cc_dlc2len(msg->dlc & 0xF); if (msg->flags & USB_8DEV_EXTID) cf->can_id |= CAN_EFF_FLAG; diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 48d746e18f30..73e8b1df1071 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -759,7 +759,7 @@ static int xcan_rx(struct net_device *ndev, int frame_base) XCAN_DLCR_DLC_SHIFT; /* Change Xilinx CAN data length format to socketCAN data format */ - cf->can_dlc = get_can_dlc(dlc); + cf->can_dlc = can_cc_dlc2len(dlc); /* Change Xilinx CAN ID format to socketCAN ID format */ if (id_xcan & XCAN_IDR_IDE_MASK) { @@ -835,7 +835,7 @@ static int xcanfd_rx(struct net_device *ndev, int frame_base) cf->len = can_dlc2len((dlc & XCAN_DLCR_DLC_MASK) >> XCAN_DLCR_DLC_SHIFT); else - cf->len = get_can_dlc((dlc & XCAN_DLCR_DLC_MASK) >> + cf->len = can_cc_dlc2len((dlc & XCAN_DLCR_DLC_MASK) >> XCAN_DLCR_DLC_SHIFT); /* Change Xilinx CAN ID format to socketCAN ID format */ diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 41ff31795320..9bc84c6978ec 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -98,14 +98,14 @@ static inline unsigned int can_bit_time(const struct can_bittiming *bt) } /* - * get_can_dlc(value) - helper macro to cast a given data length code (dlc) - * to u8 and ensure the dlc value to be max. 8 bytes. + * can_cc_dlc2len(value) - convert a given data length code (dlc) of a + * Classical CAN frame into a valid data length of max. 8 bytes. 
* * To be used in the CAN netdriver receive path to ensure conformance with * ISO 11898-1 Chapter 8.4.2.3 (DLC field) */ -#define get_can_dlc(i) (min_t(u8, (i), CAN_MAX_DLC)) -#define get_canfd_dlc(i) (min_t(u8, (i), CANFD_MAX_DLC)) +#define can_cc_dlc2len(dlc) (min_t(u8, (dlc), CAN_MAX_DLEN)) +#define get_canfd_dlc(dlc) (min_t(u8, (dlc), CANFD_MAX_DLC)) /* Check for outgoing skbs that have not been created by the CAN subsystem */ static inline bool can_skb_headroom_valid(struct net_device *dev, -- cgit From cd1124e76d740327be5d8f9ce3785ce1119daf4b Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 10 Nov 2020 11:18:47 +0100 Subject: can: remove obsolete get_canfd_dlc() macro The macro was always used together with can_dlc2len() which sanitizes the given dlc value on its own. Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20201110101852.1973-4-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan.c | 2 +- drivers/net/can/peak_canfd/peak_canfd.c | 2 +- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 2 +- drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c | 2 +- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 2 +- include/linux/can/dev.h | 1 - include/linux/can/dev/peak_canfd.h | 2 +- 7 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 50ccd18170c8..985569f946e5 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -1000,7 +1000,7 @@ static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload, cfd->can_id = (reg_id >> 18) & CAN_SFF_MASK; if (reg_ctrl & FLEXCAN_MB_CNT_EDL) { - cfd->len = can_dlc2len(get_canfd_dlc((reg_ctrl >> 16) & 0xf)); + cfd->len = can_dlc2len((reg_ctrl >> 16) & 0xf); if (reg_ctrl & FLEXCAN_MB_CNT_BRS) cfd->flags |= CANFD_BRS; diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index 9ea2adea3f0f..c6077e07214e 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -257,7 +257,7 @@ static int pucan_handle_can_rx(struct peak_canfd_priv *priv, u8 cf_len; if (rx_msg_flags & PUCAN_MSG_EXT_DATA_LEN) - cf_len = can_dlc2len(get_canfd_dlc(pucan_msg_get_dlc(msg))); + cf_len = can_dlc2len(pucan_msg_get_dlc(msg)); else cf_len = can_cc_dlc2len(pucan_msg_get_dlc(msg)); diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index c0a08400f444..3bac7274ee5b 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1405,7 +1405,7 @@ mcp251xfd_hw_rx_obj_to_skb(const struct mcp251xfd_priv *priv, cfd->flags |= CANFD_BRS; dlc = FIELD_GET(MCP251XFD_OBJ_FLAGS_DLC, hw_rx_obj->flags); - cfd->len = can_dlc2len(get_canfd_dlc(dlc)); + cfd->len = can_dlc2len(dlc); } else { if (hw_rx_obj->flags & MCP251XFD_OBJ_FLAGS_RTR) cfd->can_id |= CAN_RTR_FLAG; diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c index 493a614bf333..843e2e1392e8 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c @@ -1251,7 +1251,7 @@ static void kvaser_usb_hydra_rx_msg_ext(const struct kvaser_usb *dev, kvaser_usb_can_rx_over_error(priv->netdev); if (flags & KVASER_USB_HYDRA_CF_FLAG_FDF) { - cf->len = can_dlc2len(get_canfd_dlc(dlc)); + cf->len = can_dlc2len(dlc); if (flags & KVASER_USB_HYDRA_CF_FLAG_BRS) cf->flags |= 
CANFD_BRS; if (flags & KVASER_USB_HYDRA_CF_FLAG_ESI) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index 1f08dd22b3d5..1233ef20646a 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -492,7 +492,7 @@ static int pcan_usb_fd_decode_canmsg(struct pcan_usb_fd_if *usb_if, if (rx_msg_flags & PUCAN_MSG_ERROR_STATE_IND) cfd->flags |= CANFD_ESI; - cfd->len = can_dlc2len(get_canfd_dlc(pucan_msg_get_dlc(rm))); + cfd->len = can_dlc2len(pucan_msg_get_dlc(rm)); } else { /* CAN 2.0 frame case */ skb = alloc_can_skb(netdev, (struct can_frame **)&cfd); diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 9bc84c6978ec..802606e36b58 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -105,7 +105,6 @@ static inline unsigned int can_bit_time(const struct can_bittiming *bt) * ISO 11898-1 Chapter 8.4.2.3 (DLC field) */ #define can_cc_dlc2len(dlc) (min_t(u8, (dlc), CAN_MAX_DLEN)) -#define get_canfd_dlc(dlc) (min_t(u8, (dlc), CANFD_MAX_DLC)) /* Check for outgoing skbs that have not been created by the CAN subsystem */ static inline bool can_skb_headroom_valid(struct net_device *dev, diff --git a/include/linux/can/dev/peak_canfd.h b/include/linux/can/dev/peak_canfd.h index 5fd627e9da19..f38772fd0c07 100644 --- a/include/linux/can/dev/peak_canfd.h +++ b/include/linux/can/dev/peak_canfd.h @@ -282,7 +282,7 @@ static inline int pucan_msg_get_channel(const struct pucan_rx_msg *msg) } /* return the dlc value from any received message channel_dlc field */ -static inline int pucan_msg_get_dlc(const struct pucan_rx_msg *msg) +static inline u8 pucan_msg_get_dlc(const struct pucan_rx_msg *msg) { return msg->channel_dlc >> 4; } -- cgit From 964db79d6c186cc2ecc6ae46f98eed7e0ea8cf71 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Thu, 19 Nov 2020 18:53:55 +0100 Subject: of/address: Introduce of_dma_get_max_cpu_address() Introduce of_dma_get_max_cpu_address(), which provides the highest CPU physical address addressable by all DMA masters in the system. It's specially useful for setting memory zones sizes at early boot time. Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20201119175400.9995-4-nsaenzjulienne@suse.de Signed-off-by: Catalin Marinas --- drivers/of/address.c | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/of.h | 7 +++++++ 2 files changed, 49 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/address.c b/drivers/of/address.c index eb9ab4f1e80b..09c0af7fd1c4 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -1024,6 +1024,48 @@ out: } #endif /* CONFIG_HAS_DMA */ +/** + * of_dma_get_max_cpu_address - Gets highest CPU address suitable for DMA + * @np: The node to start searching from or NULL to start from the root + * + * Gets the highest CPU physical address that is addressable by all DMA masters + * in the sub-tree pointed by np, or the whole tree if NULL is passed. If no + * DMA constrained device is found, it returns PHYS_ADDR_MAX. 
+ */ +phys_addr_t __init of_dma_get_max_cpu_address(struct device_node *np) +{ + phys_addr_t max_cpu_addr = PHYS_ADDR_MAX; + struct of_range_parser parser; + phys_addr_t subtree_max_addr; + struct device_node *child; + struct of_range range; + const __be32 *ranges; + u64 cpu_end = 0; + int len; + + if (!np) + np = of_root; + + ranges = of_get_property(np, "dma-ranges", &len); + if (ranges && len) { + of_dma_range_parser_init(&parser, np); + for_each_of_range(&parser, &range) + if (range.cpu_addr + range.size > cpu_end) + cpu_end = range.cpu_addr + range.size - 1; + + if (max_cpu_addr > cpu_end) + max_cpu_addr = cpu_end; + } + + for_each_available_child_of_node(np, child) { + subtree_max_addr = of_dma_get_max_cpu_address(child); + if (max_cpu_addr > subtree_max_addr) + max_cpu_addr = subtree_max_addr; + } + + return max_cpu_addr; +} + /** * of_dma_is_coherent - Check if device is coherent * @np: device node diff --git a/include/linux/of.h b/include/linux/of.h index 5d51891cbf1a..9ed5b8532c30 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -558,6 +558,8 @@ int of_map_id(struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out); +phys_addr_t of_dma_get_max_cpu_address(struct device_node *np); + #else /* CONFIG_OF */ static inline void of_core_init(void) @@ -995,6 +997,11 @@ static inline int of_map_id(struct device_node *np, u32 id, return -EINVAL; } +static inline phys_addr_t of_dma_get_max_cpu_address(struct device_node *np) +{ + return PHYS_ADDR_MAX; +} + #define of_match_ptr(_ptr) NULL #define of_match_node(_matches, _node) NULL #endif /* CONFIG_OF */ -- cgit From 2b8652936f0ca9ca2e6c984ae76c7bfcda1b3f22 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 19 Nov 2020 18:53:58 +0100 Subject: arm64: mm: Set ZONE_DMA size based on early IORT scan We recently introduced a 1 GB sized ZONE_DMA to cater for platforms incorporating masters that can address less than 32 bits of DMA, in particular the Raspberry Pi 4, which has 4 or 8 GB of DRAM, but has peripherals that can only address up to 1 GB (and its PCIe host bridge can only access the bottom 3 GB) Instructing the DMA layer about these limitations is straight-forward, even though we had to fix some issues regarding memory limits set in the IORT for named components, and regarding the handling of ACPI _DMA methods. However, the DMA layer also needs to be able to allocate memory that is guaranteed to meet those DMA constraints, for bounce buffering as well as allocating the backing for consistent mappings. This is why the 1 GB ZONE_DMA was introduced recently. Unfortunately, it turns out the having a 1 GB ZONE_DMA as well as a ZONE_DMA32 causes problems with kdump, and potentially in other places where allocations cannot cross zone boundaries. Therefore, we should avoid having two separate DMA zones when possible. So let's do an early scan of the IORT, and only create the ZONE_DMA if we encounter any devices that need it. This puts the burden on the firmware to describe such limitations in the IORT, which may be redundant (and less precise) if _DMA methods are also being provided. However, it should be noted that this situation is highly unusual for arm64 ACPI machines. Also, the DMA subsystem still gives precedence to the _DMA method if implemented, and so we will not lose the ability to perform streaming DMA outside the ZONE_DMA if the _DMA method permits it. 
[nsaenz: unified implementation with DT's counterpart] Signed-off-by: Ard Biesheuvel Signed-off-by: Nicolas Saenz Julienne Tested-by: Jeremy Linton Acked-by: Lorenzo Pieralisi Acked-by: Hanjun Guo Cc: Jeremy Linton Cc: Lorenzo Pieralisi Cc: Nicolas Saenz Julienne Cc: Rob Herring Cc: Christoph Hellwig Cc: Robin Murphy Cc: Hanjun Guo Cc: Sudeep Holla Cc: Anshuman Khandual Link: https://lore.kernel.org/r/20201119175400.9995-7-nsaenzjulienne@suse.de Signed-off-by: Catalin Marinas --- arch/arm64/mm/init.c | 5 ++++- drivers/acpi/arm64/iort.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi_iort.h | 4 ++++ 3 files changed, 63 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index b736890ad3d3..5e534d674c9c 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -193,11 +194,13 @@ static phys_addr_t __init max_zone_phys(unsigned int zone_bits) static void __init zone_sizes_init(unsigned long min, unsigned long max) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; + unsigned int __maybe_unused acpi_zone_dma_bits; unsigned int __maybe_unused dt_zone_dma_bits; #ifdef CONFIG_ZONE_DMA + acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); - zone_dma_bits = min(32U, dt_zone_dma_bits); + zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 9929ff50c0c0..1787406684aa 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1718,3 +1718,58 @@ void __init acpi_iort_init(void) iort_init_platform_devices(); } + +#ifdef CONFIG_ZONE_DMA +/* + * Extract the highest CPU physical address accessible to all DMA masters in + * the system. PHYS_ADDR_MAX is returned when no constrained device is found. 
+ */ +phys_addr_t __init acpi_iort_dma_get_max_cpu_address(void) +{ + phys_addr_t limit = PHYS_ADDR_MAX; + struct acpi_iort_node *node, *end; + struct acpi_table_iort *iort; + acpi_status status; + int i; + + if (acpi_disabled) + return limit; + + status = acpi_get_table(ACPI_SIG_IORT, 0, + (struct acpi_table_header **)&iort); + if (ACPI_FAILURE(status)) + return limit; + + node = ACPI_ADD_PTR(struct acpi_iort_node, iort, iort->node_offset); + end = ACPI_ADD_PTR(struct acpi_iort_node, iort, iort->header.length); + + for (i = 0; i < iort->node_count; i++) { + if (node >= end) + break; + + switch (node->type) { + struct acpi_iort_named_component *ncomp; + struct acpi_iort_root_complex *rc; + phys_addr_t local_limit; + + case ACPI_IORT_NODE_NAMED_COMPONENT: + ncomp = (struct acpi_iort_named_component *)node->node_data; + local_limit = DMA_BIT_MASK(ncomp->memory_address_limit); + limit = min_not_zero(limit, local_limit); + break; + + case ACPI_IORT_NODE_PCI_ROOT_COMPLEX: + if (node->revision < 1) + break; + + rc = (struct acpi_iort_root_complex *)node->node_data; + local_limit = DMA_BIT_MASK(rc->memory_address_limit); + limit = min_not_zero(limit, local_limit); + break; + } + node = ACPI_ADD_PTR(struct acpi_iort_node, node, node->length); + } + acpi_put_table(&iort->header); + return limit; +} +#endif diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 20a32120bb88..1a12baa58e40 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -38,6 +38,7 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size); const struct iommu_ops *iort_iommu_configure_id(struct device *dev, const u32 *id_in); int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); +phys_addr_t acpi_iort_dma_get_max_cpu_address(void); #else static inline void acpi_iort_init(void) { } static inline u32 iort_msi_map_id(struct device *dev, u32 id) @@ -55,6 +56,9 @@ static inline const struct iommu_ops *iort_iommu_configure_id( static inline int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) { return 0; } + +static inline phys_addr_t acpi_iort_dma_get_max_cpu_address(void) +{ return PHYS_ADDR_MAX; } #endif #endif /* __ACPI_IORT_H__ */ -- cgit From 04435217f96869ac3a8f055ff68c5237a60bcd7e Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Thu, 19 Nov 2020 18:53:59 +0100 Subject: mm: Remove examples from enum zone_type comment We can't really list every setup in common code. On top of that they are unlikely to stay true for long as things change in the arch trees independently of this comment. Suggested-by: Christoph Hellwig Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20201119175400.9995-8-nsaenzjulienne@suse.de Signed-off-by: Catalin Marinas --- include/linux/mmzone.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fb3bf696c05e..9d0c454d23cd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -354,26 +354,6 @@ enum zone_type { * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. - * - * Some examples: - * - * - i386 and x86_64 have a fixed 16M ZONE_DMA and ZONE_DMA32 for the - * rest of the lower 4G. - * - * - arm only uses ZONE_DMA, the size, up to 4G, may vary depending on - * the specific device. 
- * - * - arm64 has a fixed 1G ZONE_DMA and ZONE_DMA32 for the rest of the - * lower 4G. - * - * - powerpc only uses ZONE_DMA, the size, up to 2G, may vary - * depending on the specific device. - * - * - s390 uses ZONE_DMA fixed to the lower 2G. - * - * - ia64 and riscv only use ZONE_DMA32. - * - * - parisc uses neither. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, -- cgit From 607a4672b458b12674b96724e2f9bd42a5e928c6 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 20 Nov 2020 10:55:17 +0000 Subject: firmware: arm_scmi: Add full list of sensor type enumeration SCMI v2.0 provides a big list of sensor type enumeration from the sensorUnits enumeration table of Distributed Management Task Force(DMTF) specification number DSP 0248 (Platform Level Data Model for Platform Monitoring and Control Specification). It is however not an exact replica of the sensorUnits enumeration table. Let us just update the table as per SCMI v2.0 specification. Link: https://lore.kernel.org/r/20201119174906.43862-3-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 81 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 9cd312a1ff92..13d75956aa91 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -163,11 +163,92 @@ struct scmi_sensor_info { */ enum scmi_sensor_class { NONE = 0x0, + UNSPEC = 0x1, TEMPERATURE_C = 0x2, + TEMPERATURE_F = 0x3, + TEMPERATURE_K = 0x4, VOLTAGE = 0x5, CURRENT = 0x6, POWER = 0x7, ENERGY = 0x8, + CHARGE = 0x9, + VOLTAMPERE = 0xA, + NITS = 0xB, + LUMENS = 0xC, + LUX = 0xD, + CANDELAS = 0xE, + KPA = 0xF, + PSI = 0x10, + NEWTON = 0x11, + CFM = 0x12, + RPM = 0x13, + HERTZ = 0x14, + SECS = 0x15, + MINS = 0x16, + HOURS = 0x17, + DAYS = 0x18, + WEEKS = 0x19, + MILS = 0x1A, + INCHES = 0x1B, + FEET = 0x1C, + CUBIC_INCHES = 0x1D, + CUBIC_FEET = 0x1E, + METERS = 0x1F, + CUBIC_CM = 0x20, + CUBIC_METERS = 0x21, + LITERS = 0x22, + FLUID_OUNCES = 0x23, + RADIANS = 0x24, + STERADIANS = 0x25, + REVOLUTIONS = 0x26, + CYCLES = 0x27, + GRAVITIES = 0x28, + OUNCES = 0x29, + POUNDS = 0x2A, + FOOT_POUNDS = 0x2B, + OUNCE_INCHES = 0x2C, + GAUSS = 0x2D, + GILBERTS = 0x2E, + HENRIES = 0x2F, + FARADS = 0x30, + OHMS = 0x31, + SIEMENS = 0x32, + MOLES = 0x33, + BECQUERELS = 0x34, + PPM = 0x35, + DECIBELS = 0x36, + DBA = 0x37, + DBC = 0x38, + GRAYS = 0x39, + SIEVERTS = 0x3A, + COLOR_TEMP_K = 0x3B, + BITS = 0x3C, + BYTES = 0x3D, + WORDS = 0x3E, + DWORDS = 0x3F, + QWORDS = 0x40, + PERCENTAGE = 0x41, + PASCALS = 0x42, + COUNTS = 0x43, + GRAMS = 0x44, + NEWTON_METERS = 0x45, + HITS = 0x46, + MISSES = 0x47, + RETRIES = 0x48, + OVERRUNS = 0x49, + UNDERRUNS = 0x4A, + COLLISIONS = 0x4B, + PACKETS = 0x4C, + MESSAGES = 0x4D, + CHARS = 0x4E, + ERRORS = 0x4F, + CORRECTED_ERRS = 0x50, + UNCORRECTABLE_ERRS = 0x51, + SQ_MILS = 0x52, + SQ_INCHES = 0x53, + SQ_FEET = 0x54, + SQ_CM = 0x55, + SQ_METERS = 0x56, }; /** -- cgit From 1fe00b8b4276ddf335216f884cb719edbea129e1 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Thu, 19 Nov 2020 17:49:02 +0000 Subject: firmware: arm_scmi: Add SCMI v3.0 sensors descriptors extensions Add support for new SCMI v3.0 Sensors extensions related to new sensors' features, like multiple axis and update intervals, while keeping compatibility with SCMI v2.0 features. 
While at that, refactor and simplify all the internal helpers macros and move struct scmi_sensor_info to use only non-fixed-size typing. Link: https://lore.kernel.org/r/20201119174906.43862-3-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/sensors.c | 390 ++++++++++++++++++++++++++++++++++-- include/linux/scmi_protocol.h | 139 ++++++++++++- 2 files changed, 504 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c index 6aaff478d032..a85827f60a02 100644 --- a/drivers/firmware/arm_scmi/sensors.c +++ b/drivers/firmware/arm_scmi/sensors.c @@ -7,16 +7,22 @@ #define pr_fmt(fmt) "SCMI Notifications SENSOR - " fmt +#include #include #include "common.h" #include "notify.h" +#define SCMI_MAX_NUM_SENSOR_AXIS 63 +#define SCMIv2_SENSOR_PROTOCOL 0x10000 + enum scmi_sensor_protocol_cmd { SENSOR_DESCRIPTION_GET = 0x3, SENSOR_TRIP_POINT_NOTIFY = 0x4, SENSOR_TRIP_POINT_CONFIG = 0x5, SENSOR_READING_GET = 0x6, + SENSOR_AXIS_DESCRIPTION_GET = 0x7, + SENSOR_LIST_UPDATE_INTERVALS = 0x8, }; struct scmi_msg_resp_sensor_attributes { @@ -28,23 +34,100 @@ struct scmi_msg_resp_sensor_attributes { __le32 reg_size; }; +/* v3 attributes_low macros */ +#define SUPPORTS_UPDATE_NOTIFY(x) FIELD_GET(BIT(30), (x)) +#define SENSOR_TSTAMP_EXP(x) FIELD_GET(GENMASK(14, 10), (x)) +#define SUPPORTS_TIMESTAMP(x) FIELD_GET(BIT(9), (x)) +#define SUPPORTS_EXTEND_ATTRS(x) FIELD_GET(BIT(8), (x)) + +/* v2 attributes_high macros */ +#define SENSOR_UPDATE_BASE(x) FIELD_GET(GENMASK(31, 27), (x)) +#define SENSOR_UPDATE_SCALE(x) FIELD_GET(GENMASK(26, 22), (x)) + +/* v3 attributes_high macros */ +#define SENSOR_AXIS_NUMBER(x) FIELD_GET(GENMASK(21, 16), (x)) +#define SUPPORTS_AXIS(x) FIELD_GET(BIT(8), (x)) + +/* v3 resolution macros */ +#define SENSOR_RES(x) FIELD_GET(GENMASK(26, 0), (x)) +#define SENSOR_RES_EXP(x) FIELD_GET(GENMASK(31, 27), (x)) + +struct scmi_msg_resp_attrs { + __le32 min_range_low; + __le32 min_range_high; + __le32 max_range_low; + __le32 max_range_high; +}; + struct scmi_msg_resp_sensor_description { __le16 num_returned; __le16 num_remaining; - struct { + struct scmi_sensor_descriptor { + __le32 id; + __le32 attributes_low; +/* Common attributes_low macros */ +#define SUPPORTS_ASYNC_READ(x) FIELD_GET(BIT(31), (x)) +#define NUM_TRIP_POINTS(x) FIELD_GET(GENMASK(7, 0), (x)) + __le32 attributes_high; +/* Common attributes_high macros */ +#define SENSOR_SCALE(x) FIELD_GET(GENMASK(15, 11), (x)) +#define SENSOR_SCALE_SIGN BIT(4) +#define SENSOR_SCALE_EXTEND GENMASK(31, 5) +#define SENSOR_TYPE(x) FIELD_GET(GENMASK(7, 0), (x)) + u8 name[SCMI_MAX_STR_SIZE]; + /* only for version > 2.0 */ + __le32 power; + __le32 resolution; + struct scmi_msg_resp_attrs scalar_attrs; + } desc[]; +}; + +/* Base scmi_sensor_descriptor size excluding extended attrs after name */ +#define SCMI_MSG_RESP_SENS_DESCR_BASE_SZ 28 + +/* Sign extend to a full s32 */ +#define S32_EXT(v) \ + ({ \ + int __v = (v); \ + \ + if (__v & SENSOR_SCALE_SIGN) \ + __v |= SENSOR_SCALE_EXTEND; \ + __v; \ + }) + +struct scmi_msg_sensor_axis_description_get { + __le32 id; + __le32 axis_desc_index; +}; + +struct scmi_msg_resp_sensor_axis_description { + __le32 num_axis_flags; +#define NUM_AXIS_RETURNED(x) FIELD_GET(GENMASK(5, 0), (x)) +#define NUM_AXIS_REMAINING(x) FIELD_GET(GENMASK(31, 26), (x)) + struct scmi_axis_descriptor { __le32 id; __le32 attributes_low; -#define SUPPORTS_ASYNC_READ(x) ((x) & BIT(31)) -#define 
NUM_TRIP_POINTS(x) ((x) & 0xff) __le32 attributes_high; -#define SENSOR_TYPE(x) ((x) & 0xff) -#define SENSOR_SCALE(x) (((x) >> 11) & 0x1f) -#define SENSOR_SCALE_SIGN BIT(4) -#define SENSOR_SCALE_EXTEND GENMASK(7, 5) -#define SENSOR_UPDATE_SCALE(x) (((x) >> 22) & 0x1f) -#define SENSOR_UPDATE_BASE(x) (((x) >> 27) & 0x1f) - u8 name[SCMI_MAX_STR_SIZE]; - } desc[0]; + u8 name[SCMI_MAX_STR_SIZE]; + __le32 resolution; + struct scmi_msg_resp_attrs attrs; + } desc[]; +}; + +/* Base scmi_axis_descriptor size excluding extended attrs after name */ +#define SCMI_MSG_RESP_AXIS_DESCR_BASE_SZ 28 + +struct scmi_msg_sensor_list_update_intervals { + __le32 id; + __le32 index; +}; + +struct scmi_msg_resp_sensor_list_update_intervals { + __le32 num_intervals_flags; +#define NUM_INTERVALS_RETURNED(x) FIELD_GET(GENMASK(11, 0), (x)) +#define SEGMENTED_INTVL_FORMAT(x) FIELD_GET(BIT(12), (x)) +#define NUM_INTERVALS_REMAINING(x) FIELD_GET(GENMASK(31, 16), (x)) + __le32 intervals[]; }; struct scmi_msg_sensor_trip_point_notify { @@ -114,6 +197,194 @@ static int scmi_sensor_attributes_get(const struct scmi_handle *handle, return ret; } +static inline void scmi_parse_range_attrs(struct scmi_range_attrs *out, + struct scmi_msg_resp_attrs *in) +{ + out->min_range = get_unaligned_le64((void *)&in->min_range_low); + out->max_range = get_unaligned_le64((void *)&in->max_range_low); +} + +static int scmi_sensor_update_intervals(const struct scmi_handle *handle, + struct scmi_sensor_info *s) +{ + int ret, cnt; + u32 desc_index = 0; + u16 num_returned, num_remaining; + struct scmi_xfer *ti; + struct scmi_msg_resp_sensor_list_update_intervals *buf; + struct scmi_msg_sensor_list_update_intervals *msg; + + ret = scmi_xfer_get_init(handle, SENSOR_LIST_UPDATE_INTERVALS, + SCMI_PROTOCOL_SENSOR, sizeof(*msg), 0, &ti); + if (ret) + return ret; + + buf = ti->rx.buf; + do { + u32 flags; + + msg = ti->tx.buf; + /* Set the number of sensors to be skipped/already read */ + msg->id = cpu_to_le32(s->id); + msg->index = cpu_to_le32(desc_index); + + ret = scmi_do_xfer(handle, ti); + if (ret) + break; + + flags = le32_to_cpu(buf->num_intervals_flags); + num_returned = NUM_INTERVALS_RETURNED(flags); + num_remaining = NUM_INTERVALS_REMAINING(flags); + + /* + * Max intervals is not declared previously anywhere so we + * assume it's returned+remaining. + */ + if (!s->intervals.count) { + s->intervals.segmented = SEGMENTED_INTVL_FORMAT(flags); + s->intervals.count = num_returned + num_remaining; + /* segmented intervals are reported in one triplet */ + if (s->intervals.segmented && + (num_remaining || num_returned != 3)) { + dev_err(handle->dev, + "Sensor ID:%d advertises an invalid segmented interval (%d)\n", + s->id, s->intervals.count); + s->intervals.segmented = false; + s->intervals.count = 0; + ret = -EINVAL; + break; + } + /* Direct allocation when exceeding pre-allocated */ + if (s->intervals.count >= SCMI_MAX_PREALLOC_POOL) { + s->intervals.desc = + devm_kcalloc(handle->dev, + s->intervals.count, + sizeof(*s->intervals.desc), + GFP_KERNEL); + if (!s->intervals.desc) { + s->intervals.segmented = false; + s->intervals.count = 0; + ret = -ENOMEM; + break; + } + } + } else if (desc_index + num_returned > s->intervals.count) { + dev_err(handle->dev, + "No. 
of update intervals can't exceed %d\n", + s->intervals.count); + ret = -EINVAL; + break; + } + + for (cnt = 0; cnt < num_returned; cnt++) + s->intervals.desc[desc_index + cnt] = + le32_to_cpu(buf->intervals[cnt]); + + desc_index += num_returned; + + scmi_reset_rx_to_maxsz(handle, ti); + /* + * check for both returned and remaining to avoid infinite + * loop due to buggy firmware + */ + } while (num_returned && num_remaining); + + scmi_xfer_put(handle, ti); + return ret; +} + +static int scmi_sensor_axis_description(const struct scmi_handle *handle, + struct scmi_sensor_info *s) +{ + int ret, cnt; + u32 desc_index = 0; + u16 num_returned, num_remaining; + struct scmi_xfer *te; + struct scmi_msg_resp_sensor_axis_description *buf; + struct scmi_msg_sensor_axis_description_get *msg; + + s->axis = devm_kcalloc(handle->dev, s->num_axis, + sizeof(*s->axis), GFP_KERNEL); + if (!s->axis) + return -ENOMEM; + + ret = scmi_xfer_get_init(handle, SENSOR_AXIS_DESCRIPTION_GET, + SCMI_PROTOCOL_SENSOR, sizeof(*msg), 0, &te); + if (ret) + return ret; + + buf = te->rx.buf; + do { + u32 flags; + struct scmi_axis_descriptor *adesc; + + msg = te->tx.buf; + /* Set the number of sensors to be skipped/already read */ + msg->id = cpu_to_le32(s->id); + msg->axis_desc_index = cpu_to_le32(desc_index); + + ret = scmi_do_xfer(handle, te); + if (ret) + break; + + flags = le32_to_cpu(buf->num_axis_flags); + num_returned = NUM_AXIS_RETURNED(flags); + num_remaining = NUM_AXIS_REMAINING(flags); + + if (desc_index + num_returned > s->num_axis) { + dev_err(handle->dev, "No. of axis can't exceed %d\n", + s->num_axis); + break; + } + + adesc = &buf->desc[0]; + for (cnt = 0; cnt < num_returned; cnt++) { + u32 attrh, attrl; + struct scmi_sensor_axis_info *a; + size_t dsize = SCMI_MSG_RESP_AXIS_DESCR_BASE_SZ; + + attrl = le32_to_cpu(adesc->attributes_low); + + a = &s->axis[desc_index + cnt]; + + a->id = le32_to_cpu(adesc->id); + a->extended_attrs = SUPPORTS_EXTEND_ATTRS(attrl); + + attrh = le32_to_cpu(adesc->attributes_high); + a->scale = S32_EXT(SENSOR_SCALE(attrh)); + a->type = SENSOR_TYPE(attrh); + strlcpy(a->name, adesc->name, SCMI_MAX_STR_SIZE); + + if (a->extended_attrs) { + unsigned int ares = + le32_to_cpu(adesc->resolution); + + a->resolution = SENSOR_RES(ares); + a->exponent = + S32_EXT(SENSOR_RES_EXP(ares)); + dsize += sizeof(adesc->resolution); + + scmi_parse_range_attrs(&a->attrs, + &adesc->attrs); + dsize += sizeof(adesc->attrs); + } + + adesc = (typeof(adesc))((u8 *)adesc + dsize); + } + + desc_index += num_returned; + + scmi_reset_rx_to_maxsz(handle, te); + /* + * check for both returned and remaining to avoid infinite + * loop due to buggy firmware + */ + } while (num_returned && num_remaining); + + scmi_xfer_put(handle, te); + return ret; +} + static int scmi_sensor_description_get(const struct scmi_handle *handle, struct sensors_info *si) { @@ -131,9 +402,10 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle, buf = t->rx.buf; do { + struct scmi_sensor_descriptor *sdesc; + /* Set the number of sensors to be skipped/already read */ put_unaligned_le32(desc_index, t->tx.buf); - ret = scmi_do_xfer(handle, t); if (ret) break; @@ -147,22 +419,97 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle, break; } + sdesc = &buf->desc[0]; for (cnt = 0; cnt < num_returned; cnt++) { u32 attrh, attrl; struct scmi_sensor_info *s; + size_t dsize = SCMI_MSG_RESP_SENS_DESCR_BASE_SZ; - attrl = le32_to_cpu(buf->desc[cnt].attributes_low); - attrh = le32_to_cpu(buf->desc[cnt].attributes_high); 
s = &si->sensors[desc_index + cnt]; - s->id = le32_to_cpu(buf->desc[cnt].id); - s->type = SENSOR_TYPE(attrh); - s->scale = SENSOR_SCALE(attrh); - /* Sign extend to a full s8 */ - if (s->scale & SENSOR_SCALE_SIGN) - s->scale |= SENSOR_SCALE_EXTEND; + s->id = le32_to_cpu(sdesc->id); + + attrl = le32_to_cpu(sdesc->attributes_low); + /* common bitfields parsing */ s->async = SUPPORTS_ASYNC_READ(attrl); s->num_trip_points = NUM_TRIP_POINTS(attrl); - strlcpy(s->name, buf->desc[cnt].name, SCMI_MAX_STR_SIZE); + /** + * only SCMIv3.0 specific bitfield below. + * Such bitfields are assumed to be zeroed on non + * relevant fw versions...assuming fw not buggy ! + */ + s->update = SUPPORTS_UPDATE_NOTIFY(attrl); + s->timestamped = SUPPORTS_TIMESTAMP(attrl); + if (s->timestamped) + s->tstamp_scale = + S32_EXT(SENSOR_TSTAMP_EXP(attrl)); + s->extended_scalar_attrs = + SUPPORTS_EXTEND_ATTRS(attrl); + + attrh = le32_to_cpu(sdesc->attributes_high); + /* common bitfields parsing */ + s->scale = S32_EXT(SENSOR_SCALE(attrh)); + s->type = SENSOR_TYPE(attrh); + /* Use pre-allocated pool wherever possible */ + s->intervals.desc = s->intervals.prealloc_pool; + if (si->version == SCMIv2_SENSOR_PROTOCOL) { + s->intervals.segmented = false; + s->intervals.count = 1; + /* + * Convert SCMIv2.0 update interval format to + * SCMIv3.0 to be used as the common exposed + * descriptor, accessible via common macros. + */ + s->intervals.desc[0] = + (SENSOR_UPDATE_BASE(attrh) << 5) | + SENSOR_UPDATE_SCALE(attrh); + } else { + /* + * From SCMIv3.0 update intervals are retrieved + * via a dedicated (optional) command. + * Since the command is optional, on error carry + * on without any update interval. + */ + if (scmi_sensor_update_intervals(handle, s)) + dev_dbg(handle->dev, + "Update Intervals not available for sensor ID:%d\n", + s->id); + } + /** + * only > SCMIv2.0 specific bitfield below. + * Such bitfields are assumed to be zeroed on non + * relevant fw versions...assuming fw not buggy ! + */ + s->num_axis = min_t(unsigned int, + SUPPORTS_AXIS(attrh) ? + SENSOR_AXIS_NUMBER(attrh) : 0, + SCMI_MAX_NUM_SENSOR_AXIS); + strlcpy(s->name, sdesc->name, SCMI_MAX_STR_SIZE); + + if (s->extended_scalar_attrs) { + s->sensor_power = le32_to_cpu(sdesc->power); + dsize += sizeof(sdesc->power); + /* Only for sensors reporting scalar values */ + if (s->num_axis == 0) { + unsigned int sres = + le32_to_cpu(sdesc->resolution); + + s->resolution = SENSOR_RES(sres); + s->exponent = + S32_EXT(SENSOR_RES_EXP(sres)); + dsize += sizeof(sdesc->resolution); + + scmi_parse_range_attrs(&s->scalar_attrs, + &sdesc->scalar_attrs); + dsize += sizeof(sdesc->scalar_attrs); + } + } + if (s->num_axis > 0) { + ret = scmi_sensor_axis_description(handle, s); + if (ret) + goto out; + } + + sdesc = (typeof(sdesc))((u8 *)sdesc + dsize); } desc_index += num_returned; @@ -174,6 +521,7 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle, */ } while (num_returned && num_remaining); +out: scmi_xfer_put(handle, t); return ret; } diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 13d75956aa91..0792b0be25a3 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -8,6 +8,7 @@ #ifndef _LINUX_SCMI_PROTOCOL_H #define _LINUX_SCMI_PROTOCOL_H +#include #include #include #include @@ -148,13 +149,135 @@ struct scmi_power_ops { u32 *state); }; +/** + * scmi_range_attrs - specifies a sensor or axis values' range + * @min_range: The minimum value which can be represented by the sensor/axis. 
+ * @max_range: The maximum value which can be represented by the sensor/axis. + */ +struct scmi_range_attrs { + long long min_range; + long long max_range; +}; + +/** + * scmi_sensor_axis_info - describes one sensor axes + * @id: The axes ID. + * @type: Axes type. Chosen amongst one of @enum scmi_sensor_class. + * @scale: Power-of-10 multiplier applied to the axis unit. + * @name: NULL-terminated string representing axes name as advertised by + * SCMI platform. + * @extended_attrs: Flag to indicate the presence of additional extended + * attributes for this axes. + * @resolution: Extended attribute representing the resolution of the axes. + * Set to 0 if not reported by this axes. + * @exponent: Extended attribute representing the power-of-10 multiplier that + * is applied to the resolution field. Set to 0 if not reported by + * this axes. + * @attrs: Extended attributes representing minimum and maximum values + * measurable by this axes. Set to 0 if not reported by this sensor. + */ +struct scmi_sensor_axis_info { + unsigned int id; + unsigned int type; + int scale; + char name[SCMI_MAX_STR_SIZE]; + bool extended_attrs; + unsigned int resolution; + int exponent; + struct scmi_range_attrs attrs; +}; + +/** + * scmi_sensor_intervals_info - describes number and type of available update + * intervals + * @segmented: Flag for segmented intervals' representation. When True there + * will be exactly 3 intervals in @desc, with each entry + * representing a member of a segment in this order: + * {lowest update interval, highest update interval, step size} + * @count: Number of intervals described in @desc. + * @desc: Array of @count interval descriptor bitmask represented as detailed in + * the SCMI specification: it can be accessed using the accompanying + * macros. + * @prealloc_pool: A minimal preallocated pool of desc entries used to avoid + * lesser-than-64-bytes dynamic allocation for small @count + * values. + */ +struct scmi_sensor_intervals_info { + bool segmented; + unsigned int count; +#define SCMI_SENS_INTVL_SEGMENT_LOW 0 +#define SCMI_SENS_INTVL_SEGMENT_HIGH 1 +#define SCMI_SENS_INTVL_SEGMENT_STEP 2 + unsigned int *desc; +#define SCMI_SENS_INTVL_GET_SECS(x) FIELD_GET(GENMASK(20, 5), (x)) +#define SCMI_SENS_INTVL_GET_EXP(x) \ + ({ \ + int __signed_exp = FIELD_GET(GENMASK(4, 0), (x)); \ + \ + if (__signed_exp & BIT(4)) \ + __signed_exp |= GENMASK(31, 5); \ + __signed_exp; \ + }) +#define SCMI_MAX_PREALLOC_POOL 16 + unsigned int prealloc_pool[SCMI_MAX_PREALLOC_POOL]; +}; + +/** + * struct scmi_sensor_info - represents information related to one of the + * available sensors. + * @id: Sensor ID. + * @type: Sensor type. Chosen amongst one of @enum scmi_sensor_class. + * @scale: Power-of-10 multiplier applied to the sensor unit. + * @num_trip_points: Number of maximum configurable trip points. + * @async: Flag for asynchronous read support. + * @update: Flag for continuouos update notification support. + * @timestamped: Flag for timestamped read support. + * @tstamp_scale: Power-of-10 multiplier applied to the sensor timestamps to + * represent it in seconds. + * @num_axis: Number of supported axis if any. Reported as 0 for scalar sensors. + * @axis: Pointer to an array of @num_axis descriptors. + * @intervals: Descriptor of available update intervals. + * @sensor_config: A bitmask reporting the current sensor configuration as + * detailed in the SCMI specification: it can accessed and + * modified through the accompanying macros. 
+ * @name: NULL-terminated string representing the sensor name as advertised + * by the SCMI platform. + * @extended_scalar_attrs: Flag to indicate the presence of additional extended + * attributes for this sensor. + * @sensor_power: Extended attribute representing the average power + * consumed by the sensor in microwatts (uW) when it is active. + * Reported here only for scalar sensors. + * Set to 0 if not reported by this sensor. + * @resolution: Extended attribute representing the resolution of the sensor. + * Reported here only for scalar sensors. + * Set to 0 if not reported by this sensor. + * @exponent: Extended attribute representing the power-of-10 multiplier that + * is applied to the resolution field. + * Reported here only for scalar sensors. + * Set to 0 if not reported by this sensor. + * @scalar_attrs: Extended attributes representing minimum and maximum values + * measurable by this sensor. + * Reported here only for scalar sensors. + * Set to 0 if not reported by this sensor. + */ struct scmi_sensor_info { - u32 id; - u8 type; - s8 scale; - u8 num_trip_points; + unsigned int id; + unsigned int type; + int scale; + unsigned int num_trip_points; bool async; + bool update; + bool timestamped; + int tstamp_scale; + unsigned int num_axis; + struct scmi_sensor_axis_info *axis; + struct scmi_sensor_intervals_info intervals; char name[SCMI_MAX_STR_SIZE]; + bool extended_scalar_attrs; + unsigned int sensor_power; + unsigned int resolution; + int exponent; + struct scmi_range_attrs scalar_attrs; }; /* @@ -249,6 +372,14 @@ enum scmi_sensor_class { SQ_FEET = 0x54, SQ_CM = 0x55, SQ_METERS = 0x56, + RADIANS_SEC = 0x57, + BPM = 0x58, + METERS_SEC_SQUARED = 0x59, + METERS_SEC = 0x5A, + CUBIC_METERS_SEC = 0x5B, + MM_MERCURY = 0x5C, + RADIANS_SEC_SQUARED = 0x5D, + OEM_UNIT = 0xFF }; /** -- cgit From c7b74967799b1af52b3045d69d4c26836b2d41de Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 20 Nov 2020 11:04:44 +0100 Subject: can: replace can_dlc as variable/element for payload length The naming of can_dlc as an element of struct can_frame and also as a variable name is misleading, as it claims to be a 'data length CODE' while in reality it has always been a plain data length. With the introduction of a new 'len' element in struct can_frame we can now retire the can_dlc name and make clear which of the former uses was a plain length (-> 'len') and which was a data length code (-> 'dlc') value.
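To make the distinction concrete, the following minimal user-space sketch (illustrative only, not part of the patch; demo_dlc2len() is a made-up name) mirrors the dlc2len[] table used by can_dlc2len() in the drivers/net/can/dev.c hunk further below. For Classical CAN a DLC of 0..8 equals the payload length in bytes, but CAN FD reuses the DLC values 9..15 to encode payload lengths up to 64 bytes, which is why a raw DLC must never be used directly as a byte count:

#include <stdio.h>

/* Mirrors the kernel's dlc2len[] lookup table in drivers/net/can/dev.c. */
static const unsigned char dlc2len[] = {0, 1, 2, 3, 4, 5, 6, 7,
					8, 12, 16, 20, 24, 32, 48, 64};

/* A DLC is a 4-bit code; only for the values 0..8 does it equal the
 * payload length in bytes.
 */
static unsigned char demo_dlc2len(unsigned char dlc)
{
	return dlc2len[dlc & 0x0F];
}

int main(void)
{
	printf("dlc  8 -> len %u\n", (unsigned int)demo_dlc2len(8));	/* 8 bytes */
	printf("dlc 15 -> len %u\n", (unsigned int)demo_dlc2len(15));	/* 64 bytes */
	return 0;
}

Keeping 'len' strictly for byte counts and 'dlc' for raw code values makes such mixups visible at a glance in the variable name.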
Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20201120100444.3199-1-socketcan@hartkopp.net [mkl: gs_usb: keep struct gs_host_frame::can_dlc as is] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/at91_can.c | 14 +++++----- drivers/net/can/c_can/c_can.c | 20 +++++++------- drivers/net/can/cc770/cc770.c | 14 +++++----- drivers/net/can/dev.c | 10 +++---- drivers/net/can/grcan.c | 10 +++---- drivers/net/can/ifi_canfd/ifi_canfd.c | 4 +-- drivers/net/can/janz-ican3.c | 20 +++++++------- drivers/net/can/kvaser_pciefd.c | 4 +-- drivers/net/can/m_can/m_can.c | 4 +-- drivers/net/can/mscan/mscan.c | 20 +++++++------- drivers/net/can/pch_can.c | 12 ++++----- drivers/net/can/peak_canfd/peak_canfd.c | 12 ++++----- drivers/net/can/rcar/rcar_can.c | 14 +++++----- drivers/net/can/rcar/rcar_canfd.c | 4 +-- drivers/net/can/rx-offload.c | 2 +- drivers/net/can/sja1000/sja1000.c | 10 +++---- drivers/net/can/slcan.c | 32 +++++++++++------------ drivers/net/can/softing/softing_fw.c | 2 +- drivers/net/can/softing/softing_main.c | 14 +++++----- drivers/net/can/spi/hi311x.c | 20 +++++++------- drivers/net/can/spi/mcp251x.c | 18 ++++++------- drivers/net/can/sun4i_can.c | 10 +++---- drivers/net/can/ti_hecc.c | 8 +++--- drivers/net/can/usb/ems_usb.c | 16 ++++++------ drivers/net/can/usb/esd_usb2.c | 16 ++++++------ drivers/net/can/usb/gs_usb.c | 8 +++--- drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 2 +- drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c | 16 ++++++------ drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c | 22 ++++++++-------- drivers/net/can/usb/mcba_usb.c | 10 +++---- drivers/net/can/usb/peak_usb/pcan_usb.c | 14 +++++----- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 10 +++---- drivers/net/can/usb/peak_usb/pcan_usb_pro.c | 14 +++++----- drivers/net/can/usb/ucan.c | 14 +++++----- drivers/net/can/usb/usb_8dev.c | 14 +++++----- drivers/net/can/xilinx_can.c | 10 +++---- include/linux/can/dev.h | 4 +-- net/can/af_can.c | 2 +- net/can/gw.c | 2 +- net/can/j1939/main.c | 4 +-- 40 files changed, 228 insertions(+), 228 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index db06254f8eb7..5284f0ab3b06 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -468,7 +468,7 @@ static netdev_tx_t at91_start_xmit(struct sk_buff *skb, struct net_device *dev) } reg_mid = at91_can_id_to_reg_mid(cf->can_id); reg_mcr = ((cf->can_id & CAN_RTR_FLAG) ? AT91_MCR_MRTR : 0) | - (cf->can_dlc << 16) | AT91_MCR_MTCR; + (cf->len << 16) | AT91_MCR_MTCR; /* disable MB while writing ID (see datasheet) */ set_mb_mode(priv, mb, AT91_MB_MODE_DISABLED); @@ -481,7 +481,7 @@ static netdev_tx_t at91_start_xmit(struct sk_buff *skb, struct net_device *dev) /* This triggers transmission */ at91_write(priv, AT91_MCR(mb), reg_mcr); - stats->tx_bytes += cf->can_dlc; + stats->tx_bytes += cf->len; /* _NOTE_: subtract AT91_MB_TX_FIRST offset from mb! 
*/ can_put_echo_skb(skb, dev, mb - get_mb_tx_first(priv)); @@ -554,7 +554,7 @@ static void at91_rx_overflow_err(struct net_device *dev) cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); } @@ -580,7 +580,7 @@ static void at91_read_mb(struct net_device *dev, unsigned int mb, cf->can_id = (reg_mid >> 18) & CAN_SFF_MASK; reg_msr = at91_read(priv, AT91_MSR(mb)); - cf->can_dlc = can_cc_dlc2len((reg_msr >> 16) & 0xf); + cf->len = can_cc_dlc2len((reg_msr >> 16) & 0xf); if (reg_msr & AT91_MSR_MRTR) cf->can_id |= CAN_RTR_FLAG; @@ -619,7 +619,7 @@ static void at91_read_msg(struct net_device *dev, unsigned int mb) at91_read_mb(dev, mb, cf); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); can_led_event(dev, CAN_LED_EVENT_RX); @@ -780,7 +780,7 @@ static int at91_poll_err(struct net_device *dev, int quota, u32 reg_sr) at91_poll_err_frame(dev, cf, reg_sr); dev->stats.rx_packets++; - dev->stats.rx_bytes += cf->can_dlc; + dev->stats.rx_bytes += cf->len; netif_receive_skb(skb); return 1; @@ -1047,7 +1047,7 @@ static void at91_irq_err(struct net_device *dev) at91_irq_err_state(dev, cf, new_state); dev->stats.rx_packets++; - dev->stats.rx_bytes += cf->can_dlc; + dev->stats.rx_bytes += cf->len; netif_rx(skb); priv->can.state = new_state; diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c index 56cc705959ea..0420f09f2b70 100644 --- a/drivers/net/can/c_can/c_can.c +++ b/drivers/net/can/c_can/c_can.c @@ -306,7 +306,7 @@ static void c_can_setup_tx_object(struct net_device *dev, int iface, struct can_frame *frame, int idx) { struct c_can_priv *priv = netdev_priv(dev); - u16 ctrl = IF_MCONT_TX | frame->can_dlc; + u16 ctrl = IF_MCONT_TX | frame->len; bool rtr = frame->can_id & CAN_RTR_FLAG; u32 arb = IF_ARB_MSGVAL; int i; @@ -339,7 +339,7 @@ static void c_can_setup_tx_object(struct net_device *dev, int iface, if (priv->type == BOSCH_D_CAN) { u32 data = 0, dreg = C_CAN_IFACE(DATA1_REG, iface); - for (i = 0; i < frame->can_dlc; i += 4, dreg += 2) { + for (i = 0; i < frame->len; i += 4, dreg += 2) { data = (u32)frame->data[i]; data |= (u32)frame->data[i + 1] << 8; data |= (u32)frame->data[i + 2] << 16; @@ -347,7 +347,7 @@ static void c_can_setup_tx_object(struct net_device *dev, int iface, priv->write_reg32(priv, dreg, data); } } else { - for (i = 0; i < frame->can_dlc; i += 2) { + for (i = 0; i < frame->len; i += 2) { priv->write_reg(priv, C_CAN_IFACE(DATA1_REG, iface) + i / 2, frame->data[i] | @@ -397,7 +397,7 @@ static int c_can_read_msg_object(struct net_device *dev, int iface, u32 ctrl) return -ENOMEM; } - frame->can_dlc = can_cc_dlc2len(ctrl & 0x0F); + frame->len = can_cc_dlc2len(ctrl & 0x0F); arb = priv->read_reg32(priv, C_CAN_IFACE(ARB1_REG, iface)); @@ -412,7 +412,7 @@ static int c_can_read_msg_object(struct net_device *dev, int iface, u32 ctrl) int i, dreg = C_CAN_IFACE(DATA1_REG, iface); if (priv->type == BOSCH_D_CAN) { - for (i = 0; i < frame->can_dlc; i += 4, dreg += 2) { + for (i = 0; i < frame->len; i += 4, dreg += 2) { data = priv->read_reg32(priv, dreg); frame->data[i] = data; frame->data[i + 1] = data >> 8; @@ -420,7 +420,7 @@ static int c_can_read_msg_object(struct net_device *dev, int iface, u32 ctrl) frame->data[i + 3] = data >> 24; } } else { - for (i = 0; i < frame->can_dlc; i += 2, dreg++) { + for (i = 0; i < frame->len; i += 2, dreg++) { data = priv->read_reg(priv, dreg); frame->data[i] = data; frame->data[i + 1] = 
data >> 8; @@ -429,7 +429,7 @@ static int c_can_read_msg_object(struct net_device *dev, int iface, u32 ctrl) } stats->rx_packets++; - stats->rx_bytes += frame->can_dlc; + stats->rx_bytes += frame->len; netif_receive_skb(skb); return 0; @@ -475,7 +475,7 @@ static netdev_tx_t c_can_start_xmit(struct sk_buff *skb, * transmit as we might race against do_tx(). */ c_can_setup_tx_object(dev, IF_TX, frame, idx); - priv->dlc[idx] = frame->can_dlc; + priv->dlc[idx] = frame->len; can_put_echo_skb(skb, dev, idx); /* Update the active bits */ @@ -977,7 +977,7 @@ static int c_can_handle_state_change(struct net_device *dev, } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); return 1; @@ -1047,7 +1047,7 @@ static int c_can_handle_bus_err(struct net_device *dev, } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); return 1; } diff --git a/drivers/net/can/cc770/cc770.c b/drivers/net/can/cc770/cc770.c index 3fd2a276dd93..8d9f332c35e0 100644 --- a/drivers/net/can/cc770/cc770.c +++ b/drivers/net/can/cc770/cc770.c @@ -390,7 +390,7 @@ static void cc770_tx(struct net_device *dev, int mo) u32 id; int i; - dlc = cf->can_dlc; + dlc = cf->len; id = cf->can_id; rtr = cf->can_id & CAN_RTR_FLAG ? 0 : MSGCFG_DIR; @@ -470,7 +470,7 @@ static void cc770_rx(struct net_device *dev, unsigned int mo, u8 ctrl1) cf->can_id = CAN_RTR_FLAG; if (config & MSGCFG_XTD) cf->can_id |= CAN_EFF_FLAG; - cf->can_dlc = 0; + cf->len = 0; } else { if (config & MSGCFG_XTD) { id = cc770_read_reg(priv, msgobj[mo].id[3]); @@ -486,13 +486,13 @@ static void cc770_rx(struct net_device *dev, unsigned int mo, u8 ctrl1) } cf->can_id = id; - cf->can_dlc = can_cc_dlc2len((config & 0xf0) >> 4); - for (i = 0; i < cf->can_dlc; i++) + cf->len = can_cc_dlc2len((config & 0xf0) >> 4); + for (i = 0; i < cf->len; i++) cf->data[i] = cc770_read_reg(priv, msgobj[mo].data[i]); } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } @@ -572,7 +572,7 @@ static int cc770_err(struct net_device *dev, u8 status) stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); return 0; @@ -699,7 +699,7 @@ static void cc770_tx_interrupt(struct net_device *dev, unsigned int o) } cf = (struct can_frame *)priv->tx_skb->data; - stats->tx_bytes += cf->can_dlc; + stats->tx_bytes += cf->len; stats->tx_packets++; can_put_echo_skb(priv->tx_skb, dev, 0); diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 81e39d7507d8..806e8b646b12 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -30,10 +30,10 @@ MODULE_AUTHOR("Wolfgang Grandegger "); static const u8 dlc2len[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 16, 20, 24, 32, 48, 64}; -/* get data length from can_dlc with sanitized can_dlc */ -u8 can_dlc2len(u8 can_dlc) +/* get data length from raw data length code (DLC) */ +u8 can_dlc2len(u8 dlc) { - return dlc2len[can_dlc & 0x0F]; + return dlc2len[dlc & 0x0F]; } EXPORT_SYMBOL_GPL(can_dlc2len); @@ -595,7 +595,7 @@ static void can_restart(struct net_device *dev) netif_rx_ni(skb); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; restart: netdev_dbg(dev, "restarted\n"); @@ -737,7 +737,7 @@ struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf) return NULL; (*cf)->can_id = CAN_ERR_FLAG; - (*cf)->can_dlc = CAN_ERR_DLC; + (*cf)->len = CAN_ERR_DLC; return skb; } diff --git a/drivers/net/can/grcan.c 
b/drivers/net/can/grcan.c index c71c9b8683d5..f5d94a692576 100644 --- a/drivers/net/can/grcan.c +++ b/drivers/net/can/grcan.c @@ -1201,12 +1201,12 @@ static int grcan_receive(struct net_device *dev, int budget) cf->can_id = ((slot[0] & GRCAN_MSG_BID) >> GRCAN_MSG_BID_BIT); } - cf->can_dlc = can_cc_dlc2len((slot[1] & GRCAN_MSG_DLC) + cf->len = can_cc_dlc2len((slot[1] & GRCAN_MSG_DLC) >> GRCAN_MSG_DLC_BIT); if (rtr) { cf->can_id |= CAN_RTR_FLAG; } else { - for (i = 0; i < cf->can_dlc; i++) { + for (i = 0; i < cf->len; i++) { j = GRCAN_MSG_DATA_SLOT_INDEX(i); shift = GRCAN_MSG_DATA_SHIFT(i); cf->data[i] = (u8)(slot[j] >> shift); @@ -1215,7 +1215,7 @@ static int grcan_receive(struct net_device *dev, int budget) /* Update statistics and read pointer */ stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); rd = grcan_ring_add(rd, GRCAN_MSG_SIZE, dma->rx.size); @@ -1399,7 +1399,7 @@ static netdev_tx_t grcan_start_xmit(struct sk_buff *skb, eff = cf->can_id & CAN_EFF_FLAG; rtr = cf->can_id & CAN_RTR_FLAG; id = cf->can_id & (eff ? CAN_EFF_MASK : CAN_SFF_MASK); - dlc = cf->can_dlc; + dlc = cf->len; if (eff) tmp = (id << GRCAN_MSG_EID_BIT) & GRCAN_MSG_EID; else @@ -1447,7 +1447,7 @@ static netdev_tx_t grcan_start_xmit(struct sk_buff *skb, * can_put_echo_skb would be an error unless other measures are * taken. */ - priv->txdlc[slotindex] = cf->can_dlc; /* Store dlc for statistics */ + priv->txdlc[slotindex] = cf->len; /* Store dlc for statistics */ can_put_echo_skb(skb, dev, slotindex); /* Make sure everything is written before allowing hardware to diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c index cc790354a8ee..3df55b0e4ef3 100644 --- a/drivers/net/can/ifi_canfd/ifi_canfd.c +++ b/drivers/net/can/ifi_canfd/ifi_canfd.c @@ -431,7 +431,7 @@ static int ifi_canfd_handle_lec_err(struct net_device *ndev) writel(IFI_CANFD_ERROR_CTR_ER_ENABLE, priv->base + IFI_CANFD_ERROR_CTR); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); return 1; @@ -523,7 +523,7 @@ static int ifi_canfd_handle_state_change(struct net_device *ndev, } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); return 1; diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c index 6a21af05ba27..2a6c918186c0 100644 --- a/drivers/net/can/janz-ican3.c +++ b/drivers/net/can/janz-ican3.c @@ -916,10 +916,10 @@ static void ican3_to_can_frame(struct ican3_dev *mod, cf->can_id |= desc->data[0] << 3; cf->can_id |= (desc->data[1] & 0xe0) >> 5; - cf->can_dlc = can_cc_dlc2len(desc->data[1] & ICAN3_CAN_DLC_MASK); - memcpy(cf->data, &desc->data[2], cf->can_dlc); + cf->len = can_cc_dlc2len(desc->data[1] & ICAN3_CAN_DLC_MASK); + memcpy(cf->data, &desc->data[2], cf->len); } else { - cf->can_dlc = can_cc_dlc2len(desc->data[0] & ICAN3_CAN_DLC_MASK); + cf->len = can_cc_dlc2len(desc->data[0] & ICAN3_CAN_DLC_MASK); if (desc->data[0] & ICAN3_EFF_RTR) cf->can_id |= CAN_RTR_FLAG; @@ -934,7 +934,7 @@ static void ican3_to_can_frame(struct ican3_dev *mod, cf->can_id |= desc->data[3] >> 5; /* 2-0 */ } - memcpy(cf->data, &desc->data[6], cf->can_dlc); + memcpy(cf->data, &desc->data[6], cf->len); } } @@ -947,7 +947,7 @@ static void can_frame_to_ican3(struct ican3_dev *mod, /* we always use the extended format, with the ECHO flag set */ desc->command = ICAN3_CAN_TYPE_EFF; - desc->data[0] |= cf->can_dlc; + desc->data[0] |= cf->len; desc->data[1] |= 
ICAN3_ECHO; /* support single transmission (no retries) mode */ @@ -970,7 +970,7 @@ static void can_frame_to_ican3(struct ican3_dev *mod, } /* copy the data bits into the descriptor */ - memcpy(&desc->data[6], cf->data, cf->can_dlc); + memcpy(&desc->data[6], cf->data, cf->len); } /* @@ -1294,7 +1294,7 @@ static unsigned int ican3_get_echo_skb(struct ican3_dev *mod) } cf = (struct can_frame *)skb->data; - dlc = cf->can_dlc; + dlc = cf->len; /* check flag whether this packet has to be looped back */ if (skb->pkt_type != PACKET_LOOPBACK) { @@ -1332,10 +1332,10 @@ static bool ican3_echo_skb_matches(struct ican3_dev *mod, struct sk_buff *skb) if (cf->can_id != echo_cf->can_id) return false; - if (cf->can_dlc != echo_cf->can_dlc) + if (cf->len != echo_cf->len) return false; - return memcmp(cf->data, echo_cf->data, cf->can_dlc) == 0; + return memcmp(cf->data, echo_cf->data, cf->len) == 0; } /* @@ -1421,7 +1421,7 @@ static int ican3_recv_skb(struct ican3_dev *mod) /* update statistics, receive the skb */ stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); err_noalloc: diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c index 72acd1ba162d..de268a344fcf 100644 --- a/drivers/net/can/kvaser_pciefd.c +++ b/drivers/net/can/kvaser_pciefd.c @@ -1299,7 +1299,7 @@ static int kvaser_pciefd_rx_error_frame(struct kvaser_pciefd_can *can, cf->data[7] = bec.rxerr; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); return 0; @@ -1498,7 +1498,7 @@ static void kvaser_pciefd_handle_nack_packet(struct kvaser_pciefd_can *can, if (skb) { cf->can_id |= CAN_ERR_BUSERROR; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; stats->rx_packets++; netif_rx(skb); } else { diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 2e46a5916656..b7df90ceee4b 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -596,7 +596,7 @@ static int m_can_handle_lec_err(struct net_device *dev, } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); return 1; @@ -723,7 +723,7 @@ static int m_can_handle_state_change(struct net_device *dev, } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); return 1; diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index 95bf7338b358..5ed00a1558e1 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -250,16 +250,16 @@ static netdev_tx_t mscan_start_xmit(struct sk_buff *skb, struct net_device *dev) void __iomem *data = ®s->tx.dsr1_0; u16 *payload = (u16 *)frame->data; - for (i = 0; i < frame->can_dlc / 2; i++) { + for (i = 0; i < frame->len / 2; i++) { out_be16(data, *payload++); data += 2 + _MSCAN_RESERVED_DSR_SIZE; } /* write remaining byte if necessary */ - if (frame->can_dlc & 1) - out_8(data, frame->data[frame->can_dlc - 1]); + if (frame->len & 1) + out_8(data, frame->data[frame->len - 1]); } - out_8(®s->tx.dlr, frame->can_dlc); + out_8(®s->tx.dlr, frame->len); out_8(®s->tx.tbpr, priv->cur_pri); /* Start transmission. 
*/ @@ -312,19 +312,19 @@ static void mscan_get_rx_frame(struct net_device *dev, struct can_frame *frame) if (can_id & 1) frame->can_id |= CAN_RTR_FLAG; - frame->can_dlc = can_cc_dlc2len(in_8(®s->rx.dlr) & 0xf); + frame->len = can_cc_dlc2len(in_8(®s->rx.dlr) & 0xf); if (!(frame->can_id & CAN_RTR_FLAG)) { void __iomem *data = ®s->rx.dsr1_0; u16 *payload = (u16 *)frame->data; - for (i = 0; i < frame->can_dlc / 2; i++) { + for (i = 0; i < frame->len / 2; i++) { *payload++ = in_be16(data); data += 2 + _MSCAN_RESERVED_DSR_SIZE; } /* read remaining byte if necessary */ - if (frame->can_dlc & 1) - frame->data[frame->can_dlc - 1] = in_8(data); + if (frame->len & 1) + frame->data[frame->len - 1] = in_8(data); } out_8(®s->canrflg, MSCAN_RXF); @@ -372,7 +372,7 @@ static void mscan_get_err_frame(struct net_device *dev, struct can_frame *frame, } } priv->shadow_statflg = canrflg & MSCAN_STAT_MSK; - frame->can_dlc = CAN_ERR_DLC; + frame->len = CAN_ERR_DLC; out_8(®s->canrflg, MSCAN_ERR_IF); } @@ -407,7 +407,7 @@ static int mscan_rx_poll(struct napi_struct *napi, int quota) mscan_get_err_frame(dev, frame, canrflg); stats->rx_packets++; - stats->rx_bytes += frame->can_dlc; + stats->rx_bytes += frame->len; work_done++; netif_receive_skb(skb); } diff --git a/drivers/net/can/pch_can.c b/drivers/net/can/pch_can.c index 9509bac8352d..4f9e7ec192aa 100644 --- a/drivers/net/can/pch_can.c +++ b/drivers/net/can/pch_can.c @@ -563,7 +563,7 @@ static void pch_can_error(struct net_device *ndev, u32 status) netif_receive_skb(skb); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; } static irqreturn_t pch_can_interrupt(int irq, void *dev_id) @@ -683,10 +683,10 @@ static int pch_can_rx_normal(struct net_device *ndev, u32 obj_num, int quota) if (id2 & PCH_ID2_DIR) cf->can_id |= CAN_RTR_FLAG; - cf->can_dlc = can_cc_dlc2len((ioread32(&priv->regs-> + cf->len = can_cc_dlc2len((ioread32(&priv->regs-> ifregs[0].mcont)) & 0xF); - for (i = 0; i < cf->can_dlc; i += 2) { + for (i = 0; i < cf->len; i += 2) { data_reg = ioread16(&priv->regs->ifregs[0].data[i / 2]); cf->data[i] = data_reg; cf->data[i + 1] = data_reg >> 8; @@ -696,7 +696,7 @@ static int pch_can_rx_normal(struct net_device *ndev, u32 obj_num, int quota) rcv_pkts++; stats->rx_packets++; quota--; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; pch_fifo_thresh(priv, obj_num); obj_num++; @@ -919,7 +919,7 @@ static netdev_tx_t pch_xmit(struct sk_buff *skb, struct net_device *ndev) iowrite32(id2, &priv->regs->ifregs[1].id2); /* Copy data to register */ - for (i = 0; i < cf->can_dlc; i += 2) { + for (i = 0; i < cf->len; i += 2) { iowrite16(cf->data[i] | (cf->data[i + 1] << 8), &priv->regs->ifregs[1].data[i / 2]); } @@ -927,7 +927,7 @@ static netdev_tx_t pch_xmit(struct sk_buff *skb, struct net_device *ndev) can_put_echo_skb(skb, ndev, tx_obj_no - PCH_RX_OBJ_END - 1); /* Set the size of the data. 
Update if2_mcont */ - iowrite32(cf->can_dlc | PCH_IF_MCONT_NEWDAT | PCH_IF_MCONT_TXRQXT | + iowrite32(cf->len | PCH_IF_MCONT_NEWDAT | PCH_IF_MCONT_TXRQXT | PCH_IF_MCONT_TXIE, &priv->regs->ifregs[1].mcont); pch_can_rw_msg_obj(&priv->regs->ifregs[1].creq, tx_obj_no); diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index c6077e07214e..fff3a35276aa 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -410,7 +410,7 @@ static int pucan_handle_status(struct peak_canfd_priv *priv, } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; pucan_netif_rx(skb, msg->ts_low, msg->ts_high); return 0; @@ -438,7 +438,7 @@ static int pucan_handle_cache_critical(struct peak_canfd_priv *priv) cf->data[6] = priv->bec.txerr; cf->data[7] = priv->bec.rxerr; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; stats->rx_packets++; netif_rx(skb); @@ -652,7 +652,7 @@ static netdev_tx_t peak_canfd_start_xmit(struct sk_buff *skb, unsigned long flags; bool should_stop_tx_queue; int room_left; - u8 can_dlc; + u8 len; if (can_dropped_invalid_skb(ndev, skb)) return NETDEV_TX_OK; @@ -682,7 +682,7 @@ static netdev_tx_t peak_canfd_start_xmit(struct sk_buff *skb, if (can_is_canfd_skb(skb)) { /* CAN FD frame format */ - can_dlc = can_len2dlc(cf->len); + len = can_len2dlc(cf->len); msg_flags |= PUCAN_MSG_EXT_DATA_LEN; @@ -693,7 +693,7 @@ static netdev_tx_t peak_canfd_start_xmit(struct sk_buff *skb, msg_flags |= PUCAN_MSG_ERROR_STATE_IND; } else { /* CAN 2.0 frame format */ - can_dlc = cf->len; + len = cf->len; if (cf->can_id & CAN_RTR_FLAG) msg_flags |= PUCAN_MSG_RTR; @@ -707,7 +707,7 @@ static netdev_tx_t peak_canfd_start_xmit(struct sk_buff *skb, msg_flags |= PUCAN_MSG_SELF_RECEIVE; msg->flags = cpu_to_le16(msg_flags); - msg->channel_dlc = PUCAN_MSG_CHANNEL_DLC(priv->index, can_dlc); + msg->channel_dlc = PUCAN_MSG_CHANNEL_DLC(priv->index, len); memcpy(msg->d, cf->data, cf->len); /* struct msg client field is used as an index in the echo skbs ring */ diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index 711ef4996b48..c803327f8f79 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -364,7 +364,7 @@ static void rcar_can_error(struct net_device *ndev) if (skb) { stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } } @@ -607,16 +607,16 @@ static netdev_tx_t rcar_can_start_xmit(struct sk_buff *skb, if (cf->can_id & CAN_RTR_FLAG) { /* Remote transmission request */ data |= RCAR_CAN_RTR; } else { - for (i = 0; i < cf->can_dlc; i++) + for (i = 0; i < cf->len; i++) writeb(cf->data[i], &priv->regs->mb[RCAR_CAN_TX_FIFO_MBX].data[i]); } writel(data, &priv->regs->mb[RCAR_CAN_TX_FIFO_MBX].id); - writeb(cf->can_dlc, &priv->regs->mb[RCAR_CAN_TX_FIFO_MBX].dlc); + writeb(cf->len, &priv->regs->mb[RCAR_CAN_TX_FIFO_MBX].dlc); - priv->tx_dlc[priv->tx_head % RCAR_CAN_FIFO_DEPTH] = cf->can_dlc; + priv->tx_dlc[priv->tx_head % RCAR_CAN_FIFO_DEPTH] = cf->len; can_put_echo_skb(skb, ndev, priv->tx_head % RCAR_CAN_FIFO_DEPTH); priv->tx_head++; /* Start Tx: write 0xff to the TFPCR register to increment @@ -659,18 +659,18 @@ static void rcar_can_rx_pkt(struct rcar_can_priv *priv) cf->can_id = (data >> RCAR_CAN_SID_SHIFT) & CAN_SFF_MASK; dlc = readb(&priv->regs->mb[RCAR_CAN_RX_FIFO_MBX].dlc); - cf->can_dlc = can_cc_dlc2len(dlc); + cf->len = can_cc_dlc2len(dlc); if (data & RCAR_CAN_RTR) { cf->can_id |= 
CAN_RTR_FLAG; } else { - for (dlc = 0; dlc < cf->can_dlc; dlc++) + for (dlc = 0; dlc < cf->len; dlc++) cf->data[dlc] = readb(&priv->regs->mb[RCAR_CAN_RX_FIFO_MBX].data[dlc]); } can_led_event(priv->ndev, CAN_LED_EVENT_RX); - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; stats->rx_packets++; netif_receive_skb(skb); } diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index 899a3218ce5e..86c6cbdb7e53 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1025,7 +1025,7 @@ static void rcar_canfd_error(struct net_device *ndev, u32 cerfl, rcar_canfd_write(priv->base, RCANFD_CERFL(ch), RCANFD_CERFL_ERR(~cerfl)); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } @@ -1134,7 +1134,7 @@ static void rcar_canfd_state_change(struct net_device *ndev, can_change_state(ndev, cf, tx_state, rx_state); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } } diff --git a/drivers/net/can/rx-offload.c b/drivers/net/can/rx-offload.c index 6e95193b215b..450c5cfcb3fc 100644 --- a/drivers/net/can/rx-offload.c +++ b/drivers/net/can/rx-offload.c @@ -55,7 +55,7 @@ static int can_rx_offload_napi_poll(struct napi_struct *napi, int quota) work_done++; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_receive_skb(skb); } diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index 1f188f2d126e..d55394aa0b95 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -295,7 +295,7 @@ static netdev_tx_t sja1000_start_xmit(struct sk_buff *skb, netif_stop_queue(dev); - fi = dlc = cf->can_dlc; + fi = dlc = cf->len; id = cf->can_id; if (id & CAN_RTR_FLAG) @@ -367,11 +367,11 @@ static void sja1000_rx(struct net_device *dev) | (priv->read_reg(priv, SJA1000_ID2) >> 5); } - cf->can_dlc = can_cc_dlc2len(fi & 0x0F); + cf->len = can_cc_dlc2len(fi & 0x0F); if (fi & SJA1000_FI_RTR) { id |= CAN_RTR_FLAG; } else { - for (i = 0; i < cf->can_dlc; i++) + for (i = 0; i < cf->len; i++) cf->data[i] = priv->read_reg(priv, dreg++); } @@ -381,7 +381,7 @@ static void sja1000_rx(struct net_device *dev) sja1000_write_cmdreg(priv, CMD_RRB); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); can_led_event(dev, CAN_LED_EVENT_RX); @@ -490,7 +490,7 @@ static int sja1000_err(struct net_device *dev, uint8_t isrc, uint8_t status) } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); return 0; diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index b4a39f0449ba..a1bd1be09548 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -106,8 +106,8 @@ static struct net_device **slcan_devs; /* * A CAN frame has a can_id (11 bit standard frame format OR 29 bit extended - * frame format) a data length code (can_dlc) which can be from 0 to 8 - * and up to <can_dlc> data bytes as payload. + * frame format) a data length code (len) which can be from 0 to 8 + * and up to <len> data bytes as payload. * Additionally a CAN frame may become a remote transmission frame if the * RTR-bit is set. This causes another ECU to send a CAN frame with the * given can_id. 
@@ -128,10 +128,10 @@ static struct net_device **slcan_devs; * * Examples: * - * t1230 : can_id 0x123, can_dlc 0, no data - * t4563112233 : can_id 0x456, can_dlc 3, data 0x11 0x22 0x33 - * T12ABCDEF2AA55 : extended can_id 0x12ABCDEF, can_dlc 2, data 0xAA 0x55 - * r1230 : can_id 0x123, can_dlc 0, no data, remote transmission request + * t1230 : can_id 0x123, len 0, no data + * t4563112233 : can_id 0x456, len 3, data 0x11 0x22 0x33 + * T12ABCDEF2AA55 : extended can_id 0x12ABCDEF, len 2, data 0xAA 0x55 + * r1230 : can_id 0x123, len 0, no data, remote transmission request * */ @@ -156,7 +156,7 @@ static void slc_bump(struct slcan *sl) fallthrough; case 't': /* store dlc ASCII value and terminate SFF CAN ID string */ - cf.can_dlc = sl->rbuff[SLC_CMD_LEN + SLC_SFF_ID_LEN]; + cf.len = sl->rbuff[SLC_CMD_LEN + SLC_SFF_ID_LEN]; sl->rbuff[SLC_CMD_LEN + SLC_SFF_ID_LEN] = 0; /* point to payload data behind the dlc */ cmd += SLC_CMD_LEN + SLC_SFF_ID_LEN + 1; @@ -167,7 +167,7 @@ static void slc_bump(struct slcan *sl) case 'T': cf.can_id |= CAN_EFF_FLAG; /* store dlc ASCII value and terminate EFF CAN ID string */ - cf.can_dlc = sl->rbuff[SLC_CMD_LEN + SLC_EFF_ID_LEN]; + cf.len = sl->rbuff[SLC_CMD_LEN + SLC_EFF_ID_LEN]; sl->rbuff[SLC_CMD_LEN + SLC_EFF_ID_LEN] = 0; /* point to payload data behind the dlc */ cmd += SLC_CMD_LEN + SLC_EFF_ID_LEN + 1; @@ -181,15 +181,15 @@ static void slc_bump(struct slcan *sl) cf.can_id |= tmpid; - /* get can_dlc from sanitized ASCII value */ - if (cf.can_dlc >= '0' && cf.can_dlc < '9') - cf.can_dlc -= '0'; + /* get len from sanitized ASCII value */ + if (cf.len >= '0' && cf.len < '9') + cf.len -= '0'; else return; /* RTR frames may have a dlc > 0 but they never have any data bytes */ if (!(cf.can_id & CAN_RTR_FLAG)) { - for (i = 0; i < cf.can_dlc; i++) { + for (i = 0; i < cf.len; i++) { tmp = hex_to_bin(*cmd++); if (tmp < 0) return; @@ -218,7 +218,7 @@ static void slc_bump(struct slcan *sl) skb_put_data(skb, &cf, sizeof(struct can_frame)); sl->dev->stats.rx_packets++; - sl->dev->stats.rx_bytes += cf.can_dlc; + sl->dev->stats.rx_bytes += cf.len; netif_rx_ni(skb); } @@ -282,11 +282,11 @@ static void slc_encaps(struct slcan *sl, struct can_frame *cf) pos += (cf->can_id & CAN_EFF_FLAG) ? SLC_EFF_ID_LEN : SLC_SFF_ID_LEN; - *pos++ = cf->can_dlc + '0'; + *pos++ = cf->len + '0'; /* RTR frames may have a dlc > 0 but they never have any data bytes */ if (!(cf->can_id & CAN_RTR_FLAG)) { - for (i = 0; i < cf->can_dlc; i++) + for (i = 0; i < cf->len; i++) pos = hex_byte_pack_upper(pos, cf->data[i]); } @@ -304,7 +304,7 @@ static void slc_encaps(struct slcan *sl, struct can_frame *cf) actual = sl->tty->ops->write(sl->tty, sl->xbuff, pos - sl->xbuff); sl->xleft = (pos - sl->xbuff) - actual; sl->xhead = sl->xbuff + actual; - sl->dev->stats.tx_bytes += cf->can_dlc; + sl->dev->stats.tx_bytes += cf->len; } /* Write out any remaining transmit buffer. 
Scheduled when tty is writable */ diff --git a/drivers/net/can/softing/softing_fw.c b/drivers/net/can/softing/softing_fw.c index ccd649a8e37b..7e1536877993 100644 --- a/drivers/net/can/softing/softing_fw.c +++ b/drivers/net/can/softing/softing_fw.c @@ -624,7 +624,7 @@ int softing_startstop(struct net_device *dev, int up) */ memset(&msg, 0, sizeof(msg)); msg.can_id = CAN_ERR_FLAG | CAN_ERR_RESTARTED; - msg.can_dlc = CAN_ERR_DLC; + msg.len = CAN_ERR_DLC; for (j = 0; j < ARRAY_SIZE(card->net); ++j) { if (!(bus_bitmask_start & (1 << j))) continue; diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c index 39e8275ee7ba..03a68bb486fd 100644 --- a/drivers/net/can/softing/softing_main.c +++ b/drivers/net/can/softing/softing_main.c @@ -84,7 +84,7 @@ static netdev_tx_t softing_netdev_start_xmit(struct sk_buff *skb, if (priv->index) *ptr |= CMD_BUS2; ++ptr; - *ptr++ = cf->can_dlc; + *ptr++ = cf->len; *ptr++ = (cf->can_id >> 0); *ptr++ = (cf->can_id >> 8); if (cf->can_id & CAN_EFF_FLAG) { @@ -95,7 +95,7 @@ static netdev_tx_t softing_netdev_start_xmit(struct sk_buff *skb, ptr += 1; } if (!(cf->can_id & CAN_RTR_FLAG)) - memcpy(ptr, &cf->data[0], cf->can_dlc); + memcpy(ptr, &cf->data[0], cf->len); memcpy_toio(&card->dpram[DPRAM_TX + DPRAM_TX_SIZE * fifo_wr], buf, DPRAM_TX_SIZE); if (++fifo_wr >= DPRAM_TX_CNT) @@ -167,7 +167,7 @@ static int softing_handle_1(struct softing *card) iowrite8(0, &card->dpram[DPRAM_RX_LOST]); /* prepare msg */ msg.can_id = CAN_ERR_FLAG | CAN_ERR_CRTL; - msg.can_dlc = CAN_ERR_DLC; + msg.len = CAN_ERR_DLC; msg.data[1] = CAN_ERR_CRTL_RX_OVERFLOW; /* * service to all buses, we don't know which it was applicable @@ -218,7 +218,7 @@ static int softing_handle_1(struct softing *card) state = *ptr++; msg.can_id = CAN_ERR_FLAG; - msg.can_dlc = CAN_ERR_DLC; + msg.len = CAN_ERR_DLC; if (state & SF_MASK_BUSOFF) { can_state = CAN_STATE_BUS_OFF; @@ -261,7 +261,7 @@ static int softing_handle_1(struct softing *card) } else { if (cmd & CMD_RTR) msg.can_id |= CAN_RTR_FLAG; - msg.can_dlc = can_cc_dlc2len(*ptr++); + msg.len = can_cc_dlc2len(*ptr++); if (cmd & CMD_XTD) { msg.can_id |= CAN_EFF_FLAG; msg.can_id |= le32_to_cpup((void *)ptr); @@ -294,7 +294,7 @@ static int softing_handle_1(struct softing *card) --card->tx.pending; ++netdev->stats.tx_packets; if (!(msg.can_id & CAN_RTR_FLAG)) - netdev->stats.tx_bytes += msg.can_dlc; + netdev->stats.tx_bytes += msg.len; } else { int ret; @@ -302,7 +302,7 @@ static int softing_handle_1(struct softing *card) if (ret == NET_RX_SUCCESS) { ++netdev->stats.rx_packets; if (!(msg.can_id & CAN_RTR_FLAG)) - netdev->stats.rx_bytes += msg.can_dlc; + netdev->stats.rx_bytes += msg.len; } else { ++netdev->stats.rx_dropped; } diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c index 728f34b696a7..f9455de94786 100644 --- a/drivers/net/can/spi/hi311x.c +++ b/drivers/net/can/spi/hi311x.c @@ -277,13 +277,13 @@ static void hi3110_hw_tx(struct spi_device *spi, struct can_frame *frame) ((frame->can_id & CAN_EFF_MASK) << 1) | ((frame->can_id & CAN_RTR_FLAG) ? 
1 : 0); - buf[HI3110_FIFO_EXT_DLC_OFF] = frame->can_dlc; + buf[HI3110_FIFO_EXT_DLC_OFF] = frame->len; memcpy(buf + HI3110_FIFO_EXT_DATA_OFF, - frame->data, frame->can_dlc); + frame->data, frame->len); hi3110_hw_tx_frame(spi, buf, HI3110_TX_EXT_BUF_LEN - - (HI3110_CAN_MAX_DATA_LEN - frame->can_dlc)); + (HI3110_CAN_MAX_DATA_LEN - frame->len)); } else { /* Standard frame */ buf[HI3110_FIFO_ID_OFF] = (frame->can_id & CAN_SFF_MASK) >> 3; @@ -291,13 +291,13 @@ static void hi3110_hw_tx(struct spi_device *spi, struct can_frame *frame) ((frame->can_id & CAN_SFF_MASK) << 5) | ((frame->can_id & CAN_RTR_FLAG) ? (1 << 4) : 0); - buf[HI3110_FIFO_STD_DLC_OFF] = frame->can_dlc; + buf[HI3110_FIFO_STD_DLC_OFF] = frame->len; memcpy(buf + HI3110_FIFO_STD_DATA_OFF, - frame->data, frame->can_dlc); + frame->data, frame->len); hi3110_hw_tx_frame(spi, buf, HI3110_TX_STD_BUF_LEN - - (HI3110_CAN_MAX_DATA_LEN - frame->can_dlc)); + (HI3110_CAN_MAX_DATA_LEN - frame->len)); } } @@ -341,16 +341,16 @@ static void hi3110_hw_rx(struct spi_device *spi) } /* Data length */ - frame->can_dlc = can_cc_dlc2len(buf[HI3110_FIFO_WOTIME_DLC_OFF] & 0x0F); + frame->len = can_cc_dlc2len(buf[HI3110_FIFO_WOTIME_DLC_OFF] & 0x0F); if (buf[HI3110_FIFO_WOTIME_ID_OFF + 3] & HI3110_FIFO_WOTIME_ID_RTR) frame->can_id |= CAN_RTR_FLAG; else memcpy(frame->data, buf + HI3110_FIFO_WOTIME_DAT_OFF, - frame->can_dlc); + frame->len); priv->net->stats.rx_packets++; - priv->net->stats.rx_bytes += frame->can_dlc; + priv->net->stats.rx_bytes += frame->len; can_led_event(priv->net, CAN_LED_EVENT_RX); @@ -585,7 +585,7 @@ static void hi3110_tx_work_handler(struct work_struct *ws) } else { frame = (struct can_frame *)priv->tx_skb->data; hi3110_hw_tx(spi, frame); - priv->tx_len = 1 + frame->can_dlc; + priv->tx_len = 1 + frame->len; can_put_echo_skb(priv->tx_skb, net, 0); priv->tx_skb = NULL; } diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index 6ddebb1c4a24..25859d16d06f 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -644,9 +644,9 @@ static void mcp251x_hw_tx(struct spi_device *spi, struct can_frame *frame, ((eid >> SIDL_EID_SHIFT) & SIDL_EID_MASK); buf[TXBEID8_OFF] = GET_BYTE(eid, 1); buf[TXBEID0_OFF] = GET_BYTE(eid, 0); - buf[TXBDLC_OFF] = (rtr << DLC_RTR_SHIFT) | frame->can_dlc; - memcpy(buf + TXBDAT_OFF, frame->data, frame->can_dlc); - mcp251x_hw_tx_frame(spi, buf, frame->can_dlc, tx_buf_idx); + buf[TXBDLC_OFF] = (rtr << DLC_RTR_SHIFT) | frame->len; + memcpy(buf + TXBDAT_OFF, frame->data, frame->len); + mcp251x_hw_tx_frame(spi, buf, frame->len, tx_buf_idx); /* use INSTRUCTION_RTS, to avoid "repeated frame problem" */ priv->spi_tx_buf[0] = INSTRUCTION_RTS(1 << tx_buf_idx); @@ -720,11 +720,11 @@ static void mcp251x_hw_rx(struct spi_device *spi, int buf_idx) frame->can_id |= CAN_RTR_FLAG; } /* Data length */ - frame->can_dlc = can_cc_dlc2len(buf[RXBDLC_OFF] & RXBDLC_LEN_MASK); - memcpy(frame->data, buf + RXBDAT_OFF, frame->can_dlc); + frame->len = can_cc_dlc2len(buf[RXBDLC_OFF] & RXBDLC_LEN_MASK); + memcpy(frame->data, buf + RXBDAT_OFF, frame->len); priv->net->stats.rx_packets++; - priv->net->stats.rx_bytes += frame->can_dlc; + priv->net->stats.rx_bytes += frame->len; can_led_event(priv->net, CAN_LED_EVENT_RX); @@ -998,10 +998,10 @@ static void mcp251x_tx_work_handler(struct work_struct *ws) } else { frame = (struct can_frame *)priv->tx_skb->data; - if (frame->can_dlc > CAN_FRAME_MAX_DATA_LEN) - frame->can_dlc = CAN_FRAME_MAX_DATA_LEN; + if (frame->len > CAN_FRAME_MAX_DATA_LEN) + frame->len = 
CAN_FRAME_MAX_DATA_LEN; mcp251x_hw_tx(spi, frame, 0); - priv->tx_len = 1 + frame->can_dlc; + priv->tx_len = 1 + frame->len; can_put_echo_skb(priv->tx_skb, net, 0); priv->tx_skb = NULL; } diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index 0e2569779895..098cc9670f0f 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -424,7 +424,7 @@ static netdev_tx_t sun4ican_start_xmit(struct sk_buff *skb, struct net_device *d netif_stop_queue(dev); id = cf->can_id; - dlc = cf->can_dlc; + dlc = cf->len; msg_flag_n = dlc; if (id & CAN_RTR_FLAG) @@ -475,7 +475,7 @@ static void sun4i_can_rx(struct net_device *dev) return; fi = readl(priv->base + SUN4I_REG_BUF0_ADDR); - cf->can_dlc = can_cc_dlc2len(fi & 0x0F); + cf->len = can_cc_dlc2len(fi & 0x0F); if (fi & SUN4I_MSG_EFF_FLAG) { dreg = SUN4I_REG_BUF5_ADDR; id = (readl(priv->base + SUN4I_REG_BUF1_ADDR) << 21) | @@ -493,7 +493,7 @@ static void sun4i_can_rx(struct net_device *dev) if (fi & SUN4I_MSG_RTR_FLAG) id |= CAN_RTR_FLAG; else - for (i = 0; i < cf->can_dlc; i++) + for (i = 0; i < cf->len; i++) cf->data[i] = readl(priv->base + dreg + i * 4); cf->can_id = id; @@ -501,7 +501,7 @@ static void sun4i_can_rx(struct net_device *dev) sun4i_can_write_cmdreg(priv, SUN4I_CMD_RELEASE_RBUF); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); can_led_event(dev, CAN_LED_EVENT_RX); @@ -625,7 +625,7 @@ static int sun4i_can_err(struct net_device *dev, u8 isrc, u8 status) if (likely(skb)) { stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } else { return -ENOMEM; diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index 0b1fd34f4c83..a6850ff0b55b 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -496,7 +496,7 @@ static netdev_tx_t ti_hecc_xmit(struct sk_buff *skb, struct net_device *ndev) spin_unlock_irqrestore(&priv->mbx_lock, flags); /* Prepare mailbox for transmission */ - data = cf->can_dlc | (get_tx_head_prio(priv) << 8); + data = cf->len | (get_tx_head_prio(priv) << 8); if (cf->can_id & CAN_RTR_FLAG) /* Remote transmission request */ data |= HECC_CANMCF_RTR; hecc_write_mbx(priv, mbxno, HECC_CANMCF, data); @@ -508,7 +508,7 @@ static netdev_tx_t ti_hecc_xmit(struct sk_buff *skb, struct net_device *ndev) hecc_write_mbx(priv, mbxno, HECC_CANMID, data); hecc_write_mbx(priv, mbxno, HECC_CANMDL, be32_to_cpu(*(__be32 *)(cf->data))); - if (cf->can_dlc > 4) + if (cf->len > 4) hecc_write_mbx(priv, mbxno, HECC_CANMDH, be32_to_cpu(*(__be32 *)(cf->data + 4))); else @@ -566,11 +566,11 @@ static struct sk_buff *ti_hecc_mailbox_read(struct can_rx_offload *offload, data = hecc_read_mbx(priv, mbxno, HECC_CANMCF); if (data & HECC_CANMCF_RTR) cf->can_id |= CAN_RTR_FLAG; - cf->can_dlc = can_cc_dlc2len(data & 0xF); + cf->len = can_cc_dlc2len(data & 0xF); data = hecc_read_mbx(priv, mbxno, HECC_CANMDL); *(__be32 *)(cf->data) = cpu_to_be32(data); - if (cf->can_dlc > 4) { + if (cf->len > 4) { data = hecc_read_mbx(priv, mbxno, HECC_CANMDH); *(__be32 *)(cf->data + 4) = cpu_to_be32(data); } diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index 288781934149..25eee4466364 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -306,7 +306,7 @@ static void ems_usb_rx_can_msg(struct ems_usb *dev, struct ems_cpc_msg *msg) return; cf->can_id = le32_to_cpu(msg->msg.can_msg.id); - cf->can_dlc = can_cc_dlc2len(msg->msg.can_msg.length & 0xF); + cf->len = 
can_cc_dlc2len(msg->msg.can_msg.length & 0xF); if (msg->type == CPC_MSG_TYPE_EXT_CAN_FRAME || msg->type == CPC_MSG_TYPE_EXT_RTR_FRAME) @@ -316,12 +316,12 @@ static void ems_usb_rx_can_msg(struct ems_usb *dev, struct ems_cpc_msg *msg) msg->type == CPC_MSG_TYPE_EXT_RTR_FRAME) { cf->can_id |= CAN_RTR_FLAG; } else { - for (i = 0; i < cf->can_dlc; i++) + for (i = 0; i < cf->len; i++) cf->data[i] = msg->msg.can_msg.msg[i]; } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } @@ -396,7 +396,7 @@ static void ems_usb_rx_err(struct ems_usb *dev, struct ems_cpc_msg *msg) } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } @@ -755,7 +755,7 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne msg = (struct ems_cpc_msg *)&buf[CPC_HEADER_SIZE]; msg->msg.can_msg.id = cpu_to_le32(cf->can_id & CAN_ERR_MASK); - msg->msg.can_msg.length = cf->can_dlc; + msg->msg.can_msg.length = cf->len; if (cf->can_id & CAN_RTR_FLAG) { msg->type = cf->can_id & CAN_EFF_FLAG ? @@ -766,10 +766,10 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne msg->type = cf->can_id & CAN_EFF_FLAG ? CPC_CMD_TYPE_EXT_CAN_FRAME : CPC_CMD_TYPE_CAN_FRAME; - for (i = 0; i < cf->can_dlc; i++) + for (i = 0; i < cf->len; i++) msg->msg.can_msg.msg[i] = cf->data[i]; - msg->length = CPC_CAN_MSG_MIN_SIZE + cf->can_dlc; + msg->length = CPC_CAN_MSG_MIN_SIZE + cf->len; } for (i = 0; i < MAX_TX_URBS; i++) { @@ -794,7 +794,7 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne context->dev = dev; context->echo_index = i; - context->dlc = cf->can_dlc; + context->dlc = cf->len; usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, 2), buf, size, ems_usb_write_bulk_callback, context); diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c index 72999de550d1..3643a8ee03cf 100644 --- a/drivers/net/can/usb/esd_usb2.c +++ b/drivers/net/can/usb/esd_usb2.c @@ -292,7 +292,7 @@ static void esd_usb2_rx_event(struct esd_usb2_net_priv *priv, priv->bec.rxerr = rxerr; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } } @@ -321,7 +321,7 @@ static void esd_usb2_rx_can_msg(struct esd_usb2_net_priv *priv, } cf->can_id = id & ESD_IDMASK; - cf->can_dlc = can_cc_dlc2len(msg->msg.rx.dlc & ~ESD_RTR); + cf->len = can_cc_dlc2len(msg->msg.rx.dlc & ~ESD_RTR); if (id & ESD_EXTID) cf->can_id |= CAN_EFF_FLAG; @@ -329,12 +329,12 @@ static void esd_usb2_rx_can_msg(struct esd_usb2_net_priv *priv, if (msg->msg.rx.dlc & ESD_RTR) { cf->can_id |= CAN_RTR_FLAG; } else { - for (i = 0; i < cf->can_dlc; i++) + for (i = 0; i < cf->len; i++) cf->data[i] = msg->msg.rx.data[i]; } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } @@ -737,7 +737,7 @@ static netdev_tx_t esd_usb2_start_xmit(struct sk_buff *skb, msg->msg.hdr.len = 3; /* minimal length */ msg->msg.hdr.cmd = CMD_CAN_TX; msg->msg.tx.net = priv->index; - msg->msg.tx.dlc = cf->can_dlc; + msg->msg.tx.dlc = cf->len; msg->msg.tx.id = cpu_to_le32(cf->can_id & CAN_ERR_MASK); if (cf->can_id & CAN_RTR_FLAG) @@ -746,10 +746,10 @@ static netdev_tx_t esd_usb2_start_xmit(struct sk_buff *skb, if (cf->can_id & CAN_EFF_FLAG) msg->msg.tx.id |= cpu_to_le32(ESD_EXTID); - for (i = 0; i < cf->can_dlc; i++) + for (i = 0; i < cf->len; i++) msg->msg.tx.data[i] = cf->data[i]; - msg->msg.hdr.len += (cf->can_dlc + 3) >> 2; + msg->msg.hdr.len += 
(cf->len + 3) >> 2; for (i = 0; i < MAX_TX_URBS; i++) { if (priv->tx_contexts[i].echo_index == MAX_TX_URBS) { @@ -769,7 +769,7 @@ static netdev_tx_t esd_usb2_start_xmit(struct sk_buff *skb, context->priv = priv; context->echo_index = i; - context->dlc = cf->can_dlc; + context->dlc = cf->len; /* hnd must not be 0 - MSB is stripped in txdone handling */ msg->msg.tx.hnd = 0x80000000 | i; /* returned in TX done message */ diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index b1729b208788..d6a68b7046eb 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -331,7 +331,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) cf->can_id = hf->can_id; - cf->can_dlc = can_cc_dlc2len(hf->can_dlc); + cf->len = can_cc_dlc2len(hf->can_dlc); memcpy(cf->data, hf->data, 8); /* ERROR frames tell us information about the controller */ @@ -378,7 +378,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) goto resubmit_urb; cf->can_id |= CAN_ERR_CRTL; - cf->can_dlc = CAN_ERR_DLC; + cf->len = CAN_ERR_DLC; cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; stats->rx_over_errors++; stats->rx_errors++; @@ -504,8 +504,8 @@ static netdev_tx_t gs_can_start_xmit(struct sk_buff *skb, cf = (struct can_frame *)skb->data; hf->can_id = cf->can_id; - hf->can_dlc = cf->can_dlc; - memcpy(hf->data, cf->data, cf->can_dlc); + hf->can_dlc = cf->len; + memcpy(hf->data, cf->data, cf->len); usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, GSUSB_ENDPOINT_OUT), diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index 0f1d3e807d63..d6e18bcb1a7f 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -258,7 +258,7 @@ int kvaser_usb_can_rx_over_error(struct net_device *netdev) cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); return 0; diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c index 843e2e1392e8..c6d5f3f656c3 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c @@ -895,7 +895,7 @@ static void kvaser_usb_hydra_update_state(struct kvaser_usb_net_priv *priv, stats = &netdev->stats; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } @@ -1049,7 +1049,7 @@ kvaser_usb_hydra_error_frame(struct kvaser_usb_net_priv *priv, cf->data[7] = bec.rxerr; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); priv->bec.txerr = bec.txerr; @@ -1084,7 +1084,7 @@ static void kvaser_usb_hydra_one_shot_fail(struct kvaser_usb_net_priv *priv, stats->tx_errors++; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } @@ -1180,15 +1180,15 @@ static void kvaser_usb_hydra_rx_msg_std(const struct kvaser_usb *dev, if (flags & KVASER_USB_HYDRA_CF_FLAG_OVERRUN) kvaser_usb_can_rx_over_error(priv->netdev); - cf->can_dlc = can_cc_dlc2len(cmd->rx_can.dlc); + cf->len = can_cc_dlc2len(cmd->rx_can.dlc); if (flags & KVASER_USB_HYDRA_CF_FLAG_REMOTE_FRAME) cf->can_id |= CAN_RTR_FLAG; else - memcpy(cf->data, cmd->rx_can.data, cf->can_dlc); + memcpy(cf->data, cmd->rx_can.data, cf->len); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } @@ -1434,7 +1434,7 @@ 
kvaser_usb_hydra_frame_to_cmd_std(const struct kvaser_usb_net_priv *priv, u32 flags; u32 id; - *frame_len = cf->can_dlc; + *frame_len = cf->len; cmd = kcalloc(1, sizeof(struct kvaser_cmd), GFP_ATOMIC); if (!cmd) @@ -1455,7 +1455,7 @@ kvaser_usb_hydra_frame_to_cmd_std(const struct kvaser_usb_net_priv *priv, id = cf->can_id & CAN_SFF_MASK; } - cmd->tx_can.dlc = cf->can_dlc; + cmd->tx_can.dlc = cf->len; flags = (cf->can_id & CAN_EFF_FLAG ? KVASER_USB_HYDRA_CF_FLAG_EXTENDED_ID : 0); diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c index 916ab994cce4..98c016ef0607 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c @@ -350,7 +350,7 @@ kvaser_usb_leaf_frame_to_cmd(const struct kvaser_usb_net_priv *priv, u8 *cmd_tx_can_flags = NULL; /* GCC */ struct can_frame *cf = (struct can_frame *)skb->data; - *frame_len = cf->can_dlc; + *frame_len = cf->len; cmd = kmalloc(sizeof(*cmd), GFP_ATOMIC); if (cmd) { @@ -383,8 +383,8 @@ kvaser_usb_leaf_frame_to_cmd(const struct kvaser_usb_net_priv *priv, cmd->u.tx_can.data[1] = cf->can_id & 0x3f; } - cmd->u.tx_can.data[5] = cf->can_dlc; - memcpy(&cmd->u.tx_can.data[6], cf->data, cf->can_dlc); + cmd->u.tx_can.data[5] = cf->len; + memcpy(&cmd->u.tx_can.data[6], cf->data, cf->len); if (cf->can_id & CAN_RTR_FLAG) *cmd_tx_can_flags |= MSG_FLAG_REMOTE_FRAME; @@ -576,7 +576,7 @@ static void kvaser_usb_leaf_tx_acknowledge(const struct kvaser_usb *dev, cf->can_id |= CAN_ERR_RESTARTED; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } else { netdev_err(priv->netdev, @@ -694,7 +694,7 @@ static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev, { struct can_frame *cf; struct can_frame tmp_cf = { .can_id = CAN_ERR_FLAG, - .can_dlc = CAN_ERR_DLC }; + .len = CAN_ERR_DLC }; struct sk_buff *skb; struct net_device_stats *stats; struct kvaser_usb_net_priv *priv; @@ -778,7 +778,7 @@ static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev, cf->data[7] = es->rxerr; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } @@ -978,13 +978,13 @@ static void kvaser_usb_leaf_rx_can_msg(const struct kvaser_usb *dev, else cf->can_id &= CAN_SFF_MASK; - cf->can_dlc = can_cc_dlc2len(cmd->u.leaf.log_message.dlc); + cf->len = can_cc_dlc2len(cmd->u.leaf.log_message.dlc); if (cmd->u.leaf.log_message.flags & MSG_FLAG_REMOTE_FRAME) cf->can_id |= CAN_RTR_FLAG; else memcpy(cf->data, &cmd->u.leaf.log_message.data, - cf->can_dlc); + cf->len); } else { cf->can_id = ((rx_data[0] & 0x1f) << 6) | (rx_data[1] & 0x3f); @@ -996,16 +996,16 @@ static void kvaser_usb_leaf_rx_can_msg(const struct kvaser_usb *dev, cf->can_id |= CAN_EFF_FLAG; } - cf->can_dlc = can_cc_dlc2len(rx_data[5]); + cf->len = can_cc_dlc2len(rx_data[5]); if (cmd->u.rx_can_header.flag & MSG_FLAG_REMOTE_FRAME) cf->can_id |= CAN_RTR_FLAG; else - memcpy(cf->data, &rx_data[6], cf->can_dlc); + memcpy(cf->data, &rx_data[6], cf->len); } stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index 295886c6b565..df54eb7d4b36 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -184,7 +184,7 @@ static inline struct mcba_usb_ctx *mcba_usb_get_free_ctx(struct mcba_priv *priv, if (cf) { ctx->can = true; - ctx->dlc = cf->can_dlc; + ctx->dlc = cf->len; } else { ctx->can 
= false; ctx->dlc = 0; @@ -348,7 +348,7 @@ static netdev_tx_t mcba_usb_start_xmit(struct sk_buff *skb, usb_msg.eid = 0; } - usb_msg.dlc = cf->can_dlc; + usb_msg.dlc = cf->len; memcpy(usb_msg.data, cf->data, usb_msg.dlc); @@ -451,12 +451,12 @@ static void mcba_usb_process_can(struct mcba_priv *priv, if (msg->dlc & MCBA_DLC_RTR_MASK) cf->can_id |= CAN_RTR_FLAG; - cf->can_dlc = can_cc_dlc2len(msg->dlc & MCBA_DLC_MASK); + cf->len = can_cc_dlc2len(msg->dlc & MCBA_DLC_MASK); - memcpy(cf->data, msg->data, cf->can_dlc); + memcpy(cf->data, msg->data, cf->len); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; can_led_event(priv->netdev, CAN_LED_EVENT_RX); netif_rx(skb); diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c index f7fc82203489..ec34f87cc02c 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb.c @@ -596,7 +596,7 @@ static int pcan_usb_decode_error(struct pcan_usb_msg_context *mc, u8 n, } mc->netdev->stats.rx_packets++; - mc->netdev->stats.rx_bytes += cf->can_dlc; + mc->netdev->stats.rx_bytes += cf->len; netif_rx(skb); return 0; @@ -734,7 +734,7 @@ static int pcan_usb_decode_data(struct pcan_usb_msg_context *mc, u8 status_len) cf->can_id = le16_to_cpu(tmp16) >> 5; } - cf->can_dlc = can_cc_dlc2len(rec_len); + cf->len = can_cc_dlc2len(rec_len); /* Only first packet timestamp is a word */ if (pcan_usb_decode_ts(mc, !mc->rec_ts_idx)) @@ -751,7 +751,7 @@ static int pcan_usb_decode_data(struct pcan_usb_msg_context *mc, u8 status_len) if ((mc->ptr + rec_len) > mc->end) goto decode_failed; - memcpy(cf->data, mc->ptr, cf->can_dlc); + memcpy(cf->data, mc->ptr, cf->len); mc->ptr += rec_len; } @@ -761,7 +761,7 @@ static int pcan_usb_decode_data(struct pcan_usb_msg_context *mc, u8 status_len) /* update statistics */ mc->netdev->stats.rx_packets++; - mc->netdev->stats.rx_bytes += cf->can_dlc; + mc->netdev->stats.rx_bytes += cf->len; /* push the skb */ netif_rx(skb); @@ -838,7 +838,7 @@ static int pcan_usb_encode_msg(struct peak_usb_device *dev, struct sk_buff *skb, pc = obuf + PCAN_USB_MSG_HEADER_LEN; /* status/len byte */ - *pc = cf->can_dlc; + *pc = cf->len; if (cf->can_id & CAN_RTR_FLAG) *pc |= PCAN_USB_STATUSLEN_RTR; @@ -858,8 +858,8 @@ static int pcan_usb_encode_msg(struct peak_usb_device *dev, struct sk_buff *skb, /* can data */ if (!(cf->can_id & CAN_RTR_FLAG)) { - memcpy(pc, cf->data, cf->can_dlc); - pc += cf->can_dlc; + memcpy(pc, cf->data, cf->len); + pc += cf->len; } obuf[(*size)-1] = (u8)(stats->tx_packets & 0xff); diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index 1233ef20646a..922280692a8f 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -581,7 +581,7 @@ static int pcan_usb_fd_decode_status(struct pcan_usb_fd_if *usb_if, peak_usb_netif_rx(skb, &usb_if->time_ref, le32_to_cpu(sm->ts_low)); netdev->stats.rx_packets++; - netdev->stats.rx_bytes += cf->can_dlc; + netdev->stats.rx_bytes += cf->len; return 0; } @@ -737,7 +737,7 @@ static int pcan_usb_fd_encode_msg(struct peak_usb_device *dev, struct pucan_tx_msg *tx_msg = (struct pucan_tx_msg *)obuf; struct canfd_frame *cfd = (struct canfd_frame *)skb->data; u16 tx_msg_size, tx_msg_flags; - u8 can_dlc; + u8 len; if (cfd->len > CANFD_MAX_DLEN) return -EINVAL; @@ -756,7 +756,7 @@ static int pcan_usb_fd_encode_msg(struct peak_usb_device *dev, if (can_is_canfd_skb(skb)) { /* considering a CANFD frame */ - can_dlc = 
can_len2dlc(cfd->len); + len = can_len2dlc(cfd->len); tx_msg_flags |= PUCAN_MSG_EXT_DATA_LEN; @@ -767,14 +767,14 @@ static int pcan_usb_fd_encode_msg(struct peak_usb_device *dev, tx_msg_flags |= PUCAN_MSG_ERROR_STATE_IND; } else { /* CAN 2.0 frames */ - can_dlc = cfd->len; + len = cfd->len; if (cfd->can_id & CAN_RTR_FLAG) tx_msg_flags |= PUCAN_MSG_RTR; } tx_msg->flags = cpu_to_le16(tx_msg_flags); - tx_msg->channel_dlc = PUCAN_MSG_CHANNEL_DLC(dev->ctrl_idx, can_dlc); + tx_msg->channel_dlc = PUCAN_MSG_CHANNEL_DLC(dev->ctrl_idx, len); memcpy(tx_msg->d, cfd->data, cfd->len); /* add null size message to tag the end (messages are 32-bits aligned) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_pro.c b/drivers/net/can/usb/peak_usb/pcan_usb_pro.c index c7564773fb2b..275087c39602 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_pro.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_pro.c @@ -532,7 +532,7 @@ static int pcan_usb_pro_handle_canmsg(struct pcan_usb_pro_interface *usb_if, return -ENOMEM; can_frame->can_id = le32_to_cpu(rx->id); - can_frame->can_dlc = rx->len & 0x0f; + can_frame->len = rx->len & 0x0f; if (rx->flags & PCAN_USBPRO_EXT) can_frame->can_id |= CAN_EFF_FLAG; @@ -540,14 +540,14 @@ static int pcan_usb_pro_handle_canmsg(struct pcan_usb_pro_interface *usb_if, if (rx->flags & PCAN_USBPRO_RTR) can_frame->can_id |= CAN_RTR_FLAG; else - memcpy(can_frame->data, rx->data, can_frame->can_dlc); + memcpy(can_frame->data, rx->data, can_frame->len); hwts = skb_hwtstamps(skb); peak_usb_get_ts_time(&usb_if->time_ref, le32_to_cpu(rx->ts32), &hwts->hwtstamp); netdev->stats.rx_packets++; - netdev->stats.rx_bytes += can_frame->can_dlc; + netdev->stats.rx_bytes += can_frame->len; netif_rx(skb); return 0; @@ -662,7 +662,7 @@ static int pcan_usb_pro_handle_error(struct pcan_usb_pro_interface *usb_if, hwts = skb_hwtstamps(skb); peak_usb_get_ts_time(&usb_if->time_ref, le32_to_cpu(er->ts32), &hwts->hwtstamp); netdev->stats.rx_packets++; - netdev->stats.rx_bytes += can_frame->can_dlc; + netdev->stats.rx_bytes += can_frame->len; netif_rx(skb); return 0; @@ -767,14 +767,14 @@ static int pcan_usb_pro_encode_msg(struct peak_usb_device *dev, pcan_msg_init_empty(&usb_msg, obuf, *size); - if ((cf->can_id & CAN_RTR_FLAG) || (cf->can_dlc == 0)) + if ((cf->can_id & CAN_RTR_FLAG) || (cf->len == 0)) data_type = PCAN_USBPRO_TXMSG0; - else if (cf->can_dlc <= 4) + else if (cf->len <= 4) data_type = PCAN_USBPRO_TXMSG4; else data_type = PCAN_USBPRO_TXMSG8; - len = (dev->ctrl_idx << 4) | (cf->can_dlc & 0x0f); + len = (dev->ctrl_idx << 4) | (cf->len & 0x0f); flags = 0; if (cf->can_id & CAN_EFF_FLAG) diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c index 072058c6f6e8..7d92da8954fe 100644 --- a/drivers/net/can/usb/ucan.c +++ b/drivers/net/can/usb/ucan.c @@ -614,15 +614,15 @@ static void ucan_rx_can_msg(struct ucan_priv *up, struct ucan_message_in *m) cf->can_id = canid; /* compute DLC taking RTR_FLAG into account */ - cf->can_dlc = ucan_can_cc_dlc2len(&m->msg.can_msg, len); + cf->len = ucan_can_cc_dlc2len(&m->msg.can_msg, len); /* copy the payload of non RTR frames */ if (!(cf->can_id & CAN_RTR_FLAG) || (cf->can_id & CAN_ERR_FLAG)) - memcpy(cf->data, m->msg.can_msg.data, cf->can_dlc); + memcpy(cf->data, m->msg.can_msg.data, cf->len); /* don't count error frames as real packets */ stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; /* pass it to Linux */ netif_rx(skb); @@ -1078,15 +1078,15 @@ static struct urb *ucan_prepare_tx_urb(struct ucan_priv *up, mlen = 
UCAN_OUT_HDR_SIZE + offsetof(struct ucan_can_msg, dlc) + sizeof(m->msg.can_msg.dlc); - m->msg.can_msg.dlc = cf->can_dlc; + m->msg.can_msg.dlc = cf->len; } else { mlen = UCAN_OUT_HDR_SIZE + - sizeof(m->msg.can_msg.id) + cf->can_dlc; - memcpy(m->msg.can_msg.data, cf->data, cf->can_dlc); + sizeof(m->msg.can_msg.id) + cf->len; + memcpy(m->msg.can_msg.data, cf->data, cf->len); } m->len = cpu_to_le16(mlen); - context->dlc = cf->can_dlc; + context->dlc = cf->len; m->subtype = echo_index; diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 216c58f8df6e..6517aaeb4bc0 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -449,7 +449,7 @@ static void usb_8dev_rx_err_msg(struct usb_8dev_priv *priv, priv->bec.rxerr = rxerr; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } @@ -470,7 +470,7 @@ static void usb_8dev_rx_can_msg(struct usb_8dev_priv *priv, return; cf->can_id = be32_to_cpu(msg->id); - cf->can_dlc = can_cc_dlc2len(msg->dlc & 0xF); + cf->len = can_cc_dlc2len(msg->dlc & 0xF); if (msg->flags & USB_8DEV_EXTID) cf->can_id |= CAN_EFF_FLAG; @@ -478,10 +478,10 @@ static void usb_8dev_rx_can_msg(struct usb_8dev_priv *priv, if (msg->flags & USB_8DEV_RTR) cf->can_id |= CAN_RTR_FLAG; else - memcpy(cf->data, msg->data, cf->can_dlc); + memcpy(cf->data, msg->data, cf->len); stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); can_led_event(priv->netdev, CAN_LED_EVENT_RX); @@ -637,8 +637,8 @@ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, msg->flags |= USB_8DEV_EXTID; msg->id = cpu_to_be32(cf->can_id & CAN_ERR_MASK); - msg->dlc = cf->can_dlc; - memcpy(msg->data, cf->data, cf->can_dlc); + msg->dlc = cf->len; + memcpy(msg->data, cf->data, cf->len); msg->end = USB_8DEV_DATA_END; for (i = 0; i < MAX_TX_URBS; i++) { @@ -656,7 +656,7 @@ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, context->priv = priv; context->echo_index = i; - context->dlc = cf->can_dlc; + context->dlc = cf->len; usb_fill_bulk_urb(urb, priv->udev, usb_sndbulkpipe(priv->udev, USB_8DEV_ENDP_DATA_TX), diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 73e8b1df1071..88831ce0f2f8 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -759,7 +759,7 @@ static int xcan_rx(struct net_device *ndev, int frame_base) XCAN_DLCR_DLC_SHIFT; /* Change Xilinx CAN data length format to socketCAN data format */ - cf->can_dlc = can_cc_dlc2len(dlc); + cf->len = can_cc_dlc2len(dlc); /* Change Xilinx CAN ID format to socketCAN ID format */ if (id_xcan & XCAN_IDR_IDE_MASK) { @@ -784,13 +784,13 @@ static int xcan_rx(struct net_device *ndev, int frame_base) if (!(cf->can_id & CAN_RTR_FLAG)) { /* Change Xilinx CAN data format to socketCAN data format */ - if (cf->can_dlc > 0) + if (cf->len > 0) *(__be32 *)(cf->data) = cpu_to_be32(data[0]); - if (cf->can_dlc > 4) + if (cf->len > 4) *(__be32 *)(cf->data + 4) = cpu_to_be32(data[1]); } - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; stats->rx_packets++; netif_receive_skb(skb); @@ -970,7 +970,7 @@ static void xcan_update_error_state_after_rxtx(struct net_device *ndev) struct net_device_stats *stats = &ndev->stats; stats->rx_packets++; - stats->rx_bytes += cf->can_dlc; + stats->rx_bytes += cf->len; netif_rx(skb); } } diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 802606e36b58..77da061c21c9 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h 
@@ -185,8 +185,8 @@ static inline void can_set_static_ctrlmode(struct net_device *dev, dev->mtu = CANFD_MTU; } -/* get data length from can_dlc with sanitized can_dlc */ -u8 can_dlc2len(u8 can_dlc); +/* get data length from raw data length code (DLC) */ +u8 can_dlc2len(u8 dlc); /* map the sanitized data length to an appropriate data length code */ u8 can_len2dlc(u8 len); diff --git a/net/can/af_can.c b/net/can/af_can.c index 5d124c155904..963bd7145517 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -888,7 +888,7 @@ static __init int can_init(void) int err; /* check for correct padding to be able to use the structs similarly */ - BUILD_BUG_ON(offsetof(struct can_frame, can_dlc) != + BUILD_BUG_ON(offsetof(struct can_frame, len) != offsetof(struct canfd_frame, len) || offsetof(struct can_frame, data) != offsetof(struct canfd_frame, data)); diff --git a/net/can/gw.c b/net/can/gw.c index 6b790b6ff8d2..de5e8859ec9b 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -207,7 +207,7 @@ static void canframecpy(struct canfd_frame *dst, struct can_frame *src) */ dst->can_id = src->can_id; - dst->len = src->can_dlc; + dst->len = src->len; *(u64 *)dst->data = *(u64 *)src->data; } diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 137054bff9ec..bb914d8b4216 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -62,7 +62,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) skb_pull(skb, J1939_CAN_HDR); /* fix length, set to dlc, with 8 maximum */ - skb_trim(skb, min_t(uint8_t, cf->can_dlc, 8)); + skb_trim(skb, min_t(uint8_t, cf->len, 8)); /* set addr */ skcb = j1939_skb_to_cb(skb); @@ -335,7 +335,7 @@ int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) canid |= skcb->addr.da << 8; cf->can_id = canid; - cf->can_dlc = dlc; + cf->len = dlc; return can_send(skb, 1); -- cgit From 3ab4ce0d6fa8c93d41df4a74ec8d2c9198be2109 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 10 Nov 2020 11:18:49 +0100 Subject: can: rename CAN FD related can_len2dlc and can_dlc2len helpers The helper functions can_len2dlc and can_dlc2len are only relevant for CAN FD data length code (DLC) conversion. To fit the introduced can_cc_dlc2len for Classical CAN we rename: can_dlc2len -> can_fd_dlc2len to get the payload length from the DLC can_len2dlc -> can_fd_len2dlc to get the DLC from the payload length Suggested-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20201110101852.1973-6-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- Documentation/networking/can.rst | 2 +- drivers/net/can/dev.c | 8 ++++---- drivers/net/can/flexcan.c | 4 ++-- drivers/net/can/ifi_canfd/ifi_canfd.c | 4 ++-- drivers/net/can/kvaser_pciefd.c | 6 +++--- drivers/net/can/m_can/m_can.c | 6 +++--- drivers/net/can/peak_canfd/peak_canfd.c | 4 ++-- drivers/net/can/rcar/rcar_canfd.c | 4 ++-- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 8 ++++---- drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c | 6 +++--- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 4 ++-- drivers/net/can/xilinx_can.c | 4 ++-- include/linux/can/dev.h | 4 ++-- 13 files changed, 32 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst index ff05cbd05e0d..4895b0dd2714 100644 --- a/Documentation/networking/can.rst +++ b/Documentation/networking/can.rst @@ -1332,7 +1332,7 @@ layer is a plain value from 0 .. 64 instead of the CAN 'data length code'. 
The data length code was a 1:1 mapping to the payload length in the legacy CAN frames anyway. The payload length to the bus-relevant DLC mapping is only performed inside the CAN drivers, preferably with the helper -functions can_dlc2len() and can_len2dlc(). +functions can_fd_dlc2len() and can_fd_len2dlc(). The CAN netdevice driver capabilities can be distinguished by the network devices maximum transfer unit (MTU):: diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 806e8b646b12..3486704c8a95 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -31,11 +31,11 @@ static const u8 dlc2len[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 16, 20, 24, 32, 48, 64}; /* get data length from raw data length code (DLC) */ -u8 can_dlc2len(u8 dlc) +u8 can_fd_dlc2len(u8 dlc) { return dlc2len[dlc & 0x0F]; } -EXPORT_SYMBOL_GPL(can_dlc2len); +EXPORT_SYMBOL_GPL(can_fd_dlc2len); static const u8 len2dlc[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, /* 0 - 8 */ 9, 9, 9, 9, /* 9 - 12 */ @@ -49,14 +49,14 @@ static const u8 len2dlc[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, /* 0 - 8 */ 15, 15, 15, 15, 15, 15, 15, 15}; /* 57 - 64 */ /* map the sanitized data length to an appropriate data length code */ -u8 can_len2dlc(u8 len) +u8 can_fd_len2dlc(u8 len) { if (unlikely(len > 64)) return 0xF; return len2dlc[len]; } -EXPORT_SYMBOL_GPL(can_len2dlc); +EXPORT_SYMBOL_GPL(can_fd_len2dlc); #ifdef CONFIG_CAN_CALC_BITTIMING #define CAN_CALC_MAX_ERROR 50 /* in one-tenth of a percent */ diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 985569f946e5..6f3555381230 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -746,7 +746,7 @@ static netdev_tx_t flexcan_start_xmit(struct sk_buff *skb, struct net_device *de struct canfd_frame *cfd = (struct canfd_frame *)skb->data; u32 can_id; u32 data; - u32 ctrl = FLEXCAN_MB_CODE_TX_DATA | ((can_len2dlc(cfd->len)) << 16); + u32 ctrl = FLEXCAN_MB_CODE_TX_DATA | ((can_fd_len2dlc(cfd->len)) << 16); int i; if (can_dropped_invalid_skb(dev, skb)) @@ -1000,7 +1000,7 @@ static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload, cfd->can_id = (reg_id >> 18) & CAN_SFF_MASK; if (reg_ctrl & FLEXCAN_MB_CNT_EDL) { - cfd->len = can_dlc2len((reg_ctrl >> 16) & 0xf); + cfd->len = can_fd_dlc2len((reg_ctrl >> 16) & 0xf); if (reg_ctrl & FLEXCAN_MB_CNT_BRS) cfd->flags |= CANFD_BRS; diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c index 3df55b0e4ef3..86b0e1406a21 100644 --- a/drivers/net/can/ifi_canfd/ifi_canfd.c +++ b/drivers/net/can/ifi_canfd/ifi_canfd.c @@ -271,7 +271,7 @@ static void ifi_canfd_read_fifo(struct net_device *ndev) dlc = (rxdlc >> IFI_CANFD_RXFIFO_DLC_DLC_OFFSET) & IFI_CANFD_RXFIFO_DLC_DLC_MASK; if (rxdlc & IFI_CANFD_RXFIFO_DLC_EDL) - cf->len = can_dlc2len(dlc); + cf->len = can_fd_dlc2len(dlc); else cf->len = can_cc_dlc2len(dlc); @@ -900,7 +900,7 @@ static netdev_tx_t ifi_canfd_start_xmit(struct sk_buff *skb, txid = cf->can_id & CAN_SFF_MASK; } - txdlc = can_len2dlc(cf->len); + txdlc = can_fd_len2dlc(cf->len); if ((priv->can.ctrlmode & CAN_CTRLMODE_FD) && can_is_canfd_skb(skb)) { txdlc |= IFI_CANFD_TXFIFO_DLC_EDL; if (cf->flags & CANFD_BRS) diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c index de268a344fcf..1bafa614950e 100644 --- a/drivers/net/can/kvaser_pciefd.c +++ b/drivers/net/can/kvaser_pciefd.c @@ -740,7 +740,7 @@ static int kvaser_pciefd_prepare_tx_packet(struct kvaser_pciefd_tx_packet *p, p->header[0] |= KVASER_PCIEFD_RPACKET_IDE; p->header[0] |= cf->can_id & 
CAN_EFF_MASK; - p->header[1] |= can_len2dlc(cf->len) << KVASER_PCIEFD_RPACKET_DLC_SHIFT; + p->header[1] |= can_fd_len2dlc(cf->len) << KVASER_PCIEFD_RPACKET_DLC_SHIFT; p->header[1] |= KVASER_PCIEFD_TPACKET_AREQ; if (can_is_canfd_skb(skb)) { @@ -1174,7 +1174,7 @@ static int kvaser_pciefd_handle_data_packet(struct kvaser_pciefd *pcie, if (p->header[0] & KVASER_PCIEFD_RPACKET_IDE) cf->can_id |= CAN_EFF_FLAG; - cf->len = can_dlc2len(p->header[1] >> KVASER_PCIEFD_RPACKET_DLC_SHIFT); + cf->len = can_fd_dlc2len(p->header[1] >> KVASER_PCIEFD_RPACKET_DLC_SHIFT); if (p->header[0] & KVASER_PCIEFD_RPACKET_RTR) cf->can_id |= CAN_RTR_FLAG; @@ -1600,7 +1600,7 @@ static int kvaser_pciefd_read_packet(struct kvaser_pciefd *pcie, int *start_pos, if (!(p->header[0] & KVASER_PCIEFD_RPACKET_RTR)) { u8 data_len; - data_len = can_dlc2len(p->header[1] >> + data_len = can_fd_dlc2len(p->header[1] >> KVASER_PCIEFD_RPACKET_DLC_SHIFT); pos += DIV_ROUND_UP(data_len, 4); } diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index b7df90ceee4b..a345e22f545e 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -457,7 +457,7 @@ static void m_can_read_fifo(struct net_device *dev, u32 rxfs) } if (dlc & RX_BUF_FDF) - cf->len = can_dlc2len((dlc >> 16) & 0x0F); + cf->len = can_fd_dlc2len((dlc >> 16) & 0x0F); else cf->len = can_cc_dlc2len((dlc >> 16) & 0x0F); @@ -1489,7 +1489,7 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev) /* message ram configuration */ m_can_fifo_write(cdev, 0, M_CAN_FIFO_ID, id); m_can_fifo_write(cdev, 0, M_CAN_FIFO_DLC, - can_len2dlc(cf->len) << 16); + can_fd_len2dlc(cf->len) << 16); for (i = 0; i < cf->len; i += 4) m_can_fifo_write(cdev, 0, @@ -1557,7 +1557,7 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev) m_can_fifo_write(cdev, putidx, M_CAN_FIFO_DLC, ((putidx << TX_BUF_MM_SHIFT) & TX_BUF_MM_MASK) | - (can_len2dlc(cf->len) << 16) | + (can_fd_len2dlc(cf->len) << 16) | fdflags | TX_BUF_EFC); for (i = 0; i < cf->len; i += 4) diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index fff3a35276aa..c5334b0c3038 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -257,7 +257,7 @@ static int pucan_handle_can_rx(struct peak_canfd_priv *priv, u8 cf_len; if (rx_msg_flags & PUCAN_MSG_EXT_DATA_LEN) - cf_len = can_dlc2len(pucan_msg_get_dlc(msg)); + cf_len = can_fd_dlc2len(pucan_msg_get_dlc(msg)); else cf_len = can_cc_dlc2len(pucan_msg_get_dlc(msg)); @@ -682,7 +682,7 @@ static netdev_tx_t peak_canfd_start_xmit(struct sk_buff *skb, if (can_is_canfd_skb(skb)) { /* CAN FD frame format */ - len = can_len2dlc(cf->len); + len = can_fd_len2dlc(cf->len); msg_flags |= PUCAN_MSG_EXT_DATA_LEN; diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index 86c6cbdb7e53..2778ed5c61d1 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1357,7 +1357,7 @@ static netdev_tx_t rcar_canfd_start_xmit(struct sk_buff *skb, if (cf->can_id & CAN_RTR_FLAG) id |= RCANFD_CFID_CFRTR; - dlc = RCANFD_CFPTR_CFDLC(can_len2dlc(cf->len)); + dlc = RCANFD_CFPTR_CFDLC(can_fd_len2dlc(cf->len)); if (priv->can.ctrlmode & CAN_CTRLMODE_FD) { rcar_canfd_write(priv->base, @@ -1446,7 +1446,7 @@ static void rcar_canfd_rx_pkt(struct rcar_canfd_channel *priv) if (priv->can.ctrlmode & CAN_CTRLMODE_FD) { if (sts & RCANFD_RFFDSTS_RFFDF) - cf->len = can_dlc2len(RCANFD_RFPTR_RFDLC(dlc)); + cf->len = 
can_fd_dlc2len(RCANFD_RFPTR_RFDLC(dlc)); else cf->len = can_cc_dlc2len(RCANFD_RFPTR_RFDLC(dlc)); diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 3bac7274ee5b..afa8cfc729b5 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1405,7 +1405,7 @@ mcp251xfd_hw_rx_obj_to_skb(const struct mcp251xfd_priv *priv, cfd->flags |= CANFD_BRS; dlc = FIELD_GET(MCP251XFD_OBJ_FLAGS_DLC, hw_rx_obj->flags); - cfd->len = can_dlc2len(dlc); + cfd->len = can_fd_dlc2len(dlc); } else { if (hw_rx_obj->flags & MCP251XFD_OBJ_FLAGS_RTR) cfd->can_id |= CAN_RTR_FLAG; @@ -2244,7 +2244,7 @@ mcp251xfd_tx_obj_from_skb(const struct mcp251xfd_priv *priv, * harm, only the lower 7 bits will be transferred into the * TEF object. */ - dlc = can_len2dlc(cfd->len); + dlc = can_fd_len2dlc(cfd->len); flags |= FIELD_PREP(MCP251XFD_OBJ_FLAGS_SEQ_MCP2518FD_MASK, seq) | FIELD_PREP(MCP251XFD_OBJ_FLAGS_DLC, dlc); @@ -2273,7 +2273,7 @@ mcp251xfd_tx_obj_from_skb(const struct mcp251xfd_priv *priv, /* Clear data at end of CAN frame */ offset = round_down(cfd->len, sizeof(u32)); - len = round_up(can_dlc2len(dlc), sizeof(u32)) - offset; + len = round_up(can_fd_dlc2len(dlc), sizeof(u32)) - offset; if (MCP251XFD_SANITIZE_CAN && len) memset(hw_tx_obj->data + offset, 0x0, len); memcpy(hw_tx_obj->data, cfd->data, cfd->len); @@ -2281,7 +2281,7 @@ mcp251xfd_tx_obj_from_skb(const struct mcp251xfd_priv *priv, /* Number of bytes to be written into the RAM of the controller */ len = sizeof(hw_tx_obj->id) + sizeof(hw_tx_obj->flags); if (MCP251XFD_SANITIZE_CAN) - len += round_up(can_dlc2len(dlc), sizeof(u32)); + len += round_up(can_fd_dlc2len(dlc), sizeof(u32)); else len += round_up(cfd->len, sizeof(u32)); diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c index c6d5f3f656c3..107b205b77ab 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c @@ -1120,7 +1120,7 @@ static void kvaser_usb_hydra_tx_acknowledge(const struct kvaser_usb *dev, struct net_device_stats *stats = &priv->netdev->stats; stats->tx_packets++; - stats->tx_bytes += can_dlc2len(context->dlc); + stats->tx_bytes += can_fd_dlc2len(context->dlc); } spin_lock_irqsave(&priv->tx_contexts_lock, irq_flags); @@ -1251,7 +1251,7 @@ static void kvaser_usb_hydra_rx_msg_ext(const struct kvaser_usb *dev, kvaser_usb_can_rx_over_error(priv->netdev); if (flags & KVASER_USB_HYDRA_CF_FLAG_FDF) { - cf->len = can_dlc2len(dlc); + cf->len = can_fd_dlc2len(dlc); if (flags & KVASER_USB_HYDRA_CF_FLAG_BRS) cf->flags |= CANFD_BRS; if (flags & KVASER_USB_HYDRA_CF_FLAG_ESI) @@ -1351,7 +1351,7 @@ kvaser_usb_hydra_frame_to_cmd_ext(const struct kvaser_usb_net_priv *priv, struct kvaser_usb *dev = priv->dev; struct kvaser_cmd_ext *cmd; struct canfd_frame *cf = (struct canfd_frame *)skb->data; - u8 dlc = can_len2dlc(cf->len); + u8 dlc = can_fd_len2dlc(cf->len); u8 nbr_of_bytes = cf->len; u32 flags; u32 id; diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index 922280692a8f..761e78d8e647 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -492,7 +492,7 @@ static int pcan_usb_fd_decode_canmsg(struct pcan_usb_fd_if *usb_if, if (rx_msg_flags & PUCAN_MSG_ERROR_STATE_IND) cfd->flags |= CANFD_ESI; - cfd->len = can_dlc2len(pucan_msg_get_dlc(rm)); + cfd->len = 
can_fd_dlc2len(pucan_msg_get_dlc(rm)); } else { /* CAN 2.0 frame case */ skb = alloc_can_skb(netdev, (struct can_frame **)&cfd); @@ -756,7 +756,7 @@ static int pcan_usb_fd_encode_msg(struct peak_usb_device *dev, if (can_is_canfd_skb(skb)) { /* considering a CANFD frame */ - len = can_len2dlc(cfd->len); + len = can_fd_len2dlc(cfd->len); tx_msg_flags |= PUCAN_MSG_EXT_DATA_LEN; diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 88831ce0f2f8..3f54edee92eb 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -583,7 +583,7 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb, id |= XCAN_IDR_SRR_MASK; } - dlc = can_len2dlc(cf->len) << XCAN_DLCR_DLC_SHIFT; + dlc = can_fd_len2dlc(cf->len) << XCAN_DLCR_DLC_SHIFT; if (can_is_canfd_skb(skb)) { if (cf->flags & CANFD_BRS) dlc |= XCAN_DLCR_BRS_MASK; @@ -832,7 +832,7 @@ static int xcanfd_rx(struct net_device *ndev, int frame_base) * format */ if (dlc & XCAN_DLCR_EDL_MASK) - cf->len = can_dlc2len((dlc & XCAN_DLCR_DLC_MASK) >> + cf->len = can_fd_dlc2len((dlc & XCAN_DLCR_DLC_MASK) >> XCAN_DLCR_DLC_SHIFT); else cf->len = can_cc_dlc2len((dlc & XCAN_DLCR_DLC_MASK) >> diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 77da061c21c9..e767a96ae075 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -186,10 +186,10 @@ static inline void can_set_static_ctrlmode(struct net_device *dev, } /* get data length from raw data length code (DLC) */ -u8 can_dlc2len(u8 dlc); +u8 can_fd_dlc2len(u8 dlc); /* map the sanitized data length to an appropriate data length code */ -u8 can_len2dlc(u8 len); +u8 can_fd_len2dlc(u8 len); struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, unsigned int txqs, unsigned int rxqs); -- cgit From e8e73562ce0b24d691ad35df3de34b324248458f Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 10 Nov 2020 16:49:12 +0100 Subject: can: drivers: introduce helpers to access Classical CAN DLC values This patch adds the following helper functions to access Classical CAN DLC values. can_get_cc_dlc(): get the data length code for Classical CAN raw DLC access can_frame_set_cc_len(): set len and len8_dlc value for Classical CAN raw DLC access Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20201110154913.1404582-2-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index e767a96ae075..197a79535cc2 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -170,6 +170,31 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb) return skb->len == CANFD_MTU; } +/* helper to get the data length code (DLC) for Classical CAN raw DLC access */ +static inline u8 can_get_cc_dlc(const struct can_frame *cf, const u32 ctrlmode) +{ + /* return len8_dlc as dlc value only if all conditions apply */ + if ((ctrlmode & CAN_CTRLMODE_CC_LEN8_DLC) && + (cf->len == CAN_MAX_DLEN) && + (cf->len8_dlc > CAN_MAX_DLEN && cf->len8_dlc <= CAN_MAX_RAW_DLC)) + return cf->len8_dlc; + + /* return the payload length as dlc value */ + return cf->len; +} + +/* helper to set len and len8_dlc value for Classical CAN raw DLC access */ +static inline void can_frame_set_cc_len(struct can_frame *cf, const u8 dlc, + const u32 ctrlmode) +{ + /* the caller already ensured that dlc is a value from 0 ..
15 */ + if (ctrlmode & CAN_CTRLMODE_CC_LEN8_DLC && dlc > CAN_MAX_DLEN) + cf->len8_dlc = dlc; + + /* limit the payload length 'len' to CAN_MAX_DLEN */ + cf->len = can_cc_dlc2len(dlc); +} + /* helper to define static CAN controller features at device creation time */ static inline void can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) -- cgit From 757055ae8dedf5333af17b3b5b4b70ba9bc9da4e Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 11 Nov 2020 14:54:49 +0100 Subject: init/console: Use ttynull as a fallback when there is no console stdin, stdout, and stderr standard I/O streams are created for the init process. They are not available when there is no console registered for /dev/console. It might lead to a crash when the init process tries to use them, see the commit 48021f98130880dd742 ("printk: handle blank console arguments passed in."). Normally, ttySX and ttyX consoles are used as a fallback when no consoles are defined via the command line, device tree, or SPCR. But there will be no console registered when an invalid console name is configured or when the configured consoles do not exist on the system. Users even try to avoid the console intentionally, for example, by using console="" or console=null. It is used on production systems where the serial port or terminal are not visible to users. Pushing messages to these consoles would just unnecessarily slow down the system. Make sure that stdin, stdout, stderr, and /dev/console are always available by a fallback to the existing ttynull driver. It has been implemented for exactly this purpose but it was used only when explicitly configured. Reviewed-by: Greg Kroah-Hartman Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20201111135450.11214-2-pmladek@suse.com --- drivers/tty/Kconfig | 14 -------------- drivers/tty/Makefile | 3 +-- drivers/tty/ttynull.c | 18 ++++++++++++++++++ include/linux/console.h | 3 +++ init/main.c | 10 ++++++++-- 5 files changed, 30 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig index 93fd984eb2f5..ca359bbd62f5 100644 --- a/drivers/tty/Kconfig +++ b/drivers/tty/Kconfig @@ -428,20 +428,6 @@ config MIPS_EJTAG_FDC_KGDB_CHAN help FDC channel number to use for KGDB. -config NULL_TTY - tristate "NULL TTY driver" - help - Say Y here if you want a NULL TTY which simply discards messages. - - This is useful to allow userspace applications which expect a console - device to work without modifications even when no console is - available or desired. - - In order to use this driver, you should redirect the console to this - TTY, or boot the kernel with console=ttynull. - - If unsure, say N.
- config TRACE_ROUTER tristate "Trace data router for MIPI P1149.7 cJTAG standard" depends on TRACE_SINK diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile index 020b1cd9294f..f6b6bee0422d 100644 --- a/drivers/tty/Makefile +++ b/drivers/tty/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_TTY) += tty_io.o n_tty.o tty_ioctl.o tty_ldisc.o \ tty_buffer.o tty_port.o tty_mutex.o \ tty_ldsem.o tty_baudrate.o tty_jobctrl.o \ - n_null.o + n_null.o ttynull.o obj-$(CONFIG_LEGACY_PTYS) += pty.o obj-$(CONFIG_UNIX98_PTYS) += pty.o obj-$(CONFIG_AUDIT) += tty_audit.o @@ -25,7 +25,6 @@ obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_MOXA_INTELLIO) += moxa.o obj-$(CONFIG_MOXA_SMARTIO) += mxser.o obj-$(CONFIG_NOZOMI) += nozomi.o -obj-$(CONFIG_NULL_TTY) += ttynull.o obj-$(CONFIG_ROCKETPORT) += rocket.o obj-$(CONFIG_SYNCLINK_GT) += synclink_gt.o obj-$(CONFIG_SYNCLINKMP) += synclinkmp.o diff --git a/drivers/tty/ttynull.c b/drivers/tty/ttynull.c index 17f05b7eb6d3..eced70ec54e1 100644 --- a/drivers/tty/ttynull.c +++ b/drivers/tty/ttynull.c @@ -2,6 +2,13 @@ /* * Copyright (C) 2019 Axis Communications AB * + * The console is useful for userspace applications which expect a console + * device to work without modifications even when no console is available + * or desired. + * + * In order to use this driver, you should redirect the console to this + * TTY, or boot the kernel with console=ttynull. + * * Based on ttyprintk.c: * Copyright (C) 2010 Samo Pogacnik */ @@ -59,6 +66,17 @@ static struct console ttynull_console = { .device = ttynull_device, }; +void __init register_ttynull_console(void) +{ + if (!ttynull_driver) + return; + + if (add_preferred_console(ttynull_console.name, 0, NULL)) + return; + + register_console(&ttynull_console); +} + static int __init ttynull_init(void) { struct tty_driver *driver; diff --git a/include/linux/console.h b/include/linux/console.h index 4b1e26c4cb42..9c662e41cde5 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -187,9 +187,12 @@ extern int braille_register_console(struct console *, int index, extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); +extern void register_ttynull_console(void); #else static inline void console_sysfs_notify(void) { } +static inline void register_ttynull_console(void) +{ } #endif extern bool console_suspend_enabled; diff --git a/init/main.c b/init/main.c index 1af84337cb18..b3fcf446f99e 100644 --- a/init/main.c +++ b/init/main.c @@ -1468,8 +1468,14 @@ void __init console_on_rootfs(void) struct file *file = filp_open("/dev/console", O_RDWR, 0); if (IS_ERR(file)) { - pr_err("Warning: unable to open an initial console.\n"); - return; + pr_err("Warning: unable to open an initial console. Fallback to ttynull.\n"); + register_ttynull_console(); + + file = filp_open("/dev/console", O_RDWR, 0); + if (IS_ERR(file)) { + pr_err("Warning: Failed to add ttynull console. No stdin, stdout, and stderr for the init process!\n"); + return; + } } init_dup(file); init_dup(file); -- cgit From 341917490d7d68d2f7267a265b8820fc3f8ead1b Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Wed, 18 Nov 2020 23:49:20 +0100 Subject: PCI: Decode PCIe 64 GT/s link speed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe r6.0, sec 7.5.3.18, defines a new 64.0 GT/s bit in the Supported Link Speeds Vector of Link Capabilities 2. 
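For reference, the decode added here amounts to a highest-bit-wins scan of the SLS vector. An open-coded sketch of what the PCIE_LNKCAP2_SLS2SPEED macro below evaluates to (illustrative only; the 64 GT/s constant is added by this patch, the remaining PCI_EXP_LNKCAP2_SLS_* bits and speed enum values are pre-existing):

  /* Return the highest link speed advertised in Link Capabilities 2. */
  static enum pci_bus_speed lnkcap2_to_speed(u32 lnkcap2)
  {
  	if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_64_0GB)
  		return PCIE_SPEED_64_0GT;
  	if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_32_0GB)
  		return PCIE_SPEED_32_0GT;
  	if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB)
  		return PCIE_SPEED_16_0GT;
  	if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB)
  		return PCIE_SPEED_8_0GT;
  	if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB)
  		return PCIE_SPEED_5_0GT;
  	if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB)
  		return PCIE_SPEED_2_5GT;
  	return PCI_SPEED_UNKNOWN;
  }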
This patch does not affect the speed of the link, which should be negotiated automatically by the hardware; it only adds decoding when showing the speed to the user. Decode this new speed. Previously, reading the speed of a link operating at this speed showed "Unknown speed" instead of "64.0 GT/s". Link: https://lore.kernel.org/r/aaaab33fe18975e123a84aebce2adb85f44e2bbe.1605739760.git.gustavo.pimentel@synopsys.com Signed-off-by: Gustavo Pimentel Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- drivers/pci/pci.h | 6 ++++-- drivers/pci/probe.c | 3 ++- include/linux/pci.h | 1 + include/uapi/linux/pci_regs.h | 4 ++++ 4 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index f86cae9aa1f4..81bf905b545c 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -294,7 +294,8 @@ void pci_bus_put(struct pci_bus *bus); /* PCIe link information from Link Capabilities 2 */ #define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \ - ((lnkcap2) & PCI_EXP_LNKCAP2_SLS_32_0GB ? PCIE_SPEED_32_0GT : \ + ((lnkcap2) & PCI_EXP_LNKCAP2_SLS_64_0GB ? PCIE_SPEED_64_0GT : \ + (lnkcap2) & PCI_EXP_LNKCAP2_SLS_32_0GB ? PCIE_SPEED_32_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_16_0GB ? PCIE_SPEED_16_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_8_0GB ? PCIE_SPEED_8_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_5_0GB ? PCIE_SPEED_5_0GT : \ @@ -303,7 +304,8 @@ void pci_bus_put(struct pci_bus *bus); /* PCIe speed to Mb/s reduced by encoding overhead */ #define PCIE_SPEED2MBS_ENC(speed) \ - ((speed) == PCIE_SPEED_32_0GT ? 32000*128/130 : \ + ((speed) == PCIE_SPEED_64_0GT ? 64000*128/130 : \ + (speed) == PCIE_SPEED_32_0GT ? 32000*128/130 : \ (speed) == PCIE_SPEED_16_0GT ? 16000*128/130 : \ (speed) == PCIE_SPEED_8_0GT ? 8000*128/130 : \ (speed) == PCIE_SPEED_5_0GT ? 
5000*8/10 : \ diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 4289030b0fff..fe2e00f5fc4c 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -677,7 +677,7 @@ const unsigned char pcie_link_speed[] = { PCIE_SPEED_8_0GT, /* 3 */ PCIE_SPEED_16_0GT, /* 4 */ PCIE_SPEED_32_0GT, /* 5 */ - PCI_SPEED_UNKNOWN, /* 6 */ + PCIE_SPEED_64_0GT, /* 6 */ PCI_SPEED_UNKNOWN, /* 7 */ PCI_SPEED_UNKNOWN, /* 8 */ PCI_SPEED_UNKNOWN, /* 9 */ @@ -719,6 +719,7 @@ const char *pci_speed_string(enum pci_bus_speed speed) "8.0 GT/s PCIe", /* 0x16 */ "16.0 GT/s PCIe", /* 0x17 */ "32.0 GT/s PCIe", /* 0x18 */ + "64.0 GT/s PCIe", /* 0x19 */ }; if (speed < ARRAY_SIZE(speed_strings)) diff --git a/include/linux/pci.h b/include/linux/pci.h index 22207a79762c..e007bc3e8b6e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -281,6 +281,7 @@ enum pci_bus_speed { PCIE_SPEED_8_0GT = 0x16, PCIE_SPEED_16_0GT = 0x17, PCIE_SPEED_32_0GT = 0x18, + PCIE_SPEED_64_0GT = 0x19, PCI_SPEED_UNKNOWN = 0xff, }; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index a95d55f9f257..fe9d5dba2ba1 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -531,6 +531,7 @@ #define PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */ #define PCI_EXP_LNKCAP_SLS_16_0GB 0x00000004 /* LNKCAP2 SLS Vector bit 3 */ #define PCI_EXP_LNKCAP_SLS_32_0GB 0x00000005 /* LNKCAP2 SLS Vector bit 4 */ +#define PCI_EXP_LNKCAP_SLS_64_0GB 0x00000006 /* LNKCAP2 SLS Vector bit 5 */ #define PCI_EXP_LNKCAP_MLW 0x000003f0 /* Maximum Link Width */ #define PCI_EXP_LNKCAP_ASPMS 0x00000c00 /* ASPM Support */ #define PCI_EXP_LNKCAP_ASPM_L0S 0x00000400 /* ASPM L0s Support */ @@ -562,6 +563,7 @@ #define PCI_EXP_LNKSTA_CLS_8_0GB 0x0003 /* Current Link Speed 8.0GT/s */ #define PCI_EXP_LNKSTA_CLS_16_0GB 0x0004 /* Current Link Speed 16.0GT/s */ #define PCI_EXP_LNKSTA_CLS_32_0GB 0x0005 /* Current Link Speed 32.0GT/s */ +#define PCI_EXP_LNKSTA_CLS_64_0GB 0x0006 /* Current Link Speed 64.0GT/s */ #define PCI_EXP_LNKSTA_NLW 0x03f0 /* Negotiated Link Width */ #define PCI_EXP_LNKSTA_NLW_X1 0x0010 /* Current Link Width x1 */ #define PCI_EXP_LNKSTA_NLW_X2 0x0020 /* Current Link Width x2 */ @@ -670,6 +672,7 @@ #define PCI_EXP_LNKCAP2_SLS_8_0GB 0x00000008 /* Supported Speed 8GT/s */ #define PCI_EXP_LNKCAP2_SLS_16_0GB 0x00000010 /* Supported Speed 16GT/s */ #define PCI_EXP_LNKCAP2_SLS_32_0GB 0x00000020 /* Supported Speed 32GT/s */ +#define PCI_EXP_LNKCAP2_SLS_64_0GB 0x00000040 /* Supported Speed 64GT/s */ #define PCI_EXP_LNKCAP2_CROSSLINK 0x00000100 /* Crosslink supported */ #define PCI_EXP_LNKCTL2 48 /* Link Control 2 */ #define PCI_EXP_LNKCTL2_TLS 0x000f @@ -678,6 +681,7 @@ #define PCI_EXP_LNKCTL2_TLS_8_0GT 0x0003 /* Supported Speed 8GT/s */ #define PCI_EXP_LNKCTL2_TLS_16_0GT 0x0004 /* Supported Speed 16GT/s */ #define PCI_EXP_LNKCTL2_TLS_32_0GT 0x0005 /* Supported Speed 32GT/s */ +#define PCI_EXP_LNKCTL2_TLS_64_0GT 0x0006 /* Supported Speed 64GT/s */ #define PCI_EXP_LNKCTL2_ENTER_COMP 0x0010 /* Enter Compliance */ #define PCI_EXP_LNKCTL2_TX_MARGIN 0x0380 /* Transmit Margin */ #define PCI_EXP_LNKCTL2_HASD 0x0020 /* HW Autonomous Speed Disable */ -- cgit From 956fb852181e4e4b16ccad62511e0038d3ed4928 Mon Sep 17 00:00:00 2001 From: Srujana Challa Date: Wed, 18 Nov 2020 17:14:14 +0530 Subject: octeontx2-pf: move lmt flush to include/linux/soc On OcteonTX2 platform CPT instruction enqueue and NIX packet send are only possible via LMTST operations which uses LDEOR instruction. 
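For context, the flush is used on the transmit path in a copy-then-flush retry loop; a hypothetical caller-side sketch (the lmt_addr, io_addr, desc and desc_size names are illustrative and not taken from this patch; a zero result from the LDEOR-based flush indicates the LMTST was aborted and must be redone):

  u64 status;

  /* Stores into the LMT region must complete before the flush. */
  dma_wmb();

  do {
  	memcpy(lmt_addr, desc, desc_size);
  	status = otx2_lmt_flush(io_addr);
  } while (status == 0);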
This patch moves lmt flush function from OcteonTX2 nic driver to include/linux/soc since it will be used by OcteonTX2 CPT and NIC driver for LMTST. Signed-off-by: Suheil Chandran Signed-off-by: Srujana Challa Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 ++ .../ethernet/marvell/octeontx2/nic/otx2_common.h | 13 +--------- include/linux/soc/marvell/octeontx2/asm.h | 29 ++++++++++++++++++++++ 3 files changed, 32 insertions(+), 12 deletions(-) create mode 100644 include/linux/soc/marvell/octeontx2/asm.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index a7e0f11def8e..d36bee3f1f04 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10453,6 +10453,7 @@ M: Srujana Challa L: linux-crypto@vger.kernel.org S: Maintained F: drivers/crypto/marvell/ +F: include/linux/soc/marvell/octeontx2/ MARVELL GIGABIT ETHERNET DRIVERS (skge/sky2) M: Mirko Lindner @@ -10525,6 +10526,7 @@ M: hariprasad L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/marvell/octeontx2/nic/ +F: include/linux/soc/marvell/octeontx2/ MARVELL OCTEONTX2 RVU ADMIN FUNCTION DRIVER M: Sunil Goutham diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index b18b45d02165..ceec649bdd13 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -462,21 +463,9 @@ static inline u64 otx2_atomic64_add(u64 incr, u64 *ptr) return result; } -static inline u64 otx2_lmt_flush(uint64_t addr) -{ - u64 result = 0; - - __asm__ volatile(".cpu generic+lse\n" - "ldeor xzr,%x[rf],[%[rs]]" - : [rf]"=r"(result) - : [rs]"r"(addr)); - return result; -} - #else #define otx2_write128(lo, hi, addr) #define otx2_atomic64_add(incr, ptr) ({ *ptr += incr; }) -#define otx2_lmt_flush(addr) ({ 0; }) #endif /* Alloc pointer from pool/aura */ diff --git a/include/linux/soc/marvell/octeontx2/asm.h b/include/linux/soc/marvell/octeontx2/asm.h new file mode 100644 index 000000000000..ae2279fe830a --- /dev/null +++ b/include/linux/soc/marvell/octeontx2/asm.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * Copyright (C) 2020 Marvell. + */ + +#ifndef __SOC_OTX2_ASM_H +#define __SOC_OTX2_ASM_H + +#if defined(CONFIG_ARM64) +/* + * otx2_lmt_flush is used for LMT store operation. + * On octeontx2 platform CPT instruction enqueue and + * NIX packet send are only possible via LMTST + * operations and it uses LDEOR instruction targeting + * the coprocessor address. + */ +#define otx2_lmt_flush(ioaddr) \ +({ \ + u64 result = 0; \ + __asm__ volatile(".cpu generic+lse\n" \ + "ldeor xzr, %x[rf], [%[rs]]" \ + : [rf]"=r" (result) \ + : [rs]"r" (ioaddr)); \ + (result); \ +}) +#else +#define otx2_lmt_flush(ioaddr) ({ 0; }) +#endif + +#endif /* __SOC_OTX2_ASM_H */ -- cgit From 0d8315dddd2899f519fe1ca3d4d5cdaf44ea421e Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 11 Nov 2020 07:33:54 -0600 Subject: seccomp/cache: Report cache data through /proc/pid/seccomp_cache Currently the kernel does not provide an infrastructure to translate architecture numbers to a human-readable name. Translating syscall numbers to syscall names is possible through FTRACE_SYSCALL infrastructure but it does not provide support for compat syscalls. This will create a file for each PID as /proc/pid/seccomp_cache. 
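A minimal user-space reader for this file might look as follows (a sketch, not part of the patch; the target PID comes from argv and, as noted below, reading requires CAP_SYS_ADMIN):

  #include <stdio.h>

  int main(int argc, char **argv)
  {
  	char path[64], line[128];
  	FILE *f;

  	if (argc < 2)
  		return 1;
  	snprintf(path, sizeof(path), "/proc/%s/seccomp_cache", argv[1]);
  	f = fopen(path, "r");
  	if (!f)
  		return 1;
  	while (fgets(line, sizeof(line), f))
  		fputs(line, stdout);	/* "<arch> <syscall nr> ALLOW|FILTER" */
  	fclose(f);
  	return 0;
  }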
The file will be empty when no seccomp filters are loaded, or be in the format of: <arch name> <decimal syscall num> ALLOW | FILTER, where ALLOW means the cache is guaranteed to allow the syscall, and FILTER means the cache will pass the syscall to the BPF filter. For the docker default profile on x86_64 it looks like: x86_64 0 ALLOW x86_64 1 ALLOW x86_64 2 ALLOW x86_64 3 ALLOW [...] x86_64 132 ALLOW x86_64 133 ALLOW x86_64 134 FILTER x86_64 135 FILTER x86_64 136 FILTER x86_64 137 ALLOW x86_64 138 ALLOW x86_64 139 FILTER x86_64 140 ALLOW x86_64 141 ALLOW [...] This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default of N because I think certain users of seccomp might not want the application to know which syscalls are definitely usable. For the same reason, it is also guarded by CAP_SYS_ADMIN. Suggested-by: Jann Horn Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/ Signed-off-by: YiFei Zhu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/94e663fa53136f5a11f432c661794d1ee7060779.1605101222.git.yifeifz2@illinois.edu --- arch/Kconfig | 17 ++++++++++++++ fs/proc/base.c | 6 +++++ include/linux/seccomp.h | 7 ++++++ kernel/seccomp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 56b6ccc0e32d..35c9463b7d10 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -486,6 +486,9 @@ config HAVE_ARCH_SECCOMP_FILTER - secure_computing return value is checked and a return value of -1 results in the system call being skipped immediately. - seccomp syscall wired up + - if !HAVE_SPARSE_SYSCALL_NR, have SECCOMP_ARCH_NATIVE, + SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE_NAME defined. If + COMPAT is supported, have the SECCOMP_ARCH_COMPAT* defines too. config SECCOMP prompt "Enable seccomp to safely execute untrusted bytecode" @@ -514,6 +517,20 @@ config SECCOMP_FILTER See Documentation/userspace-api/seccomp_filter.rst for details. +config SECCOMP_CACHE_DEBUG + bool "Show seccomp filter cache status in /proc/pid/seccomp_cache" + depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR + depends on PROC_FS + help + This enables the /proc/pid/seccomp_cache interface to monitor + seccomp cache data. The file format is subject to change. Reading + the file requires CAP_SYS_ADMIN. + + This option is for debugging only. Enabling presents the risk that + an adversary may be able to infer the seccomp filter logic. + + If unsure, say N.
+ config HAVE_ARCH_STACKLEAK bool help diff --git a/fs/proc/base.c b/fs/proc/base.c index b362523a9829..8a7d682ba881 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 02aef2844c38..76963ec4641a 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -121,4 +121,11 @@ static inline long seccomp_get_metadata(struct task_struct *task, return -EINVAL; } #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */ + +#ifdef CONFIG_SECCOMP_CACHE_DEBUG +struct seq_file; + +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +#endif #endif /* _LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d8cf468dbe1e..76f524e320b1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -553,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; + /* We are effectively holding the siglock by not having any sighand. */ + WARN_ON(tsk->sighand != NULL); + /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; __seccomp_filter_release(orig); @@ -2335,3 +2338,59 @@ static int __init seccomp_sysctl_init(void) device_initcall(seccomp_sysctl_init) #endif /* CONFIG_SYSCTL */ + +#ifdef CONFIG_SECCOMP_CACHE_DEBUG +/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */ +static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name, + const void *bitmap, size_t bitmap_size) +{ + int nr; + + for (nr = 0; nr < bitmap_size; nr++) { + bool cached = test_bit(nr, bitmap); + char *status = cached ? "ALLOW" : "FILTER"; + + seq_printf(m, "%s %d %s\n", name, nr, status); + } +} + +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct seccomp_filter *f; + unsigned long flags; + + /* + * We don't want some sandboxed process to know what their seccomp + * filters consist of. 
+ */ + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EACCES; + + if (!lock_task_sighand(task, &flags)) + return -ESRCH; + + f = READ_ONCE(task->seccomp.filter); + if (!f) { + unlock_task_sighand(task, &flags); + return 0; + } + + /* prevent filter from being freed while we are printing it */ + __get_seccomp_filter(f); + unlock_task_sighand(task, &flags); + + proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME, + f->cache.allow_native, + SECCOMP_ARCH_NATIVE_NR); + +#ifdef SECCOMP_ARCH_COMPAT + proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME, + f->cache.allow_compat, + SECCOMP_ARCH_COMPAT_NR); +#endif /* SECCOMP_ARCH_COMPAT */ + + __put_seccomp_filter(f); + return 0; +} +#endif /* CONFIG_SECCOMP_CACHE_DEBUG */ -- cgit From 4ae21993f07422ec1cb83e9530f87fa61bff02bd Mon Sep 17 00:00:00 2001 From: Antonio Cardace Date: Wed, 18 Nov 2020 21:45:17 +0100 Subject: ethtool: add ETHTOOL_COALESCE_ALL_PARAMS define This bitmask represents all existing coalesce parameters. Signed-off-by: Antonio Cardace Reviewed-by: Michal Kubecek Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 6408b446051f..e3da25b51ae4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -215,6 +215,7 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, #define ETHTOOL_COALESCE_TX_USECS_HIGH BIT(19) #define ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH BIT(20) #define ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL BIT(21) +#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(21, 0) #define ETHTOOL_COALESCE_USECS \ (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) -- cgit From 5164c788985702ad94fa4ce6adca29bf280876df Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Tue, 29 Sep 2020 15:59:43 +0300 Subject: iio: triggered-buffer: add {devm_}iio_triggered_buffer_setup_ext variants This change adds a parameter to the {devm_}iio_triggered_buffer_setup() functions to assign the extra sysfs buffer attributes that are typically assigned via iio_buffer_set_attrs(). The functions also get renamed to iio_triggered_buffer_setup_ext() & devm_iio_triggered_buffer_setup_ext(). For backwards compatibility the old {devm_}iio_triggered_buffer_setup() functions are now macros that wrap the new (renamed) functions with NULL for the buffer attrs. The aim is to remove iio_buffer_set_attrs(), so in the iio_triggered_buffer_setup_ext() function the attributes are assigned directly to 'buffer->attrs'. When adding multiple IIO buffers per IIO device, it can be pretty cumbersome to first allocate a set of buffers, then to dig them out of IIO to assign extra attributes (with iio_buffer_set_attrs()). Naturally, the best way would be to provide them at allocation time, which is what this change does. At this moment, buffers allocated with {devm_}iio_triggered_buffer_setup() are the only ones in mainline IIO to call iio_buffer_set_attrs().
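As an illustration, a driver can now pass its extra sysfs buffer attributes at setup time (a sketch with hypothetical attribute and handler names; the _ext signature is the one added below, and iio_pollfunc_store_time is the stock IIO top-half):

  static const struct attribute *my_buffer_attrs[] = {
  	&dev_attr_my_watermark.attr,	/* hypothetical extra attribute */
  	NULL,
  };

  ret = devm_iio_triggered_buffer_setup_ext(dev, indio_dev,
  					    iio_pollfunc_store_time,
  					    my_trigger_handler,
  					    NULL, my_buffer_attrs);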
Signed-off-by: Alexandru Ardelean Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200929125949.69934-4-alexandru.ardelean@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/buffer/industrialio-triggered-buffer.c | 31 +++++++++++++--------- include/linux/iio/triggered_buffer.h | 23 +++++++++++----- 2 files changed, 35 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/buffer/industrialio-triggered-buffer.c b/drivers/iio/buffer/industrialio-triggered-buffer.c index 6c20a83f887e..92b8aea3e063 100644 --- a/drivers/iio/buffer/industrialio-triggered-buffer.c +++ b/drivers/iio/buffer/industrialio-triggered-buffer.c @@ -9,18 +9,20 @@ #include #include #include +#include #include #include #include /** - * iio_triggered_buffer_setup() - Setup triggered buffer and pollfunc + * iio_triggered_buffer_setup_ext() - Setup triggered buffer and pollfunc * @indio_dev: IIO device structure * @h: Function which will be used as pollfunc top half * @thread: Function which will be used as pollfunc bottom half * @setup_ops: Buffer setup functions to use for this device. * If NULL the default setup functions for triggered * buffers will be used. + * @buffer_attrs: Extra sysfs buffer attributes for this IIO buffer * * This function combines some common tasks which will normally be performed * when setting up a triggered buffer. It will allocate the buffer and the @@ -33,10 +35,11 @@ * To free the resources allocated by this function call * iio_triggered_buffer_cleanup(). */ -int iio_triggered_buffer_setup(struct iio_dev *indio_dev, +int iio_triggered_buffer_setup_ext(struct iio_dev *indio_dev, irqreturn_t (*h)(int irq, void *p), irqreturn_t (*thread)(int irq, void *p), - const struct iio_buffer_setup_ops *setup_ops) + const struct iio_buffer_setup_ops *setup_ops, + const struct attribute **buffer_attrs) { struct iio_buffer *buffer; int ret; @@ -67,6 +70,8 @@ int iio_triggered_buffer_setup(struct iio_dev *indio_dev, /* Flag that polled ring buffering is possible */ indio_dev->modes |= INDIO_BUFFER_TRIGGERED; + buffer->attrs = buffer_attrs; + return 0; error_kfifo_free: @@ -74,10 +79,10 @@ error_kfifo_free: error_ret: return ret; } -EXPORT_SYMBOL(iio_triggered_buffer_setup); +EXPORT_SYMBOL(iio_triggered_buffer_setup_ext); /** - * iio_triggered_buffer_cleanup() - Free resources allocated by iio_triggered_buffer_setup() + * iio_triggered_buffer_cleanup() - Free resources allocated by iio_triggered_buffer_setup_ext() * @indio_dev: IIO device structure */ void iio_triggered_buffer_cleanup(struct iio_dev *indio_dev) @@ -92,11 +97,12 @@ static void devm_iio_triggered_buffer_clean(struct device *dev, void *res) iio_triggered_buffer_cleanup(*(struct iio_dev **)res); } -int devm_iio_triggered_buffer_setup(struct device *dev, - struct iio_dev *indio_dev, - irqreturn_t (*h)(int irq, void *p), - irqreturn_t (*thread)(int irq, void *p), - const struct iio_buffer_setup_ops *ops) +int devm_iio_triggered_buffer_setup_ext(struct device *dev, + struct iio_dev *indio_dev, + irqreturn_t (*h)(int irq, void *p), + irqreturn_t (*thread)(int irq, void *p), + const struct iio_buffer_setup_ops *ops, + const struct attribute **buffer_attrs) { struct iio_dev **ptr; int ret; @@ -108,7 +114,8 @@ int devm_iio_triggered_buffer_setup(struct device *dev, *ptr = indio_dev; - ret = iio_triggered_buffer_setup(indio_dev, h, thread, ops); + ret = iio_triggered_buffer_setup_ext(indio_dev, h, thread, ops, + buffer_attrs); if (!ret) devres_add(dev, ptr); else @@ -116,7 +123,7 @@ int 
devm_iio_triggered_buffer_setup(struct device *dev, return ret; } -EXPORT_SYMBOL_GPL(devm_iio_triggered_buffer_setup); +EXPORT_SYMBOL_GPL(devm_iio_triggered_buffer_setup_ext); MODULE_AUTHOR("Lars-Peter Clausen "); MODULE_DESCRIPTION("IIO helper functions for setting up triggered buffers"); diff --git a/include/linux/iio/triggered_buffer.h b/include/linux/iio/triggered_buffer.h index e99c91799359..7f154d1f8739 100644 --- a/include/linux/iio/triggered_buffer.h +++ b/include/linux/iio/triggered_buffer.h @@ -4,19 +4,28 @@ #include +struct attribute; struct iio_dev; struct iio_buffer_setup_ops; -int iio_triggered_buffer_setup(struct iio_dev *indio_dev, +int iio_triggered_buffer_setup_ext(struct iio_dev *indio_dev, irqreturn_t (*h)(int irq, void *p), irqreturn_t (*thread)(int irq, void *p), - const struct iio_buffer_setup_ops *setup_ops); + const struct iio_buffer_setup_ops *setup_ops, + const struct attribute **buffer_attrs); void iio_triggered_buffer_cleanup(struct iio_dev *indio_dev); -int devm_iio_triggered_buffer_setup(struct device *dev, - struct iio_dev *indio_dev, - irqreturn_t (*h)(int irq, void *p), - irqreturn_t (*thread)(int irq, void *p), - const struct iio_buffer_setup_ops *ops); +#define iio_triggered_buffer_setup(indio_dev, h, thread, setup_ops) \ + iio_triggered_buffer_setup_ext((indio_dev), (h), (thread), (setup_ops), NULL) + +int devm_iio_triggered_buffer_setup_ext(struct device *dev, + struct iio_dev *indio_dev, + irqreturn_t (*h)(int irq, void *p), + irqreturn_t (*thread)(int irq, void *p), + const struct iio_buffer_setup_ops *ops, + const struct attribute **buffer_attrs); + +#define devm_iio_triggered_buffer_setup(dev, indio_dev, h, thread, setup_ops) \ + devm_iio_triggered_buffer_setup_ext((dev), (indio_dev), (h), (thread), (setup_ops), NULL) #endif -- cgit From 21232b4456ba5e1eea7385bd3c4b1994994fd409 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Tue, 29 Sep 2020 15:59:49 +0300 Subject: iio: buffer: remove iio_buffer_set_attrs() helper The iio_buffer_set_attrs() is no longer used in the drivers, so it can be removed now. 
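For any remaining external user the conversion is mechanical (a sketch with hypothetical handler and attribute names):

  /* Before: two-step setup plus attribute assignment */
  ret = iio_triggered_buffer_setup(indio_dev, NULL, my_trigger_handler, NULL);
  if (ret)
  	return ret;
  iio_buffer_set_attrs(indio_dev->buffer, my_buffer_attrs);

  /* After: attributes are passed at allocation time */
  ret = iio_triggered_buffer_setup_ext(indio_dev, NULL, my_trigger_handler,
  				       NULL, my_buffer_attrs);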
Signed-off-by: Alexandru Ardelean Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200929125949.69934-10-alexandru.ardelean@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-buffer.c | 12 ------------ include/linux/iio/buffer.h | 3 --- 2 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index a4f6bb96d4f4..9663dec3dcf3 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -210,18 +210,6 @@ void iio_buffer_init(struct iio_buffer *buffer) } EXPORT_SYMBOL(iio_buffer_init); -/** - * iio_buffer_set_attrs - Set buffer specific attributes - * @buffer: The buffer for which we are setting attributes - * @attrs: Pointer to a null terminated list of pointers to attributes - */ -void iio_buffer_set_attrs(struct iio_buffer *buffer, - const struct attribute **attrs) -{ - buffer->attrs = attrs; -} -EXPORT_SYMBOL_GPL(iio_buffer_set_attrs); - static ssize_t iio_show_scan_index(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index fbba4093f06c..8febc23f5f26 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -11,9 +11,6 @@ struct iio_buffer; -void iio_buffer_set_attrs(struct iio_buffer *buffer, - const struct attribute **attrs); - int iio_push_to_buffers(struct iio_dev *indio_dev, const void *data); /** -- cgit From e2083d36739168f7b612312160cf7bb45b251408 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Thu, 19 Nov 2020 17:49:04 +0000 Subject: firmware: arm_scmi: Add SCMI v3.0 sensors timestamped reads Add new .reading_get_timestamped() method to sensor_ops to support SCMI v3.0 timestamped reads. Link: https://lore.kernel.org/r/20201119174906.43862-5-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/sensors.c | 127 ++++++++++++++++++++++++++++++++++-- include/linux/scmi_protocol.h | 22 +++++++ 2 files changed, 143 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c index a85827f60a02..2239af5f9e6e 100644 --- a/drivers/firmware/arm_scmi/sensors.c +++ b/drivers/firmware/arm_scmi/sensors.c @@ -155,6 +155,23 @@ struct scmi_msg_sensor_reading_get { #define SENSOR_READ_ASYNC BIT(0) }; +struct scmi_resp_sensor_reading_complete { + __le32 id; + __le64 readings; +}; + +struct scmi_sensor_reading_le { + __le32 sensor_value_low; + __le32 sensor_value_high; + __le32 timestamp_low; + __le32 timestamp_high; +}; + +struct scmi_resp_sensor_reading_complete_v3 { + __le32 id; + struct scmi_sensor_reading_le readings[]; +}; + struct scmi_sensor_trip_notify_payld { __le32 agent_id; __le32 sensor_id; @@ -575,6 +592,21 @@ scmi_sensor_trip_point_config(const struct scmi_handle *handle, u32 sensor_id, return ret; } +/** + * scmi_sensor_reading_get - Read scalar sensor value + * @handle: Platform handle + * @sensor_id: Sensor ID + * @value: The 64bit value sensor reading + * + * This function returns a single 64 bit reading value representing the sensor + * value; if the platform SCMI Protocol implementation and the sensor support + * multiple axis and timestamped-reads, this just returns the first axis while + * dropping the timestamp value. + * Use instead the @scmi_sensor_reading_get_timestamped to retrieve the array of + * timestamped multi-axis values. 
+ * + * Return: 0 on Success + */ static int scmi_sensor_reading_get(const struct scmi_handle *handle, u32 sensor_id, u64 *value) { @@ -585,20 +617,24 @@ static int scmi_sensor_reading_get(const struct scmi_handle *handle, struct scmi_sensor_info *s = si->sensors + sensor_id; ret = scmi_xfer_get_init(handle, SENSOR_READING_GET, - SCMI_PROTOCOL_SENSOR, sizeof(*sensor), - sizeof(u64), &t); + SCMI_PROTOCOL_SENSOR, sizeof(*sensor), 0, &t); if (ret) return ret; sensor = t->tx.buf; sensor->id = cpu_to_le32(sensor_id); - if (s->async) { sensor->flags = cpu_to_le32(SENSOR_READ_ASYNC); ret = scmi_do_xfer_with_response(handle, t); - if (!ret) - *value = get_unaligned_le64((void *) - ((__le32 *)t->rx.buf + 1)); + if (!ret) { + struct scmi_resp_sensor_reading_complete *resp; + + resp = t->rx.buf; + if (le32_to_cpu(resp->id) == sensor_id) + *value = get_unaligned_le64(&resp->readings); + else + ret = -EPROTO; + } } else { sensor->flags = cpu_to_le32(0); ret = scmi_do_xfer(handle, t); @@ -610,6 +646,84 @@ static int scmi_sensor_reading_get(const struct scmi_handle *handle, return ret; } +static inline void +scmi_parse_sensor_readings(struct scmi_sensor_reading *out, + const struct scmi_sensor_reading_le *in) +{ + out->value = get_unaligned_le64((void *)&in->sensor_value_low); + out->timestamp = get_unaligned_le64((void *)&in->timestamp_low); +} + +/** + * scmi_sensor_reading_get_timestamped - Read multiple-axis timestamped values + * @handle: Platform handle + * @sensor_id: Sensor ID + * @count: The length of the provided @readings array + * @readings: An array of elements each representing a timestamped per-axis + * reading of type @struct scmi_sensor_reading. + * Returned readings are ordered as the @axis descriptors array + * included in @struct scmi_sensor_info and the max number of + * returned elements is min(@count, @num_axis); ideally the provided + * array should be of length @count equal to @num_axis. 
+ * + * Return: 0 on Success + */ +static int +scmi_sensor_reading_get_timestamped(const struct scmi_handle *handle, + u32 sensor_id, u8 count, + struct scmi_sensor_reading *readings) +{ + int ret; + struct scmi_xfer *t; + struct scmi_msg_sensor_reading_get *sensor; + struct sensors_info *si = handle->sensor_priv; + struct scmi_sensor_info *s = si->sensors + sensor_id; + + if (!count || !readings || + (!s->num_axis && count > 1) || (s->num_axis && count > s->num_axis)) + return -EINVAL; + + ret = scmi_xfer_get_init(handle, SENSOR_READING_GET, + SCMI_PROTOCOL_SENSOR, sizeof(*sensor), 0, &t); + if (ret) + return ret; + + sensor = t->tx.buf; + sensor->id = cpu_to_le32(sensor_id); + if (s->async) { + sensor->flags = cpu_to_le32(SENSOR_READ_ASYNC); + ret = scmi_do_xfer_with_response(handle, t); + if (!ret) { + int i; + struct scmi_resp_sensor_reading_complete_v3 *resp; + + resp = t->rx.buf; + /* Retrieve only the number of requested axis anyway */ + if (le32_to_cpu(resp->id) == sensor_id) + for (i = 0; i < count; i++) + scmi_parse_sensor_readings(&readings[i], + &resp->readings[i]); + else + ret = -EPROTO; + } + } else { + sensor->flags = cpu_to_le32(0); + ret = scmi_do_xfer(handle, t); + if (!ret) { + int i; + struct scmi_sensor_reading_le *resp_readings; + + resp_readings = t->rx.buf; + for (i = 0; i < count; i++) + scmi_parse_sensor_readings(&readings[i], + &resp_readings[i]); + } + } + + scmi_xfer_put(handle, t); + return ret; +} + static const struct scmi_sensor_info * scmi_sensor_info_get(const struct scmi_handle *handle, u32 sensor_id) { @@ -630,6 +744,7 @@ static const struct scmi_sensor_ops sensor_ops = { .info_get = scmi_sensor_info_get, .trip_point_config = scmi_sensor_trip_point_config, .reading_get = scmi_sensor_reading_get, + .reading_get_timestamped = scmi_sensor_reading_get_timestamped, }; static int scmi_sensor_set_notify_enabled(const struct scmi_handle *handle, diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 0792b0be25a3..0c52bf0cbee4 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -149,6 +149,20 @@ struct scmi_power_ops { u32 *state); }; +/** + * scmi_sensor_reading - represent a timestamped read + * + * Used by @reading_get_timestamped method. + * + * @value: The signed value sensor read. + * @timestamp: An unsigned timestamp for the sensor read, as provided by + * SCMI platform. Set to zero when not available. + */ +struct scmi_sensor_reading { + long long value; + unsigned long long timestamp; +}; + /** * scmi_range_attrs - specifies a sensor or axis values' range * @min_range: The minimum value which can be represented by the sensor/axis. @@ -390,6 +404,11 @@ enum scmi_sensor_class { * @info_get: get the information of the specified sensor * @trip_point_config: selects and configures a trip-point of interest * @reading_get: gets the current value of the sensor + * @reading_get_timestamped: gets the current value and timestamp, when + * available, of the sensor. 
(as of v3.0 spec) + * Supports multi-axis sensors for sensors which + * support it and if the @readings array size of + * @count entries equals the sensor num_axis */ struct scmi_sensor_ops { int (*count_get)(const struct scmi_handle *handle); @@ -399,6 +418,9 @@ struct scmi_sensor_ops { u32 sensor_id, u8 trip_id, u64 trip_value); int (*reading_get)(const struct scmi_handle *handle, u32 sensor_id, u64 *value); + int (*reading_get_timestamped)(const struct scmi_handle *handle, + u32 sensor_id, u8 count, + struct scmi_sensor_reading *readings); }; /** -- cgit From 7b83c5f41088987d04e24c3af0e1fb9f43b747b5 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Thu, 19 Nov 2020 17:49:05 +0000 Subject: firmware: arm_scmi: Add SCMI v3.0 sensor configuration support Add SCMI v3.0 sensor support for CONFIG_GET/CONFIG_SET commands. Link: https://lore.kernel.org/r/20201119174906.43862-6-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/sensors.c | 63 +++++++++++++++++++++++++++++++++++++ include/linux/scmi_protocol.h | 37 ++++++++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c index 2239af5f9e6e..10c271d430e7 100644 --- a/drivers/firmware/arm_scmi/sensors.c +++ b/drivers/firmware/arm_scmi/sensors.c @@ -23,6 +23,8 @@ enum scmi_sensor_protocol_cmd { SENSOR_READING_GET = 0x6, SENSOR_AXIS_DESCRIPTION_GET = 0x7, SENSOR_LIST_UPDATE_INTERVALS = 0x8, + SENSOR_CONFIG_GET = 0x9, + SENSOR_CONFIG_SET = 0xA, }; struct scmi_msg_resp_sensor_attributes { @@ -149,6 +151,11 @@ struct scmi_msg_set_sensor_trip_point { __le32 value_high; }; +struct scmi_msg_sensor_config_set { + __le32 id; + __le32 sensor_config; +}; + struct scmi_msg_sensor_reading_get { __le32 id; __le32 flags; @@ -592,6 +599,60 @@ scmi_sensor_trip_point_config(const struct scmi_handle *handle, u32 sensor_id, return ret; } +static int scmi_sensor_config_get(const struct scmi_handle *handle, + u32 sensor_id, u32 *sensor_config) +{ + int ret; + struct scmi_xfer *t; + + ret = scmi_xfer_get_init(handle, SENSOR_CONFIG_GET, + SCMI_PROTOCOL_SENSOR, sizeof(__le32), + sizeof(__le32), &t); + if (ret) + return ret; + + put_unaligned_le32(sensor_id, t->tx.buf); + ret = scmi_do_xfer(handle, t); + if (!ret) { + struct sensors_info *si = handle->sensor_priv; + struct scmi_sensor_info *s = si->sensors + sensor_id; + + *sensor_config = get_unaligned_le32(t->rx.buf); + s->sensor_config = *sensor_config; + } + + scmi_xfer_put(handle, t); + return ret; +} + +static int scmi_sensor_config_set(const struct scmi_handle *handle, + u32 sensor_id, u32 sensor_config) +{ + int ret; + struct scmi_xfer *t; + struct scmi_msg_sensor_config_set *msg; + + ret = scmi_xfer_get_init(handle, SENSOR_CONFIG_SET, + SCMI_PROTOCOL_SENSOR, sizeof(*msg), 0, &t); + if (ret) + return ret; + + msg = t->tx.buf; + msg->id = cpu_to_le32(sensor_id); + msg->sensor_config = cpu_to_le32(sensor_config); + + ret = scmi_do_xfer(handle, t); + if (!ret) { + struct sensors_info *si = handle->sensor_priv; + struct scmi_sensor_info *s = si->sensors + sensor_id; + + s->sensor_config = sensor_config; + } + + scmi_xfer_put(handle, t); + return ret; +} + /** * scmi_sensor_reading_get - Read scalar sensor value * @handle: Platform handle @@ -745,6 +806,8 @@ static const struct scmi_sensor_ops sensor_ops = { .trip_point_config = scmi_sensor_trip_point_config, .reading_get = scmi_sensor_reading_get, .reading_get_timestamped =
scmi_sensor_reading_get_timestamped, + .config_get = scmi_sensor_config_get, + .config_set = scmi_sensor_config_set, }; static int scmi_sensor_set_notify_enabled(const struct scmi_handle *handle, diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 0c52bf0cbee4..7e9e2cd3d46b 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -286,7 +286,38 @@ struct scmi_sensor_info { unsigned int num_axis; struct scmi_sensor_axis_info *axis; struct scmi_sensor_intervals_info intervals; + unsigned int sensor_config; +#define SCMI_SENS_CFG_UPDATE_SECS_MASK GENMASK(31, 16) +#define SCMI_SENS_CFG_GET_UPDATE_SECS(x) \ + FIELD_GET(SCMI_SENS_CFG_UPDATE_SECS_MASK, (x)) + +#define SCMI_SENS_CFG_UPDATE_EXP_MASK GENMASK(15, 11) +#define SCMI_SENS_CFG_GET_UPDATE_EXP(x) \ + ({ \ + int __signed_exp = \ + FIELD_GET(SCMI_SENS_CFG_UPDATE_EXP_MASK, (x)); \ + \ + if (__signed_exp & BIT(4)) \ + __signed_exp |= GENMASK(31, 5); \ + __signed_exp; \ + }) + +#define SCMI_SENS_CFG_ROUND_MASK GENMASK(10, 9) +#define SCMI_SENS_CFG_ROUND_AUTO 2 +#define SCMI_SENS_CFG_ROUND_UP 1 +#define SCMI_SENS_CFG_ROUND_DOWN 0 + +#define SCMI_SENS_CFG_TSTAMP_ENABLED_MASK BIT(1) +#define SCMI_SENS_CFG_TSTAMP_ENABLE 1 +#define SCMI_SENS_CFG_TSTAMP_DISABLE 0 +#define SCMI_SENS_CFG_IS_TSTAMP_ENABLED(x) \ + FIELD_GET(SCMI_SENS_CFG_TSTAMP_ENABLED_MASK, (x)) + +#define SCMI_SENS_CFG_SENSOR_ENABLED_MASK BIT(0) +#define SCMI_SENS_CFG_SENSOR_ENABLE 1 +#define SCMI_SENS_CFG_SENSOR_DISABLE 0 char name[SCMI_MAX_STR_SIZE]; +#define SCMI_SENS_CFG_IS_ENABLED(x) FIELD_GET(BIT(0), (x)) bool extended_scalar_attrs; unsigned int sensor_power; unsigned int resolution; @@ -409,6 +440,8 @@ enum scmi_sensor_class { * Supports multi-axis sensors for sensors which * supports it and if the @reading array size of * @count entry equals the sensor num_axis + * @config_get: Get sensor current configuration + * @config_set: Set sensor current configuration */ struct scmi_sensor_ops { int (*count_get)(const struct scmi_handle *handle); @@ -421,6 +454,10 @@ struct scmi_sensor_ops { int (*reading_get_timestamped)(const struct scmi_handle *handle, u32 sensor_id, u8 count, struct scmi_sensor_reading *readings); + int (*config_get)(const struct scmi_handle *handle, + u32 sensor_id, u32 *sensor_config); + int (*config_set)(const struct scmi_handle *handle, + u32 sensor_id, u32 sensor_config); }; /** -- cgit From e3811190acf85c63518fbddaa28bcbfab2baa58d Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Thu, 19 Nov 2020 17:49:06 +0000 Subject: firmware: arm_scmi: Add SCMI v3.0 sensor notifications Add support for new SCMI v3.0 SENSOR_UPDATE notification. 
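For illustration only, a possible kernel consumer of the new event could look like the sketch below (hypothetical driver code, not part of this patch; the callback name and the watched sensor ID are invented for the example, and it assumes the notify_ops registration interface already present in this tree):

    #include <linux/notifier.h>
    #include <linux/scmi_protocol.h>

    /* Invoked for every SENSOR_UPDATE event delivered for the watched sensor. */
    static int sensor_update_cb(struct notifier_block *nb,
                                unsigned long event, void *data)
    {
            struct scmi_sensor_update_report *r = data;
            int i;

            for (i = 0; i < r->readings_count; i++)
                    pr_info("sensor %u axis %d: value %lld ts %llu\n",
                            r->sensor_id, i, r->readings[i].value,
                            r->readings[i].timestamp);

            return NOTIFY_OK;
    }

    static struct notifier_block sensor_update_nb = {
            .notifier_call = sensor_update_cb,
    };

    static int watch_sensor_updates(const struct scmi_handle *handle)
    {
            u32 src_id = 0; /* watch sensor 0 */

            return handle->notify_ops->register_event_notifier(handle,
                            SCMI_PROTOCOL_SENSOR, SCMI_EVENT_SENSOR_UPDATE,
                            &src_id, &sensor_update_nb);
    }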
Link: https://lore.kernel.org/r/20201119174906.43862-7-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/sensors.c | 124 ++++++++++++++++++++++++++++++------ include/linux/scmi_protocol.h | 9 +++ 2 files changed, 114 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c index 10c271d430e7..b3d7c08c09a0 100644 --- a/drivers/firmware/arm_scmi/sensors.c +++ b/drivers/firmware/arm_scmi/sensors.c @@ -25,6 +25,7 @@ enum scmi_sensor_protocol_cmd { SENSOR_LIST_UPDATE_INTERVALS = 0x8, SENSOR_CONFIG_GET = 0x9, SENSOR_CONFIG_SET = 0xA, + SENSOR_CONTINUOUS_UPDATE_NOTIFY = 0xB, }; struct scmi_msg_resp_sensor_attributes { @@ -132,10 +133,10 @@ struct scmi_msg_resp_sensor_list_update_intervals { __le32 intervals[]; }; -struct scmi_msg_sensor_trip_point_notify { +struct scmi_msg_sensor_request_notify { __le32 id; __le32 event_control; -#define SENSOR_TP_NOTIFY_ALL BIT(0) +#define SENSOR_NOTIFY_ALL BIT(0) }; struct scmi_msg_set_sensor_trip_point { @@ -185,6 +186,12 @@ struct scmi_sensor_trip_notify_payld { __le32 trip_point_desc; }; +struct scmi_sensor_update_notify_payld { + __le32 agent_id; + __le32 sensor_id; + struct scmi_sensor_reading_le readings[]; +}; + struct sensors_info { u32 version; int num_sensors; @@ -550,15 +557,16 @@ out: return ret; } -static int scmi_sensor_trip_point_notify(const struct scmi_handle *handle, - u32 sensor_id, bool enable) +static inline int +scmi_sensor_request_notify(const struct scmi_handle *handle, u32 sensor_id, + u8 message_id, bool enable) { int ret; - u32 evt_cntl = enable ? SENSOR_TP_NOTIFY_ALL : 0; + u32 evt_cntl = enable ? SENSOR_NOTIFY_ALL : 0; struct scmi_xfer *t; - struct scmi_msg_sensor_trip_point_notify *cfg; + struct scmi_msg_sensor_request_notify *cfg; - ret = scmi_xfer_get_init(handle, SENSOR_TRIP_POINT_NOTIFY, + ret = scmi_xfer_get_init(handle, message_id, SCMI_PROTOCOL_SENSOR, sizeof(*cfg), 0, &t); if (ret) return ret; @@ -573,6 +581,23 @@ static int scmi_sensor_trip_point_notify(const struct scmi_handle *handle, return ret; } +static int scmi_sensor_trip_point_notify(const struct scmi_handle *handle, + u32 sensor_id, bool enable) +{ + return scmi_sensor_request_notify(handle, sensor_id, + SENSOR_TRIP_POINT_NOTIFY, + enable); +} + +static int +scmi_sensor_continuous_update_notify(const struct scmi_handle *handle, + u32 sensor_id, bool enable) +{ + return scmi_sensor_request_notify(handle, sensor_id, + SENSOR_CONTINUOUS_UPDATE_NOTIFY, + enable); +} + static int scmi_sensor_trip_point_config(const struct scmi_handle *handle, u32 sensor_id, u8 trip_id, u64 trip_value) @@ -815,7 +840,19 @@ static int scmi_sensor_set_notify_enabled(const struct scmi_handle *handle, { int ret; - ret = scmi_sensor_trip_point_notify(handle, src_id, enable); + switch (evt_id) { + case SCMI_EVENT_SENSOR_TRIP_POINT_EVENT: + ret = scmi_sensor_trip_point_notify(handle, src_id, enable); + break; + case SCMI_EVENT_SENSOR_UPDATE: + ret = scmi_sensor_continuous_update_notify(handle, src_id, + enable); + break; + default: + ret = -EINVAL; + break; + } + if (ret) pr_debug("FAIL_ENABLED - evt[%X] dom[%d] - ret:%d\n", evt_id, src_id, ret); @@ -828,20 +865,59 @@ static void *scmi_sensor_fill_custom_report(const struct scmi_handle *handle, const void *payld, size_t payld_sz, void *report, u32 *src_id) { - const struct scmi_sensor_trip_notify_payld *p = payld; - struct scmi_sensor_trip_point_report *r = report; + void *rep = NULL; - if 
(evt_id != SCMI_EVENT_SENSOR_TRIP_POINT_EVENT || - sizeof(*p) != payld_sz) - return NULL; + switch (evt_id) { + case SCMI_EVENT_SENSOR_TRIP_POINT_EVENT: + { + const struct scmi_sensor_trip_notify_payld *p = payld; + struct scmi_sensor_trip_point_report *r = report; - r->timestamp = timestamp; - r->agent_id = le32_to_cpu(p->agent_id); - r->sensor_id = le32_to_cpu(p->sensor_id); - r->trip_point_desc = le32_to_cpu(p->trip_point_desc); - *src_id = r->sensor_id; + if (sizeof(*p) != payld_sz) + break; - return r; + r->timestamp = timestamp; + r->agent_id = le32_to_cpu(p->agent_id); + r->sensor_id = le32_to_cpu(p->sensor_id); + r->trip_point_desc = le32_to_cpu(p->trip_point_desc); + *src_id = r->sensor_id; + rep = r; + break; + } + case SCMI_EVENT_SENSOR_UPDATE: + { + int i; + struct scmi_sensor_info *s; + const struct scmi_sensor_update_notify_payld *p = payld; + struct scmi_sensor_update_report *r = report; + struct sensors_info *sinfo = handle->sensor_priv; + + /* payld_sz is variable for this event */ + r->sensor_id = le32_to_cpu(p->sensor_id); + if (r->sensor_id >= sinfo->num_sensors) + break; + r->timestamp = timestamp; + r->agent_id = le32_to_cpu(p->agent_id); + s = &sinfo->sensors[r->sensor_id]; + /* + * The generated report r (@struct scmi_sensor_update_report) + * was pre-allocated to contain up to SCMI_MAX_NUM_SENSOR_AXIS + * readings: here it is filled with the effective @num_axis + * readings defined for this sensor or 1 for scalar sensors. + */ + r->readings_count = s->num_axis ?: 1; + for (i = 0; i < r->readings_count; i++) + scmi_parse_sensor_readings(&r->readings[i], + &p->readings[i]); + *src_id = r->sensor_id; + rep = r; + break; + } + default: + break; + } + + return rep; } static const struct scmi_event sensor_events[] = { @@ -850,6 +926,16 @@ static const struct scmi_event sensor_events[] = { .max_payld_sz = sizeof(struct scmi_sensor_trip_notify_payld), .max_report_sz = sizeof(struct scmi_sensor_trip_point_report), }, + { + .id = SCMI_EVENT_SENSOR_UPDATE, + .max_payld_sz = + sizeof(struct scmi_sensor_update_notify_payld) + + SCMI_MAX_NUM_SENSOR_AXIS * + sizeof(struct scmi_sensor_reading_le), + .max_report_sz = sizeof(struct scmi_sensor_update_report) + + SCMI_MAX_NUM_SENSOR_AXIS * + sizeof(struct scmi_sensor_reading), + }, }; static const struct scmi_event_ops sensor_event_ops = { diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 7e9e2cd3d46b..be0be5ff7514 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -657,6 +657,7 @@ enum scmi_notification_events { SCMI_EVENT_PERFORMANCE_LIMITS_CHANGED = 0x0, SCMI_EVENT_PERFORMANCE_LEVEL_CHANGED = 0x1, SCMI_EVENT_SENSOR_TRIP_POINT_EVENT = 0x0, + SCMI_EVENT_SENSOR_UPDATE = 0x1, SCMI_EVENT_RESET_ISSUED = 0x0, SCMI_EVENT_BASE_ERROR_EVENT = 0x0, SCMI_EVENT_SYSTEM_POWER_STATE_NOTIFIER = 0x0, @@ -698,6 +699,14 @@ struct scmi_sensor_trip_point_report { unsigned int trip_point_desc; }; +struct scmi_sensor_update_report { + ktime_t timestamp; + unsigned int agent_id; + unsigned int sensor_id; + unsigned int readings_count; + struct scmi_sensor_reading readings[]; +}; + struct scmi_reset_issued_report { ktime_t timestamp; unsigned int agent_id; -- cgit From bc2dc4406c463174613047d8b7946e12c8808cda Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 21 Nov 2020 22:17:01 -0800 Subject: compiler-clang: remove version check for BPF Tracing bpftrace parses the kernel headers and uses Clang under the hood. 
Remove the version check when __BPF_TRACING__ is defined (as bpftrace does) so that this tool can continue to parse kernel headers, even with older clang sources. Fixes: commit 1f7a44f63e6c ("compiler-clang: add build check for clang 10.0.1") Reported-by: Chen Yu Reported-by: Jarkko Sakkinen Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Tested-by: Jarkko Sakkinen Acked-by: Jarkko Sakkinen Acked-by: Song Liu Acked-by: Nathan Chancellor Acked-by: Miguel Ojeda Link: https://lkml.kernel.org/r/20201104191052.390657-1-ndesaulniers@google.com Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index dd7233c48bf3..98cff1b4b088 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -8,8 +8,10 @@ + __clang_patchlevel__) #if CLANG_VERSION < 100001 +#ifndef __BPF_TRACING__ # error Sorry, your version of Clang is too old - please use 10.0.1 or newer. #endif +#endif /* Compiler specific definitions for Clang compiler */ -- cgit From a927bd6ba952d13c52b8b385030943032f659a3e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 21 Nov 2020 22:17:05 -0800 Subject: mm: fix phys_to_target_node() and memory_add_physaddr_to_nid() exports The core-mm has a default __weak implementation of phys_to_target_node() to mirror the weak definition of memory_add_physaddr_to_nid(). That symbol is exported for modules. However, while the export in mm/memory_hotplug.c exported the symbol in the configuration cases of: CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_MEMORY_HOTPLUG=y ...and: CONFIG_NUMA_KEEP_MEMINFO=n CONFIG_MEMORY_HOTPLUG=y ...it failed to export the symbol in the case of: CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_MEMORY_HOTPLUG=n Not only is that broken, but Christoph points out that the kernel should not be exporting any __weak symbol, which means that memory_add_physaddr_to_nid() example that phys_to_target_node() copied is broken too. Rework the definition of phys_to_target_node() and memory_add_physaddr_to_nid() to not require weak symbols. Move to the common arch override design-pattern of an asm header defining a symbol to replace the default implementation. The only common header that all memory_add_physaddr_to_nid() producing architectures implement is asm/sparsemem.h. In fact, powerpc already defines its memory_add_physaddr_to_nid() helper in sparsemem.h. Double-down on that observation and define phys_to_target_node() where necessary in asm/sparsemem.h. An alternate consideration that was discarded was to put this override in asm/numa.h, but that entangles with the definition of MAX_NUMNODES relative to the inclusion of linux/nodemask.h, and requires powerpc to grow a new header. The dependency on NUMA_KEEP_MEMINFO for DEV_DAX_HMEM_DEVICES is invalid now that the symbol is properly exported / stubbed in all combinations of CONFIG_NUMA_KEEP_MEMINFO and CONFIG_MEMORY_HOTPLUG. 
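The override pattern this converges on is the standard one for replacing a generic inline helper: the asm header declares the real function and defines a macro with the same name, and the generic header compiles its fallback only when that macro is absent. A condensed sketch of the shape (abridged from the hunks below; the arch name is only a placeholder):

    /* arch/foo/include/asm/sparsemem.h: arch provides a real implementation */
    int memory_add_physaddr_to_nid(u64 addr);
    #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid

    /* include/linux/numa.h: generic fallback, used only when not overridden */
    #ifndef memory_add_physaddr_to_nid
    static inline int memory_add_physaddr_to_nid(u64 start)
    {
            pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
                         start);
            return 0;
    }
    #endif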
[dan.j.williams@intel.com: v4] Link: https://lkml.kernel.org/r/160461461867.1505359.5301571728749534585.stgit@dwillia2-desk3.amr.corp.intel.com [dan.j.williams@intel.com: powerpc: fix create_section_mapping compile warning] Link: https://lkml.kernel.org/r/160558386174.2948926.2740149041249041764.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: a035b6bf863e ("mm/memory_hotplug: introduce default phys_to_target_node() implementation") Reported-by: Randy Dunlap Reported-by: Thomas Gleixner Reported-by: kernel test robot Reported-by: Christoph Hellwig Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Tested-by: Randy Dunlap Tested-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Joao Martins Cc: Tony Luck Cc: Fenghua Yu Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Vishal Verma Cc: Stephen Rothwell Link: https://lkml.kernel.org/r/160447639846.1133764.7044090803980177548.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/sparsemem.h | 6 ++++++ arch/powerpc/include/asm/mmzone.h | 5 +++++ arch/powerpc/include/asm/sparsemem.h | 5 ++--- arch/powerpc/mm/mem.c | 1 + arch/x86/include/asm/sparsemem.h | 10 ++++++++++ arch/x86/mm/numa.c | 2 ++ drivers/dax/Kconfig | 1 - include/linux/memory_hotplug.h | 14 -------------- include/linux/numa.h | 30 +++++++++++++++++++++++++++++- mm/memory_hotplug.c | 18 ------------------ 10 files changed, 55 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/include/asm/sparsemem.h b/arch/ia64/include/asm/sparsemem.h index 336d0570e1fa..dd8c166ffd7b 100644 --- a/arch/ia64/include/asm/sparsemem.h +++ b/arch/ia64/include/asm/sparsemem.h @@ -18,4 +18,10 @@ #endif #endif /* CONFIG_SPARSEMEM */ + +#ifdef CONFIG_MEMORY_HOTPLUG +int memory_add_physaddr_to_nid(u64 addr); +#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid +#endif + #endif /* _ASM_IA64_SPARSEMEM_H */ diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index 91c69ff53a8a..6cda76b57c5d 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -46,5 +46,10 @@ u64 memory_hotplug_max(void); #define __HAVE_ARCH_RESERVED_KERNEL_PAGES #endif +#ifdef CONFIG_MEMORY_HOTPLUG +extern int create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot); +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_MMZONE_H_ */ diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index 1e6fa371cc38..d072866842e4 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -13,9 +13,9 @@ #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_MEMORY_HOTPLUG -extern int create_section_mapping(unsigned long start, unsigned long end, - int nid, pgprot_t prot); extern int remove_section_mapping(unsigned long start, unsigned long end); +extern int memory_add_physaddr_to_nid(u64 start); +#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid #ifdef CONFIG_NUMA extern int hot_add_scn_to_nid(unsigned long scn_addr); @@ -26,6 +26,5 @@ static inline int hot_add_scn_to_nid(unsigned long scn_addr) } #endif /* CONFIG_NUMA */ #endif /* CONFIG_MEMORY_HOTPLUG */ - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_SPARSEMEM_H */ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 01ec2a252f09..3fc325bebe4d 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -50,6 +50,7 @@ #include #include #include +#include #include diff --git 
a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 6bfc878f6771..6a9ccc1b2be5 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -28,4 +28,14 @@ #endif #endif /* CONFIG_SPARSEMEM */ + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_NUMA_KEEP_MEMINFO +extern int phys_to_target_node(phys_addr_t start); +#define phys_to_target_node phys_to_target_node +extern int memory_add_physaddr_to_nid(u64 start); +#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid +#endif +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_SPARSEMEM_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 44148691d78b..5eb4dc2b97da 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -938,6 +938,7 @@ int phys_to_target_node(phys_addr_t start) return meminfo_to_nid(&numa_reserved_meminfo, start); } +EXPORT_SYMBOL_GPL(phys_to_target_node); int memory_add_physaddr_to_nid(u64 start) { @@ -947,4 +948,5 @@ int memory_add_physaddr_to_nid(u64 start) nid = numa_meminfo.blk[0].nid; return nid; } +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 567428e10b7b..d2834c2cfa10 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -50,7 +50,6 @@ config DEV_DAX_HMEM Say M if unsure. config DEV_DAX_HMEM_DEVICES - depends on NUMA_KEEP_MEMINFO # for phys_to_target_node() depends on DEV_DAX_HMEM && DAX=y def_bool y diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d65c6fdc5cfc..551093b74596 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -281,20 +281,6 @@ static inline bool movable_node_is_enabled(void) } #endif /* ! CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_NUMA -extern int memory_add_physaddr_to_nid(u64 start); -extern int phys_to_target_node(u64 start); -#else -static inline int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -static inline int phys_to_target_node(u64 start) -{ - return 0; -} -#endif - #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * pgdat resizing functions diff --git a/include/linux/numa.h b/include/linux/numa.h index 8cb33ccfb671..cb44cfe2b725 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -21,13 +21,41 @@ #endif #ifdef CONFIG_NUMA +#include +#include + /* Generic implementation available */ int numa_map_to_online_node(int node); -#else + +#ifndef memory_add_physaddr_to_nid +static inline int memory_add_physaddr_to_nid(u64 start) +{ + pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +#endif +#ifndef phys_to_target_node +static inline int phys_to_target_node(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +#endif +#else /* !CONFIG_NUMA */ static inline int numa_map_to_online_node(int node) { return NUMA_NO_NODE; } +static inline int memory_add_physaddr_to_nid(u64 start) +{ + return 0; +} +static inline int phys_to_target_node(u64 start) +{ + return 0; +} #endif #endif /* _LINUX_NUMA_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b44d4c7ba73b..63b2e46b6555 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -350,24 +350,6 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, return err; } -#ifdef CONFIG_NUMA -int __weak memory_add_physaddr_to_nid(u64 start) -{ - pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} 
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); - -int __weak phys_to_target_node(u64 start) -{ - pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} -EXPORT_SYMBOL_GPL(phys_to_target_node); -#endif - /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, -- cgit From 4349a83a3190c1d4414371161b0f4a4c3ccd3f9d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 21 Nov 2020 22:17:08 -0800 Subject: mm: fix readahead_page_batch for retry entries Both btrfs and fuse have reported faults caused by seeing a retry entry instead of the page they were looking for. This was caused by a missing check in the iterator. As can be seen in the panic log below, accessing address 0x402 causes a panic. In xarray.h, 0x402 is the RETRY_ENTRY value. BUG: kernel NULL pointer dereference, address: 0000000000000402 CPU: 14 PID: 306003 Comm: as Not tainted 5.9.0-1-amd64 #1 Debian 5.9.1-1 Hardware name: Lenovo ThinkSystem SR665/7D2VCTO1WW, BIOS D8E106Q-1.01 05/30/2020 RIP: 0010:fuse_readahead+0x152/0x470 [fuse] Code: 41 8b 57 18 4c 8d 54 10 ff 4c 89 d6 48 8d 7c 24 10 e8 d2 e3 28 f9 48 85 c0 0f 84 fe 00 00 00 44 89 f2 49 89 04 d4 44 8d 72 01 <48> 8b 10 41 8b 4f 1c 48 c1 ea 10 83 e2 01 80 fa 01 19 d2 81 e2 01 RSP: 0018:ffffad99ceaebc50 EFLAGS: 00010246 RAX: 0000000000000402 RBX: 0000000000000001 RCX: 0000000000000002 RDX: 0000000000000000 RSI: ffff94c5af90bd98 RDI: ffffad99ceaebc60 RBP: ffff94ddc1749a00 R08: 0000000000000402 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000100 R12: ffff94de6c429ce0 R13: ffff94de6c4d3700 R14: 0000000000000001 R15: ffffad99ceaebd68 FS: 00007f228c5c7040(0000) GS:ffff94de8ed80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000402 CR3: 0000001dbd9b4000 CR4: 0000000000350ee0 Call Trace: read_pages+0x83/0x270 page_cache_readahead_unbounded+0x197/0x230 generic_file_buffered_read+0x57a/0xa20 new_sync_read+0x112/0x1a0 vfs_read+0xf8/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 042124cc64c3 ("mm: add new readahead_control API") Reported-by: David Sterba Reported-by: Wonhyuk Yang Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Link: https://lkml.kernel.org/r/20201103142852.8543-1-willy@infradead.org Link: https://lkml.kernel.org/r/20201103124349.16722-1-vvghjk1234@gmail.com Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e1e19c1f9ec9..d5570deff400 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -906,6 +906,8 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, xas_set(&xas, rac->_index); rcu_read_lock(); xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { + if (xas_retry(&xas, page)) + continue; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); array[i++] = page; -- cgit From e44cdff05145b84293e3f424daa17e4f3ce0109c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 19 Nov 2020 17:45:09 +0100 Subject: clk: samsung: Allow compile testing of Exynos, S3C64xx and S5Pv210 So far all Exynos, S3C64xx and S5Pv210 clock units were selected by the respective SOC/ARCH Kconfig options.
On a kernel built for selected SoCs, this allowed building only the limited set of matching clock drivers. However, compile testing was not possible in that case, because the Makefile objects depend on the SOC/ARCH options. Add separate Kconfig options for each of them so that they can be compile tested. Link: https://lore.kernel.org/r/20201119164509.754851-1-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Acked-by: Chanwoo Choi Signed-off-by: Sylwester Nawrocki --- drivers/clk/samsung/Kconfig | 67 ++++++++++++++++++++++++++++++++++++++++++-- drivers/clk/samsung/Makefile | 22 +++++++-------- include/linux/clk/samsung.h | 4 +-- 3 files changed, 78 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/samsung/Kconfig b/drivers/clk/samsung/Kconfig index 57d4b3f20417..9323fcfac6cc 100644 --- a/drivers/clk/samsung/Kconfig +++ b/drivers/clk/samsung/Kconfig @@ -2,10 +2,73 @@ # Recent Exynos platforms should just select COMMON_CLK_SAMSUNG: config COMMON_CLK_SAMSUNG bool "Samsung Exynos clock controller support" if COMPILE_TEST - # Clocks on ARM64 SoCs (e.g. Exynos5433, Exynos7) are chosen by - # EXYNOS_ARM64_COMMON_CLK to avoid building them on ARMv7: + select S3C64XX_COMMON_CLK if ARM && ARCH_S3C64XX + select S5PV210_COMMON_CLK if ARM && ARCH_S5PV210 + select EXYNOS_3250_COMMON_CLK if ARM && SOC_EXYNOS3250 + select EXYNOS_4_COMMON_CLK if ARM && ARCH_EXYNOS4 + select EXYNOS_5250_COMMON_CLK if ARM && SOC_EXYNOS5250 + select EXYNOS_5260_COMMON_CLK if ARM && SOC_EXYNOS5260 + select EXYNOS_5410_COMMON_CLK if ARM && SOC_EXYNOS5410 + select EXYNOS_5420_COMMON_CLK if ARM && SOC_EXYNOS5420 select EXYNOS_ARM64_COMMON_CLK if ARM64 && ARCH_EXYNOS +config S3C64XX_COMMON_CLK + bool "Samsung S3C64xx clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung S3C64xx SoCs. + Choose Y here only if you build for this SoC. + +config S5PV210_COMMON_CLK + bool "Samsung S5Pv210 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung S5Pv210 SoCs. + Choose Y here only if you build for this SoC. + +config EXYNOS_3250_COMMON_CLK + bool "Samsung Exynos3250 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos3250 SoCs. Choose Y here only if you build for this SoC. + +config EXYNOS_4_COMMON_CLK + bool "Samsung Exynos4 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos4212 and Exynos4412 SoCs. Choose Y here only if you build for + this SoC. + +config EXYNOS_5250_COMMON_CLK + bool "Samsung Exynos5250 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos5250 SoCs. Choose Y here only if you build for this SoC. + +config EXYNOS_5260_COMMON_CLK + bool "Samsung Exynos5260 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos5260 SoCs. Choose Y here only if you build for this SoC. + +config EXYNOS_5410_COMMON_CLK + bool "Samsung Exynos5410 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos5410 SoCs. Choose Y here only if you build for this SoC.
+ +config EXYNOS_5420_COMMON_CLK + bool "Samsung Exynos5420 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos5420 SoCs. Choose Y here only if you build for this SoC. + config EXYNOS_ARM64_COMMON_CLK bool "Samsung Exynos ARMv8-family clock controller support" if COMPILE_TEST depends on COMMON_CLK_SAMSUNG diff --git a/drivers/clk/samsung/Makefile b/drivers/clk/samsung/Makefile index 1a4e6b787978..bb1433f11c88 100644 --- a/drivers/clk/samsung/Makefile +++ b/drivers/clk/samsung/Makefile @@ -4,15 +4,15 @@ # obj-$(CONFIG_COMMON_CLK) += clk.o clk-pll.o clk-cpu.o -obj-$(CONFIG_SOC_EXYNOS3250) += clk-exynos3250.o -obj-$(CONFIG_ARCH_EXYNOS4) += clk-exynos4.o -obj-$(CONFIG_ARCH_EXYNOS4) += clk-exynos4412-isp.o -obj-$(CONFIG_SOC_EXYNOS5250) += clk-exynos5250.o -obj-$(CONFIG_SOC_EXYNOS5250) += clk-exynos5-subcmu.o -obj-$(CONFIG_SOC_EXYNOS5260) += clk-exynos5260.o -obj-$(CONFIG_SOC_EXYNOS5410) += clk-exynos5410.o -obj-$(CONFIG_SOC_EXYNOS5420) += clk-exynos5420.o -obj-$(CONFIG_SOC_EXYNOS5420) += clk-exynos5-subcmu.o +obj-$(CONFIG_EXYNOS_3250_COMMON_CLK) += clk-exynos3250.o +obj-$(CONFIG_EXYNOS_4_COMMON_CLK) += clk-exynos4.o +obj-$(CONFIG_EXYNOS_4_COMMON_CLK) += clk-exynos4412-isp.o +obj-$(CONFIG_EXYNOS_5250_COMMON_CLK) += clk-exynos5250.o +obj-$(CONFIG_EXYNOS_5250_COMMON_CLK) += clk-exynos5-subcmu.o +obj-$(CONFIG_EXYNOS_5260_COMMON_CLK) += clk-exynos5260.o +obj-$(CONFIG_EXYNOS_5410_COMMON_CLK) += clk-exynos5410.o +obj-$(CONFIG_EXYNOS_5420_COMMON_CLK) += clk-exynos5420.o +obj-$(CONFIG_EXYNOS_5420_COMMON_CLK) += clk-exynos5-subcmu.o obj-$(CONFIG_EXYNOS_ARM64_COMMON_CLK) += clk-exynos5433.o obj-$(CONFIG_EXYNOS_AUDSS_CLK_CON) += clk-exynos-audss.o obj-$(CONFIG_ARCH_EXYNOS) += clk-exynos-clkout.o @@ -21,5 +21,5 @@ obj-$(CONFIG_S3C2410_COMMON_CLK)+= clk-s3c2410.o obj-$(CONFIG_S3C2410_COMMON_DCLK)+= clk-s3c2410-dclk.o obj-$(CONFIG_S3C2412_COMMON_CLK)+= clk-s3c2412.o obj-$(CONFIG_S3C2443_COMMON_CLK)+= clk-s3c2443.o -obj-$(CONFIG_ARCH_S3C64XX) += clk-s3c64xx.o -obj-$(CONFIG_ARCH_S5PV210) += clk-s5pv210.o clk-s5pv210-audss.o +obj-$(CONFIG_S3C64XX_COMMON_CLK) += clk-s3c64xx.o +obj-$(CONFIG_S5PV210_COMMON_CLK) += clk-s5pv210.o clk-s5pv210-audss.o diff --git a/include/linux/clk/samsung.h b/include/linux/clk/samsung.h index 79097e365f7f..38b774001712 100644 --- a/include/linux/clk/samsung.h +++ b/include/linux/clk/samsung.h @@ -10,7 +10,7 @@ struct device_node; -#ifdef CONFIG_ARCH_S3C64XX +#ifdef CONFIG_S3C64XX_COMMON_CLK void s3c64xx_clk_init(struct device_node *np, unsigned long xtal_f, unsigned long xusbxti_f, bool s3c6400, void __iomem *base); @@ -19,7 +19,7 @@ static inline void s3c64xx_clk_init(struct device_node *np, unsigned long xtal_f, unsigned long xusbxti_f, bool s3c6400, void __iomem *base) { } -#endif /* CONFIG_ARCH_S3C64XX */ +#endif /* CONFIG_S3C64XX_COMMON_CLK */ #ifdef CONFIG_S3C2410_COMMON_CLK void s3c2410_common_clk_init(struct device_node *np, unsigned long xti_f, -- cgit From 769dda58d1f647a45270db2f02efe2e2de856709 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Nov 2020 15:02:10 +0100 Subject: irqstat: Get rid of nmi_count() and __IRQ_STAT() Nothing uses this anymore. 
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20201113141733.005212732@linutronix.de --- include/linux/irq_cpustat.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq_cpustat.h b/include/linux/irq_cpustat.h index 6e8895cd4d92..78fb2de3ea4d 100644 --- a/include/linux/irq_cpustat.h +++ b/include/linux/irq_cpustat.h @@ -19,10 +19,6 @@ #ifndef __ARCH_IRQ_STAT DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); /* defined in asm/hardirq.h */ -#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat.member, cpu)) #endif -/* arch dependent irq_stat fields */ -#define nmi_count(cpu) __IRQ_STAT((cpu), __nmi_count) /* i386 */ - #endif /* __irq_cpustat_h */ -- cgit From e091bc90cd2d65f48e4688faead2911558d177d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Nov 2020 15:02:16 +0100 Subject: irqstat: Move declaration into asm-generic/hardirq.h Move the declaration of the irq_cpustat per cpu variable to asm-generic/hardirq.h and remove the now empty linux/irq_cpustat.h header. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20201113141733.737377332@linutronix.de --- include/asm-generic/hardirq.h | 3 ++- include/linux/irq_cpustat.h | 24 ------------------------ 2 files changed, 2 insertions(+), 25 deletions(-) delete mode 100644 include/linux/irq_cpustat.h (limited to 'include/linux') diff --git a/include/asm-generic/hardirq.h b/include/asm-generic/hardirq.h index f5dd99781e3c..7317e8258b48 100644 --- a/include/asm-generic/hardirq.h +++ b/include/asm-generic/hardirq.h @@ -12,7 +12,8 @@ typedef struct { #endif } ____cacheline_aligned irq_cpustat_t; -#include /* Standard mappings for irq_cpustat_t above */ +DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); + #include #ifndef ack_bad_irq diff --git a/include/linux/irq_cpustat.h b/include/linux/irq_cpustat.h deleted file mode 100644 index 78fb2de3ea4d..000000000000 --- a/include/linux/irq_cpustat.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __irq_cpustat_h -#define __irq_cpustat_h - -/* - * Contains default mappings for irq_cpustat_t, used by almost every - * architecture. Some arch (like s390) have per cpu hardware pages and - * they define their own mappings for irq_stat. - * - * Keith Owens July 2000. - */ - - -/* - * Simple wrappers reducing source bloat. Define all irq_stat fields - * here, even ones that are arch dependent. That way we get common - * definitions instead of differing sets for each arch. - */ - -#ifndef __ARCH_IRQ_STAT -DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); /* defined in asm/hardirq.h */ -#endif - -#endif /* __irq_cpustat_h */ -- cgit From 15115830c88751ba83068aa37da996602ddc6a61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Nov 2020 15:02:17 +0100 Subject: preempt: Cleanup the macro maze a bit Make the macro maze consistent and prepare it for adding the RT variant for BH accounting. - Use nmi_count() for the NMI portion of preempt count - Introduce in_hardirq() to make the naming consistent and non-ambiguous - Use the macros to create combined checks (e.g. in_task()) so the softirq representation for RT just falls into place.
- Update comments and move the deprecated macros aside Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20201113141733.864469886@linutronix.de --- include/linux/preempt.h | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 7d9c1c0e149c..7547857516d5 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -77,31 +77,33 @@ /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include +#define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) -#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ - | NMI_MASK)) +#define irq_count() (nmi_count() | hardirq_count() | softirq_count()) /* - * Are we doing bottom half or hardware interrupt processing? + * Macros to retrieve the current execution context: * - * in_irq() - We're in (hard) IRQ context + * in_nmi() - We're in NMI context + * in_hardirq() - We're in hard IRQ context + * in_serving_softirq() - We're in softirq context + * in_task() - We're in task context + */ +#define in_nmi() (nmi_count()) +#define in_hardirq() (hardirq_count()) +#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) +#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq())) + +/* + * The following macros are deprecated and should not be used in new code: + * in_irq() - Obsolete version of in_hardirq() * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled - * in_serving_softirq() - We're in softirq context - * in_nmi() - We're in NMI context - * in_task() - We're in task context - * - * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really - * should not be used in new code. */ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) -#define in_nmi() (preempt_count() & NMI_MASK) -#define in_task() (!(preempt_count() & \ - (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) /* * The preempt_count offset after preempt_disable(); -- cgit From cb4789b0d19ff231ce9f73376a023341300aed96 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 6 Nov 2020 16:50:47 +0100 Subject: iommu/ioasid: Add ioasid references Let IOASID users take references to existing ioasids with ioasid_get(). ioasid_put() drops a reference and only frees the ioasid when its reference count drops to zero. It returns true if the ioasid was freed. For drivers that don't call ioasid_get(), ioasid_put() is the same as ioasid_free().
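A minimal sketch of the resulting lifetime rules (hypothetical caller; the set name, ID range and private pointer are placeholders invented for the example):

    #include <linux/ioasid.h>

    static DECLARE_IOASID_SET(my_set);

    static int example(void)
    {
            ioasid_t id;

            id = ioasid_alloc(&my_set, 1, 32, NULL);    /* refcount == 1 */
            if (id == INVALID_IOASID)
                    return -ENOSPC;

            ioasid_get(id);                 /* second user, refcount == 2 */

            WARN_ON(ioasid_put(id));        /* refcount 2 -> 1, returns false */
            if (ioasid_put(id))             /* refcount 1 -> 0, frees, returns true */
                    pr_debug("ioasid %u released\n", id);

            return 0;
    }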
Signed-off-by: Jean-Philippe Brucker Reviewed-by: Eric Auger Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20201106155048.997886-2-jean-philippe@linaro.org Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 4 ++-- drivers/iommu/intel/svm.c | 6 +++--- drivers/iommu/ioasid.c | 38 ++++++++++++++++++++++++++++++++++---- include/linux/ioasid.h | 10 ++++++++-- 4 files changed, 47 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 1b1ca63e6bbe..8b26223979a2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5198,7 +5198,7 @@ static void auxiliary_unlink_device(struct dmar_domain *domain, domain->auxd_refcnt--; if (!domain->auxd_refcnt && domain->default_pasid > 0) - ioasid_free(domain->default_pasid); + ioasid_put(domain->default_pasid); } static int aux_domain_add_dev(struct dmar_domain *domain, @@ -5259,7 +5259,7 @@ attach_failed: spin_unlock(&iommu->lock); spin_unlock_irqrestore(&device_domain_lock, flags); if (!domain->auxd_refcnt && domain->default_pasid > 0) - ioasid_free(domain->default_pasid); + ioasid_put(domain->default_pasid); return ret; } diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 3242ebd0bca3..4fa248b98031 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -598,7 +598,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, if (mm) { ret = mmu_notifier_register(&svm->notifier, mm); if (ret) { - ioasid_free(svm->pasid); + ioasid_put(svm->pasid); kfree(svm); kfree(sdev); goto out; @@ -616,7 +616,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, if (ret) { if (mm) mmu_notifier_unregister(&svm->notifier, mm); - ioasid_free(svm->pasid); + ioasid_put(svm->pasid); kfree(svm); kfree(sdev); goto out; @@ -689,7 +689,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { - ioasid_free(svm->pasid); + ioasid_put(svm->pasid); if (svm->mm) { mmu_notifier_unregister(&svm->notifier, svm->mm); /* Clear mm's pasid. */ diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 0f8dd377aada..50ee27bbd04e 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -2,7 +2,7 @@ /* * I/O Address Space ID allocator. There is one global IOASID space, split into * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and - * free IOASIDs with ioasid_alloc and ioasid_free. + * free IOASIDs with ioasid_alloc and ioasid_put. 
*/ #include #include @@ -15,6 +15,7 @@ struct ioasid_data { struct ioasid_set *set; void *private; struct rcu_head rcu; + refcount_t refs; }; /* @@ -314,6 +315,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, data->set = set; data->private = private; + refcount_set(&data->refs, 1); /* * Custom allocator needs allocator data to perform platform specific @@ -346,11 +348,34 @@ exit_free: EXPORT_SYMBOL_GPL(ioasid_alloc); /** - * ioasid_free - Free an IOASID + * ioasid_get - obtain a reference to the IOASID + */ +void ioasid_get(ioasid_t ioasid) +{ + struct ioasid_data *ioasid_data; + + spin_lock(&ioasid_allocator_lock); + ioasid_data = xa_load(&active_allocator->xa, ioasid); + if (ioasid_data) + refcount_inc(&ioasid_data->refs); + else + WARN_ON(1); + spin_unlock(&ioasid_allocator_lock); +} +EXPORT_SYMBOL_GPL(ioasid_get); + +/** + * ioasid_put - Release a reference to an ioasid * @ioasid: the ID to remove + * + * Put a reference to the IOASID, free it when the number of references drops to + * zero. + * + * Return: %true if the IOASID was freed, %false otherwise. */ -void ioasid_free(ioasid_t ioasid) +bool ioasid_put(ioasid_t ioasid) { + bool free = false; struct ioasid_data *ioasid_data; spin_lock(&ioasid_allocator_lock); @@ -360,6 +385,10 @@ void ioasid_free(ioasid_t ioasid) goto exit_unlock; } + free = refcount_dec_and_test(&ioasid_data->refs); + if (!free) + goto exit_unlock; + active_allocator->ops->free(ioasid, active_allocator->ops->pdata); /* Custom allocator needs additional steps to free the xa element */ if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) { @@ -369,8 +398,9 @@ void ioasid_free(ioasid_t ioasid) exit_unlock: spin_unlock(&ioasid_allocator_lock); + return free; } -EXPORT_SYMBOL_GPL(ioasid_free); +EXPORT_SYMBOL_GPL(ioasid_put); /** * ioasid_find - Find IOASID data diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 6f000d7a0ddc..e9dacd4b9f6b 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -34,7 +34,8 @@ struct ioasid_allocator_ops { #if IS_ENABLED(CONFIG_IOASID) ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private); -void ioasid_free(ioasid_t ioasid); +void ioasid_get(ioasid_t ioasid); +bool ioasid_put(ioasid_t ioasid); void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)); int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); @@ -48,10 +49,15 @@ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, return INVALID_IOASID; } -static inline void ioasid_free(ioasid_t ioasid) +static inline void ioasid_get(ioasid_t ioasid) { } +static inline bool ioasid_put(ioasid_t ioasid) +{ + return false; +} + static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)) { -- cgit From b713c195d59332277a31a59c91f755e53b5b302b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 5 Sep 2020 11:13:35 -0600 Subject: net: provide __sys_shutdown_sock() that takes a socket No functional changes in this patch, needed to provide io_uring support for shutdown(2). Cc: netdev@vger.kernel.org Cc: David S. 
Miller Acked-by: Jakub Kicinski Signed-off-by: Jens Axboe --- include/linux/socket.h | 1 + net/socket.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index e9cb30d8cbfb..385894b4a8bb 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -436,6 +436,7 @@ extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); +extern int __sys_shutdown_sock(struct socket *sock, int how); extern int __sys_shutdown(int fd, int how); extern struct ns_common *get_net_ns(struct ns_common *ns); diff --git a/net/socket.c b/net/socket.c index 6e6cccc2104f..4b615c719765 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2192,6 +2192,17 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, * Shutdown a socket. */ +int __sys_shutdown_sock(struct socket *sock, int how) +{ + int err; + + err = security_socket_shutdown(sock, how); + if (!err) + err = sock->ops->shutdown(sock, how); + + return err; +} + int __sys_shutdown(int fd, int how) { int err, fput_needed; @@ -2199,9 +2210,7 @@ int __sys_shutdown(int fd, int how) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { - err = security_socket_shutdown(sock, how); - if (!err) - err = sock->ops->shutdown(sock, how); + err = __sys_shutdown_sock(sock, how); fput_light(sock->file, fput_needed); } return err; -- cgit From 23acdc76f1798b090bb9dcc90671cd29d929834e Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 12 Nov 2020 18:53:34 -0800 Subject: signal: clear non-uapi flag bits when passing/returning sa_flags Previously we were not clearing non-uapi flag bits in sigaction.sa_flags when storing the userspace-provided sa_flags or when returning them via oldact. Start doing so. This allows userspace to detect missing support for flag bits and allows the kernel to use non-uapi bits internally, as we are already doing in arch/x86 for two flag bits. Now that this change is in place, we no longer need the code in arch/x86 that was hiding these bits from userspace, so remove it. This is technically a userspace-visible behavior change for sigaction, as the unknown bits returned via oldact.sa_flags are no longer set. However, we are free to define the behavior for unknown bits exactly because their behavior is currently undefined, so for now we can define the meaning of each of them to be "clear the bit in oldact.sa_flags unless the bit becomes known in the future". Furthermore, this behavior is consistent with OpenBSD [1], illumos [2] and XNU [3] (FreeBSD [4] and NetBSD [5] fail the syscall if unknown bits are set). So there is some precedent for this behavior in other kernels, and in particular in XNU, which is probably the most popular kernel among those that I looked at, which means that this change is less likely to be a compatibility issue. 
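For illustration, the probe this enables from userspace could look roughly like the following (hypothetical snippet, not part of this patch; a real probe would also save and restore the previous disposition, and would combine this with a flag bit the kernel guarantees never to support in order to distinguish old kernels that preserve unknown bits):

    #include <signal.h>
    #include <string.h>

    /* Returns 1 if the kernel knows (and therefore kept) the given SA_* flag. */
    static int sa_flag_supported(int sig, int flag)
    {
            struct sigaction sa, old;

            memset(&sa, 0, sizeof(sa));
            sa.sa_handler = SIG_DFL;
            sa.sa_flags = flag;
            if (sigaction(sig, &sa, NULL) || sigaction(sig, NULL, &old))
                    return 0;
            return (old.sa_flags & flag) == flag;
    }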
Link: [1] https://github.com/openbsd/src/blob/f634a6a4b5bf832e9c1de77f7894ae2625e74484/sys/kern/kern_sig.c#L278 Link: [2] https://github.com/illumos/illumos-gate/blob/76f19f5fdc974fe5be5c82a556e43a4df93f1de1/usr/src/uts/common/syscall/sigaction.c#L86 Link: [3] https://github.com/apple/darwin-xnu/blob/a449c6a3b8014d9406c2ddbdc81795da24aa7443/bsd/kern/kern_sig.c#L480 Link: [4] https://github.com/freebsd/freebsd/blob/eded70c37057857c6e23fae51f86b8f8f43cd2d0/sys/kern/kern_sig.c#L699 Link: [5] https://github.com/NetBSD/src/blob/3365779becdcedfca206091a645a0e8e22b2946e/sys/kern/sys_sig.c#L473 Signed-off-by: Peter Collingbourne Reviewed-by: Dave Martin Acked-by: "Eric W. Biederman" Link: https://linux-review.googlesource.com/id/I35aab6f5be932505d90f3b3450c083b4db1eca86 Link: https://lkml.kernel.org/r/878dbcb5f47bc9b11881c81f745c0bef5c23f97f.1605235762.git.pcc@google.com Signed-off-by: Eric W. Biederman --- arch/arm/include/asm/signal.h | 2 ++ arch/parisc/include/asm/signal.h | 2 ++ arch/x86/kernel/signal_compat.c | 7 ------- include/linux/signal_types.h | 12 ++++++++++++ kernel/signal.c | 10 ++++++++++ 5 files changed, 26 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/signal.h b/arch/arm/include/asm/signal.h index 65530a042009..430be7774402 100644 --- a/arch/arm/include/asm/signal.h +++ b/arch/arm/include/asm/signal.h @@ -17,6 +17,8 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +#define __ARCH_UAPI_SA_FLAGS (SA_THIRTYTWO | SA_RESTORER) + #define __ARCH_HAS_SA_RESTORER #include diff --git a/arch/parisc/include/asm/signal.h b/arch/parisc/include/asm/signal.h index 715c96ba2ec8..30dd1e43ef88 100644 --- a/arch/parisc/include/asm/signal.h +++ b/arch/parisc/include/asm/signal.h @@ -21,6 +21,8 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +#define __ARCH_UAPI_SA_FLAGS _SA_SIGGFAULT + #include #endif /* !__ASSEMBLY */ diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a7f3e12cfbdb..ddfd919be46c 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -165,16 +165,9 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { signal_compat_build_tests(); - /* Don't leak in-kernel non-uapi flags to user-space */ - if (oact) - oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (!act) return; - /* Don't let flags to be set from userspace */ - act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (in_ia32_syscall()) act->sa.sa_flags |= SA_IA32_ABI; if (in_x32_syscall()) diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h index f8a90ae9c6ec..a7887ad84d36 100644 --- a/include/linux/signal_types.h +++ b/include/linux/signal_types.h @@ -68,4 +68,16 @@ struct ksignal { int sig; }; +#ifndef __ARCH_UAPI_SA_FLAGS +#ifdef SA_RESTORER +#define __ARCH_UAPI_SA_FLAGS SA_RESTORER +#else +#define __ARCH_UAPI_SA_FLAGS 0 +#endif +#endif + +#define UAPI_SA_FLAGS \ + (SA_NOCLDSTOP | SA_NOCLDWAIT | SA_SIGINFO | SA_ONSTACK | SA_RESTART | \ + SA_NODEFER | SA_RESETHAND | __ARCH_UAPI_SA_FLAGS) + #endif /* _LINUX_SIGNAL_TYPES_H */ diff --git a/kernel/signal.c b/kernel/signal.c index ef8f2a28d37c..8f5bd12ee41b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3985,6 +3985,16 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (oact) *oact = *k; + /* + * Clear unknown flag bits in order to allow userspace to detect missing + * support for flag bits and to allow the kernel to use non-uapi bits + * internally. 
+ */ + if (act) + act->sa.sa_flags &= UAPI_SA_FLAGS; + if (oact) + oact->sa.sa_flags &= UAPI_SA_FLAGS; + sigaction_compat_abi(act, oact); if (act) { -- cgit From 6ac05e832a9e96f9b1c42a8917cdd317d7b6c8fa Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 20 Nov 2020 12:33:45 -0800 Subject: signal: define the SA_EXPOSE_TAGBITS bit in sa_flags Architectures that support address tagging, such as arm64, may want to expose fault address tag bits to the signal handler to help diagnose memory errors. However, these bits have not been previously set, and their presence may confuse unaware user applications. Therefore, introduce a SA_EXPOSE_TAGBITS flag bit in sa_flags that a signal handler may use to explicitly request that the bits are set. The generic signal handler APIs expect to receive tagged addresses. Architectures may specify how to untag addresses in the case where SA_EXPOSE_TAGBITS is clear by defining the arch_untagged_si_addr function. Signed-off-by: Peter Collingbourne Acked-by: "Eric W. Biederman" Link: https://linux-review.googlesource.com/id/I16dd0ed2081f091fce97be0190cb8caa874c26cb Link: https://lkml.kernel.org/r/13cf24d00ebdd8e1f55caf1821c7c29d54100191.1605904350.git.pcc@google.com Signed-off-by: Eric W. Biederman --- include/linux/signal.h | 14 ++++++++++++++ include/linux/signal_types.h | 2 +- include/uapi/asm-generic/signal-defs.h | 3 +++ kernel/signal.c | 24 ++++++++++++++++++++++++ 4 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index b256f9c65661..205526c4003a 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -469,4 +469,18 @@ struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); #endif +#ifndef arch_untagged_si_addr +/* + * Given a fault address and a signal and si_code which correspond to the + * _sigfault union member, returns the address that must appear in si_addr if + * the signal handler does not have SA_EXPOSE_TAGBITS enabled in sa_flags. + */ +static inline void __user *arch_untagged_si_addr(void __user *addr, + unsigned long sig, + unsigned long si_code) +{ + return addr; +} +#endif + #endif /* _LINUX_SIGNAL_H */ diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h index a7887ad84d36..68e06c75c5b2 100644 --- a/include/linux/signal_types.h +++ b/include/linux/signal_types.h @@ -78,6 +78,6 @@ struct ksignal { #define UAPI_SA_FLAGS \ (SA_NOCLDSTOP | SA_NOCLDWAIT | SA_SIGINFO | SA_ONSTACK | SA_RESTART | \ - SA_NODEFER | SA_RESETHAND | __ARCH_UAPI_SA_FLAGS) + SA_NODEFER | SA_RESETHAND | SA_EXPOSE_TAGBITS | __ARCH_UAPI_SA_FLAGS) #endif /* _LINUX_SIGNAL_TYPES_H */ diff --git a/include/uapi/asm-generic/signal-defs.h b/include/uapi/asm-generic/signal-defs.h index c790f67304ba..fe929e7b77ca 100644 --- a/include/uapi/asm-generic/signal-defs.h +++ b/include/uapi/asm-generic/signal-defs.h @@ -20,6 +20,8 @@ * so this bit allows flag bit support to be detected from userspace while * allowing an old kernel to be distinguished from a kernel that supports every * flag bit. + * SA_EXPOSE_TAGBITS exposes an architecture-defined set of tag bits in + * siginfo.si_addr. * * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single * Unix names RESETHAND and NODEFER respectively. 
@@ -41,6 +43,7 @@ /* 0x00000100 used on sparc */ /* 0x00000200 used on sparc */ #define SA_UNSUPPORTED 0x00000400 +#define SA_EXPOSE_TAGBITS 0x00000800 /* 0x00010000 used on mips */ /* 0x01000000 used on x86 */ /* 0x02000000 used on x86 */ diff --git a/kernel/signal.c b/kernel/signal.c index 8f34819e80de..26018c59821d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2524,6 +2524,26 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info) return signr; } +static void hide_si_addr_tag_bits(struct ksignal *ksig) +{ + switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { + case SIL_FAULT: + case SIL_FAULT_MCEERR: + case SIL_FAULT_BNDERR: + case SIL_FAULT_PKUERR: + ksig->info.si_addr = arch_untagged_si_addr( + ksig->info.si_addr, ksig->sig, ksig->info.si_code); + break; + case SIL_KILL: + case SIL_TIMER: + case SIL_POLL: + case SIL_CHLD: + case SIL_RT: + case SIL_SYS: + break; + } +} + bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; @@ -2761,6 +2781,10 @@ relock: spin_unlock_irq(&sighand->siglock); ksig->sig = signr; + + if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) + hide_si_addr_tag_bits(ksig); + return ksig->sig > 0; } -- cgit From 4e1d9a737d00f2cc811dc5654f82c92c7d80e98c Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Thu, 19 Nov 2020 08:25:39 +0100 Subject: PM: sleep: Add device_wakeup_path() helper Add a device_wakeup_path() helper to avoid spreading dev->power.wakeup_path tests across drivers. Signed-off-by: Patrice Chotard Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 4 ++-- drivers/base/power/main.c | 4 ++-- drivers/i2c/busses/i2c-stm32f7.c | 4 ++-- include/linux/pm_wakeup.h | 10 ++++++++++ 4 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 743268996336..e0894ef8457c 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1142,7 +1142,7 @@ static int genpd_finish_suspend(struct device *dev, bool poweroff) if (ret) return ret; - if (dev->power.wakeup_path && genpd_is_active_wakeup(genpd)) + if (device_wakeup_path(dev) && genpd_is_active_wakeup(genpd)) return 0; if (genpd->dev_ops.stop && genpd->dev_ops.start && @@ -1196,7 +1196,7 @@ static int genpd_resume_noirq(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - if (dev->power.wakeup_path && genpd_is_active_wakeup(genpd)) + if (device_wakeup_path(dev) && genpd_is_active_wakeup(genpd)) return pm_generic_resume_noirq(dev); genpd_lock(genpd); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2b4255dc101e..46793276598d 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1359,7 +1359,7 @@ static void dpm_propagate_wakeup_to_parent(struct device *dev) spin_lock_irq(&parent->power.lock); - if (dev->power.wakeup_path && !parent->power.ignore_children) + if (device_wakeup_path(dev) && !parent->power.ignore_children) parent->power.wakeup_path = true; spin_unlock_irq(&parent->power.lock); @@ -1627,7 +1627,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) goto Complete; /* Avoid direct_complete to let wakeup_path propagate.
*/ - if (device_may_wakeup(dev) || dev->power.wakeup_path) + if (device_may_wakeup(dev) || device_wakeup_path(dev)) dev->power.direct_complete = false; if (dev->power.direct_complete) { diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index f41f51a176a1..9aa8e65b511e 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -2322,7 +2322,7 @@ static int stm32f7_i2c_suspend(struct device *dev) i2c_mark_adapter_suspended(&i2c_dev->adap); - if (!device_may_wakeup(dev) && !dev->power.wakeup_path) { + if (!device_may_wakeup(dev) && !device_wakeup_path(dev)) { ret = stm32f7_i2c_regs_backup(i2c_dev); if (ret < 0) { i2c_mark_adapter_resumed(&i2c_dev->adap); @@ -2341,7 +2341,7 @@ static int stm32f7_i2c_resume(struct device *dev) struct stm32f7_i2c_dev *i2c_dev = dev_get_drvdata(dev); int ret; - if (!device_may_wakeup(dev) && !dev->power.wakeup_path) { + if (!device_may_wakeup(dev) && !device_wakeup_path(dev)) { ret = pm_runtime_force_resume(dev); if (ret < 0) return ret; diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index aa3da6611533..196a157456aa 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -84,6 +84,11 @@ static inline bool device_may_wakeup(struct device *dev) return dev->power.can_wakeup && !!dev->power.wakeup; } +static inline bool device_wakeup_path(struct device *dev) +{ + return dev->power.wakeup_path; +} + static inline void device_set_wakeup_path(struct device *dev) { dev->power.wakeup_path = true; @@ -174,6 +179,11 @@ static inline bool device_may_wakeup(struct device *dev) return dev->power.can_wakeup && dev->power.should_wakeup; } +static inline bool device_wakeup_path(struct device *dev) +{ + return false; +} + static inline void device_set_wakeup_path(struct device *dev) {} static inline void __pm_stay_awake(struct wakeup_source *ws) {} -- cgit From a94ef811f7c3748736b85db0406da8e4ea391ac6 Mon Sep 17 00:00:00 2001 From: Lina Iyer Date: Thu, 19 Nov 2020 09:43:25 -0700 Subject: PM: domains: replace -ENOTSUPP with -EOPNOTSUPP While submitting a patch to add next_wakeup, checkpatch reported this - WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP + return -ENOTSUPP; Address the above warning in other functions in pm_domain.h. Reviewed-by: Ulf Hansson Signed-off-by: Lina Iyer Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm_domain.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 1ad0ec481416..5fd20d117cbe 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -255,24 +255,24 @@ static inline int pm_genpd_init(struct generic_pm_domain *genpd, } static inline int pm_genpd_remove(struct generic_pm_domain *genpd) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_genpd_remove_notifier(struct device *dev) { - return -ENOTSUPP; + return -EOPNOTSUPP; } #define simple_qos_governor (*(struct dev_power_governor *)(NULL)) @@ -325,13 +325,13 @@ struct device *genpd_dev_pm_attach_by_name(struct device *dev, static inline int of_genpd_add_provider_simple(struct device_node *np, struct generic_pm_domain *genpd) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int of_genpd_add_provider_onecell(struct device_node *np, struct genpd_onecell_data *data) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline void of_genpd_del_provider(struct device_node *np) {} @@ -387,7 +387,7 @@ static inline struct device *genpd_dev_pm_attach_by_name(struct device *dev, static inline struct generic_pm_domain *of_genpd_remove_last(struct device_node *np) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_PM_GENERIC_DOMAINS_OF */ -- cgit From 8eb621698fd4c49703d512fe437d84ab822bc59e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 16 Sep 2020 11:12:03 +0100 Subject: keys: Provide the original description to the key preparser Provide the proposed description (add key) or the original description (update/instantiate key) when preparsing a key so that the key type can validate it against the data. This is important for rxrpc server keys as we need to check that they have the right amount of key material present - and it's better to do that when the key is loaded rather than deep in trying to process a response packet. Signed-off-by: David Howells cc: Jarkko Sakkinen cc: keyrings@vger.kernel.org --- include/linux/key-type.h | 1 + security/keys/key.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 2ab2d6d6aeab..7d985a1dfe4a 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -29,6 +29,7 @@ struct kernel_pkey_params; * clear the contents. 
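 *
 * A sketch of the intended use (foo_preparse() and FOO_DESC_LEN are
 * invented for illustration): a key type can now validate the name it
 * was given against the payload at preparse time:
 *
 *	static int foo_preparse(struct key_preparsed_payload *prep)
 *	{
 *		if (prep->orig_description &&
 *		    strlen(prep->orig_description) != FOO_DESC_LEN)
 *			return -EINVAL;
 *		return 0;
 *	}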
*/ struct key_preparsed_payload { + const char *orig_description; /* Actual or proposed description (maybe NULL) */ char *description; /* Proposed key description (or NULL) */ union key_payload payload; /* Proposed payload */ const void *data; /* Raw data */ diff --git a/security/keys/key.c b/security/keys/key.c index e282c6179b21..ebe752b137aa 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -504,6 +504,7 @@ int key_instantiate_and_link(struct key *key, int ret; memset(&prep, 0, sizeof(prep)); + prep.orig_description = key->description; prep.data = data; prep.datalen = datalen; prep.quotalen = key->type->def_datalen; @@ -854,6 +855,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, goto error_put_type; memset(&prep, 0, sizeof(prep)); + prep.orig_description = description; prep.data = payload; prep.datalen = plen; prep.quotalen = index_key.type->def_datalen; -- cgit From 076d38b88c4146fd5506933d440b881741b6ab60 Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Fri, 20 Nov 2020 09:41:04 +0100 Subject: net: ptp: introduce common defines for PTP message types Using PTP-wide defines will make various driver-internal defines and uses of magic numbers obsolete. Signed-off-by: Christian Eggers Cc: Kurt Kanzenbach Reviewed-by: Vladimir Oltean Reviewed-by: Richard Cochran Signed-off-by: Jakub Kicinski --- include/linux/ptp_classify.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index c6487b7ab026..ae04968a3a47 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -31,6 +31,11 @@ #define PTP_CLASS_V2_VLAN (PTP_CLASS_V2 | PTP_CLASS_VLAN) #define PTP_CLASS_L4 (PTP_CLASS_IPV4 | PTP_CLASS_IPV6) +#define PTP_MSGTYPE_SYNC 0x0 +#define PTP_MSGTYPE_DELAY_REQ 0x1 +#define PTP_MSGTYPE_PDELAY_REQ 0x2 +#define PTP_MSGTYPE_PDELAY_RESP 0x3 + #define PTP_EV_PORT 319 #define PTP_GEN_BIT 0x08 /* indicates general message, if set in message type */ @@ -140,7 +145,7 @@ static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, /* The return is meaningless. The stub function would not be * executed since no available header from ptp_parse_header. */ - return 0; + return PTP_MSGTYPE_SYNC; } #endif #endif /* _PTP_CLASSIFY_H_ */ -- cgit From 3df98d79215ace13d1e91ddfc5a67a0f5acbd83f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sun, 27 Sep 2020 22:38:26 -0400 Subject: lsm,selinux: pass flowi_common instead of flowi to the LSM hooks As pointed out by Herbert in a recent related patch, the LSM hooks do not have the necessary address family information to use the flowi struct safely. As none of the LSMs currently use any of the protocol specific flowi information, replace the flowi pointers with pointers to the address family independent flowi_common struct.
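The conversion helpers added below are cheap because struct flowi keeps the per-family structs in a union whose first member is the shared part; a condensed sketch of the layout (abridged from include/net/flow.h, most fields elided):

	struct flowi4 {
		struct flowi_common	__fl_common;
		/* IPv4-specific members elided */
	};

	struct flowi {
		union {
			struct flowi_common	__fl_common;
			struct flowi4		ip4;
			struct flowi6		ip6;
		} u;
	};

so flowi4_to_flowi_common() and flowi6_to_flowi_common() simply return the address of that first member.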
Reported-by: Herbert Xu Acked-by: James Morris Signed-off-by: Paul Moore --- .../chelsio/inline_crypto/chtls/chtls_cm.c | 2 +- drivers/net/wireguard/socket.c | 4 ++-- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/lsm_hooks.h | 2 +- include/linux/security.h | 23 +++++++++++++--------- include/net/flow.h | 10 ++++++++++ include/net/route.h | 6 +++--- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 6 +++--- net/ipv4/icmp.c | 4 ++-- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/syncookies.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipv6/datagram.c | 2 +- net/ipv6/icmp.c | 6 +++--- net/ipv6/inet6_connection_sock.c | 4 ++-- net/ipv6/netfilter/nf_reject_ipv6.c | 2 +- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- net/ipv6/udp.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/netfilter/nf_synproxy_core.c | 2 +- net/xfrm/xfrm_state.c | 6 ++++-- security/security.c | 17 ++++++++-------- security/selinux/hooks.c | 4 ++-- security/selinux/include/xfrm.h | 2 +- security/selinux/xfrm.c | 13 ++++++------ 33 files changed, 85 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index ec4f79049a06..42e4e43cdd91 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1148,7 +1148,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk, fl6.daddr = ip6h->saddr; fl6.fl6_dport = inet_rsk(oreq)->ir_rmt_port; fl6.fl6_sport = htons(inet_rsk(oreq)->ir_num); - security_req_classify_flow(oreq, flowi6_to_flowi(&fl6)); + security_req_classify_flow(oreq, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(sock_net(lsk), lsk, &fl6, NULL); if (IS_ERR(dst)) goto free_sk; diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index c33e2c81635f..410b318e57fb 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -49,7 +49,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, rt = dst_cache_get_ip4(cache, &fl.saddr); if (!rt) { - security_sk_classify_flow(sock, flowi4_to_flowi(&fl)); + security_sk_classify_flow(sock, flowi4_to_flowi_common(&fl)); if (unlikely(!inet_confirm_addr(sock_net(sock), NULL, 0, fl.saddr, RT_SCOPE_HOST))) { endpoint->src4.s_addr = 0; @@ -129,7 +129,7 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, dst = dst_cache_get_ip6(cache, &fl.saddr); if (!dst) { - security_sk_classify_flow(sock, flowi6_to_flowi(&fl)); + security_sk_classify_flow(sock, flowi6_to_flowi_common(&fl)); if (unlikely(!ipv6_addr_any(&fl.saddr) && !ipv6_chk_addr(sock_net(sock), &fl.saddr, NULL, 0))) { endpoint->src6 = fl.saddr = in6addr_any; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 32a940117e7a..f70984c8318b 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -311,7 +311,7 @@ LSM_HOOK(int, 0, secmark_relabel_packet, u32 secid) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_inc, void) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_dec, void) LSM_HOOK(void, LSM_RET_VOID, req_classify_flow, const struct request_sock *req, - struct flowi *fl) + struct flowi_common *flic) LSM_HOOK(int, 0, tun_dev_alloc_security, void **security) LSM_HOOK(void, LSM_RET_VOID, tun_dev_free_security, void *security) LSM_HOOK(int, 0, tun_dev_create, 
void) @@ -351,7 +351,7 @@ LSM_HOOK(int, 0, xfrm_state_delete_security, struct xfrm_state *x) LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) LSM_HOOK(int, 1, xfrm_state_pol_flow_match, struct xfrm_state *x, - struct xfrm_policy *xp, const struct flowi *fl) + struct xfrm_policy *xp, const struct flowi_common *flic) LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid, int ckall) #endif /* CONFIG_SECURITY_NETWORK_XFRM */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index c503f7ab8afb..a19adef1f088 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1105,7 +1105,7 @@ * @xfrm_state_pol_flow_match: * @x contains the state to match. * @xp contains the policy to check for a match. - * @fl contains the flow to check for a match. + * @flic contains the flowi_common struct to check for a match. * Return 1 if there is a match. * @xfrm_decode_session: * @skb points to skb to decode. diff --git a/include/linux/security.h b/include/linux/security.h index bc2725491560..469273e8b46d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -167,7 +167,7 @@ struct sk_buff; struct sock; struct sockaddr; struct socket; -struct flowi; +struct flowi_common; struct dst_entry; struct xfrm_selector; struct xfrm_policy; @@ -1355,8 +1355,9 @@ int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u int security_sk_alloc(struct sock *sk, int family, gfp_t priority); void security_sk_free(struct sock *sk); void security_sk_clone(const struct sock *sk, struct sock *newsk); -void security_sk_classify_flow(struct sock *sk, struct flowi *fl); -void security_req_classify_flow(const struct request_sock *req, struct flowi *fl); +void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic); +void security_req_classify_flow(const struct request_sock *req, + struct flowi_common *flic); void security_sock_graft(struct sock*sk, struct socket *parent); int security_inet_conn_request(struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -1507,11 +1508,13 @@ static inline void security_sk_clone(const struct sock *sk, struct sock *newsk) { } -static inline void security_sk_classify_flow(struct sock *sk, struct flowi *fl) +static inline void security_sk_classify_flow(struct sock *sk, + struct flowi_common *flic) { } -static inline void security_req_classify_flow(const struct request_sock *req, struct flowi *fl) +static inline void security_req_classify_flow(const struct request_sock *req, + struct flowi_common *flic) { } @@ -1638,9 +1641,9 @@ void security_xfrm_state_free(struct xfrm_state *x); int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir); int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, - const struct flowi *fl); + const struct flowi_common *flic); int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid); -void security_skb_classify_flow(struct sk_buff *skb, struct flowi *fl); +void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic); #else /* CONFIG_SECURITY_NETWORK_XFRM */ @@ -1692,7 +1695,8 @@ static inline int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_s } static inline int security_xfrm_state_pol_flow_match(struct xfrm_state *x, - struct xfrm_policy *xp, const struct flowi *fl) + struct xfrm_policy *xp, + const struct flowi_common *flic) { return 1; } @@ -1702,7 +1706,8 @@ static inline int security_xfrm_decode_session(struct sk_buff *skb, u32 
*secid) return 0; } -static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi *fl) +static inline void security_skb_classify_flow(struct sk_buff *skb, + struct flowi_common *flic) { } diff --git a/include/net/flow.h b/include/net/flow.h index b2531df3f65f..39d0cedcddee 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -195,11 +195,21 @@ static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) return container_of(fl4, struct flowi, u.ip4); } +static inline struct flowi_common *flowi4_to_flowi_common(struct flowi4 *fl4) +{ + return &(flowi4_to_flowi(fl4)->u.__fl_common); +} + static inline struct flowi *flowi6_to_flowi(struct flowi6 *fl6) { return container_of(fl6, struct flowi, u.ip6); } +static inline struct flowi_common *flowi6_to_flowi_common(struct flowi6 *fl6) +{ + return &(flowi6_to_flowi(fl6)->u.__fl_common); +} + static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn) { return container_of(fldn, struct flowi, u.dn); diff --git a/include/net/route.h b/include/net/route.h index ff021cab657e..2e6c0e153e3a 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -165,7 +165,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi sk ? inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } @@ -322,7 +322,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, ip_rt_put(rt); flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr); } - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } @@ -338,7 +338,7 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable flowi4_update_output(fl4, sk->sk_bound_dev_if, RT_CONN_FLAGS(sk), fl4->daddr, fl4->saddr); - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(sock_net(sk), fl4, sk); } return rt; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index bb3d70664dde..3d0b58d7bf3f 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -464,7 +464,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, .fl4_dport = dccp_hdr(skb)->dccph_sport, }; - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ef4ab28cfde0..179dcf02a572 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -203,7 +203,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req fl6.flowi6_oif = ireq->ir_iif; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = htons(ireq->ir_num); - security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); @@ -279,7 +279,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) fl6.flowi6_oif = inet6_iif(rxskb); fl6.fl6_dport = dccp_hdr(skb)->dccph_dport; fl6.fl6_sport = dccp_hdr(skb)->dccph_sport; - security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6)); /* sk = NULL, but it is safe for now. 
RST socket required. */ dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); @@ -907,7 +907,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 005faea415a4..396b492c804f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -447,7 +447,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; @@ -503,7 +503,7 @@ static struct rtable *icmp_route_lookup(struct net *net, route_lookup_dev = icmp_get_route_lookup_dev(skb_in); fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); - security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); + security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4148f5f78f31..1b7801371f7d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; @@ -640,7 +640,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 879b76ae4435..89fff5f59eea 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1700,7 +1700,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest, arg->uid); - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 248856b301c4..8b943f85fff9 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -778,7 +778,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4.fl4_icmp_type = user_icmph.type; fl4.fl4_icmp_code = user_icmph.code; - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 7d26e0f8bdae..50a73178d63a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -640,7 +640,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto done; } - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6ac473b47f30..36b3b9b72b75 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -418,7 +418,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) inet_sk_flowi_flags(sk), opt->srr ? 
opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { reqsk_free(req); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 09f0a23d1a01..594632ff094f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1197,7 +1197,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) faddr, saddr, dport, inet->inet_sport, sk->sk_uid); - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e648fbebb167..b8e491d3acf0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -819,7 +819,7 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sk->sk_uid; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index cc8ad7ddecda..206f66310a88 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -60,7 +60,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) fl6->flowi6_oif = np->mcast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ec448b71bf9a..91d9136b6e9c 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -567,7 +567,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); np = inet6_sk(sk); @@ -749,7 +749,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); local_bh_disable(); sk = icmpv6_xmit_lock(net); @@ -1002,7 +1002,7 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, fl6->fl6_icmp_type = type; fl6->fl6_icmp_code = 0; fl6->flowi6_oif = oif; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } static void __net_exit icmpv6_sk_exit(struct net *net) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index e315526fa244..5a9f4d722f35 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -46,7 +46,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); fl6->flowi6_uid = sk->sk_uid; - security_req_classify_flow(req, flowi6_to_flowi(fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) @@ -95,7 +95,7 @@ static struct dst_entry 
*inet6_csk_route_socket(struct sock *sk, fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; fl6->flowi6_uid = sk->sk_uid; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 4aef6baaa55e..bf95513736c9 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -179,7 +179,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); - security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 6caa062f68e7..6ac88fe24a8e 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -111,7 +111,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_uid = sk->sk_uid; fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); ipcm6_init_sk(&ipc6, np); ipc6.sockc.mark = sk->sk_mark; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 6e4ab80a3b94..1f56d9aae589 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -915,7 +915,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_oif = np->mcast_oif; else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index e796a64be308..0351e2ffc707 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -233,7 +233,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; fl6.flowi6_uid = sk->sk_uid; - security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8db59f4e5f13..e5f75bbcc66f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -278,7 +278,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -954,7 +954,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? 
sk : NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 29d9691359b9..724942afc035 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1496,7 +1496,7 @@ do_udp_sendmsg: } else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (ipc6.tclass < 0) ipc6.tclass = np->tclass; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index e5e5036257b0..96f975777438 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -606,7 +606,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (ipc6.tclass < 0) ipc6.tclass = np->tclass; diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 9cca35d22927..fb64e8515f7d 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -849,7 +849,7 @@ synproxy_send_tcp_ipv6(struct net *net, fl6.fl6_sport = nth->source; fl6.fl6_dport = nth->dest; security_skb_classify_flow((struct sk_buff *)skb, - flowi6_to_flowi(&fl6)); + flowi6_to_flowi_common(&fl6)); err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (err) { goto free_nskb; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index bbd4643d7e82..ac25b0c2d87e 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1021,7 +1021,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, if ((x->sel.family && (x->sel.family != family || !xfrm_selector_match(&x->sel, fl, family))) || - !security_xfrm_state_pol_flow_match(x, pol, fl)) + !security_xfrm_state_pol_flow_match(x, pol, + &fl->u.__fl_common)) return; if (!*best || @@ -1036,7 +1037,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, if ((!x->sel.family || (x->sel.family == family && xfrm_selector_match(&x->sel, fl, family))) && - security_xfrm_state_pol_flow_match(x, pol, fl)) + security_xfrm_state_pol_flow_match(x, pol, + &fl->u.__fl_common)) *error = -ESRCH; } } diff --git a/security/security.c b/security/security.c index a28045dc9e7f..c08e0eec8b9e 100644 --- a/security/security.c +++ b/security/security.c @@ -2207,15 +2207,16 @@ void security_sk_clone(const struct sock *sk, struct sock *newsk) } EXPORT_SYMBOL(security_sk_clone); -void security_sk_classify_flow(struct sock *sk, struct flowi *fl) +void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic) { - call_void_hook(sk_getsecid, sk, &fl->flowi_secid); + call_void_hook(sk_getsecid, sk, &flic->flowic_secid); } EXPORT_SYMBOL(security_sk_classify_flow); -void security_req_classify_flow(const struct request_sock *req, struct flowi *fl) +void security_req_classify_flow(const struct request_sock *req, + struct flowi_common *flic) { - call_void_hook(req_classify_flow, req, fl); + call_void_hook(req_classify_flow, req, flic); } EXPORT_SYMBOL(security_req_classify_flow); @@ -2407,7 +2408,7 @@ int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, - const struct flowi *fl) 
+ const struct flowi_common *flic) { struct security_hook_list *hp; int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match); @@ -2423,7 +2424,7 @@ int security_xfrm_state_pol_flow_match(struct xfrm_state *x, */ hlist_for_each_entry(hp, &security_hook_heads.xfrm_state_pol_flow_match, list) { - rc = hp->hook.xfrm_state_pol_flow_match(x, xp, fl); + rc = hp->hook.xfrm_state_pol_flow_match(x, xp, flic); break; } return rc; @@ -2434,9 +2435,9 @@ int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) return call_int_hook(xfrm_decode_session, 0, skb, secid, 1); } -void security_skb_classify_flow(struct sk_buff *skb, struct flowi *fl) +void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic) { - int rc = call_int_hook(xfrm_decode_session, 0, skb, &fl->flowi_secid, + int rc = call_int_hook(xfrm_decode_session, 0, skb, &flic->flowic_secid, 0); BUG_ON(rc); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 943c2693cec7..a515abdf115b 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5437,9 +5437,9 @@ static void selinux_secmark_refcount_dec(void) } static void selinux_req_classify_flow(const struct request_sock *req, - struct flowi *fl) + struct flowi_common *flic) { - fl->flowi_secid = req->secid; + flic->flowic_secid = req->secid; } static int selinux_tun_dev_alloc_security(void **security) diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index a0b465316292..0a6f34a7a971 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -26,7 +26,7 @@ int selinux_xfrm_state_delete(struct xfrm_state *x); int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir); int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, - const struct flowi *fl); + const struct flowi_common *flic); #ifdef CONFIG_SECURITY_NETWORK_XFRM extern atomic_t selinux_xfrm_refcount; diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 7314196185d1..c367d36965d4 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -175,9 +175,10 @@ int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) */ int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, - const struct flowi *fl) + const struct flowi_common *flic) { u32 state_sid; + u32 flic_sid; if (!xp->security) if (x->security) @@ -196,17 +197,17 @@ int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, return 0; state_sid = x->security->ctx_sid; + flic_sid = flic->flowic_secid; - if (fl->flowi_secid != state_sid) + if (flic_sid != state_sid) return 0; /* We don't need a separate SA Vs. policy polmatch check since the SA * is now of the same label as the flow and a flow Vs. policy polmatch * check had already happened in selinux_xfrm_policy_lookup() above. */ - return (avc_has_perm(&selinux_state, - fl->flowi_secid, state_sid, - SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, - NULL) ? 0 : 1); + return (avc_has_perm(&selinux_state, flic_sid, state_sid, + SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, + NULL) ? 0 : 1); } static u32 selinux_xfrm_skb_sid_egress(struct sk_buff *skb) -- cgit From cc69837fcaf467426ca19e5790085c26146a2300 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 20 Nov 2020 14:50:52 -0800 Subject: net: don't include ethtool.h from netdevice.h linux/netdevice.h is included in very many places; touching any of its dependencies causes large incremental builds.
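The change relies on a standard C pattern, sketched in isolation here (the struct and file names are invented): a header that only stores a pointer can get by with a forward declaration, and only translation units that dereference the pointer need the full definition:

	/* some_netdev.h: no linux/ethtool.h include required */
	struct ethtool_ops;				/* forward declaration */

	struct some_netdev {
		const struct ethtool_ops *ethtool_ops;	/* pointer member is fine */
	};

	/* some_driver.c: calls through the ops, so it includes the real header */
	#include <linux/ethtool.h>
	#include "some_netdev.h"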
Drop the linux/ethtool.h include, linux/netdevice.h just needs a forward declaration of struct ethtool_ops. Fix all the places which made use of this implicit include. Acked-by: Johannes Berg Acked-by: Shannon Nelson Reviewed-by: Jesse Brandeburg Link: https://lore.kernel.org/r/20201120225052.1427503-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/isdn/capi/capi.c | 1 + drivers/media/pci/ttpci/av7110_av.c | 1 + drivers/net/bonding/bond_procfs.c | 1 + drivers/net/can/usb/gs_usb.c | 1 + drivers/net/ethernet/amazon/ena/ena_ethtool.c | 1 + drivers/net/ethernet/aquantia/atlantic/aq_nic.h | 2 ++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 1 + drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 1 + drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c | 1 + drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 + drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c | 1 + drivers/net/ethernet/google/gve/gve_ethtool.c | 1 + drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 + drivers/net/ethernet/huawei/hinic/hinic_port.h | 1 + drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 1 + drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h | 1 + drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 + drivers/net/ethernet/mellanox/mlxsw/core_env.h | 3 +++ drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 1 + drivers/net/ethernet/mellanox/mlxsw/switchx2.c | 1 + drivers/net/ethernet/pensando/ionic/ionic_lif.c | 1 + drivers/net/ethernet/pensando/ionic/ionic_stats.c | 1 + drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 1 + drivers/net/geneve.c | 1 + drivers/net/hyperv/netvsc_drv.c | 1 + drivers/net/hyperv/rndis_filter.c | 1 + drivers/net/ipvlan/ipvlan_main.c | 2 ++ drivers/net/nlmon.c | 1 + drivers/net/team/team.c | 1 + drivers/net/vrf.c | 1 + drivers/net/vsockmon.c | 1 + drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 2 ++ drivers/scsi/fcoe/fcoe_transport.c | 1 + drivers/staging/fsl-dpaa2/ethsw/ethsw-ethtool.c | 2 ++ drivers/staging/wimax/i2400m/usb.c | 1 + include/linux/netdevice.h | 2 +- include/linux/qed/qed_if.h | 1 + include/net/cfg80211.h | 1 + include/rdma/ib_addr.h | 1 + include/rdma/ib_verbs.h | 1 + net/packet/af_packet.c | 1 + net/sched/sch_cbs.c | 1 + net/sched/sch_taprio.c | 1 + net/socket.c | 1 + 45 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 85767f52fe3c..fdf87acccd06 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include diff --git a/drivers/media/pci/ttpci/av7110_av.c b/drivers/media/pci/ttpci/av7110_av.c index ea9f7d0058a2..91f4866c7e59 100644 --- a/drivers/media/pci/ttpci/av7110_av.c +++ b/drivers/media/pci/ttpci/av7110_av.c @@ -11,6 +11,7 @@ * the project's page is at https://linuxtv.org */ +#include #include #include #include diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index fd5c9cbe45b1..56d34be5e797 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index b3431c3feba1..a0336e895d94 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -9,6 +9,7 @@ * Many thanks to all socketcan devs! 
*/ +#include #include #include #include diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c index 3b2cd28f962d..6cdd9efe8df3 100644 --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c @@ -3,6 +3,7 @@ * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ +#include #include #include "ena_netdev.h" diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h index 926cca9a0c83..1a7148041e3d 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h @@ -10,6 +10,8 @@ #ifndef AQ_NIC_H #define AQ_NIC_H +#include + #include "aq_common.h" #include "aq_rss.h" #include "aq_hw.h" diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 47b3c3127879..950ea26ae0d2 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -20,6 +20,7 @@ #define DRV_VER_MIN 10 #define DRV_VER_UPD 1 +#include #include #include #include diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 23b80aa171dd..a217316228f4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -8,6 +8,7 @@ * the Free Software Foundation. */ +#include #include #include #include diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c index 16eebfc52109..66f2c553370c 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c @@ -15,6 +15,7 @@ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, TITLE, or * NONINFRINGEMENT. See the GNU General Public License for more details. ***********************************************************************/ +#include #include #include #include diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c index c7bdac79299a..2f218fbfed06 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c @@ -5,6 +5,7 @@ /* ETHTOOL Support for VNIC_VF Device*/ +#include #include #include diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 27308600da15..8e681ce72d62 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -39,6 +39,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c index cd8f9a481d73..d546993bda09 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c @@ -33,6 +33,7 @@ * SOFTWARE. */ +#include #include #include "t4vf_common.h" diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index 7b44769bd87c..2fb197fd3daf 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -4,6 +4,7 @@ * Copyright (C) 2015-2019 Google, Inc. 
*/ +#include #include #include "gve.h" #include "gve_adminq.h" diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 5bae5e859c81..f6fac2418648 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/huawei/hinic/hinic_port.h b/drivers/net/ethernet/huawei/hinic/hinic_port.h index 9c3cbe45c9ec..c9ae3d4dc547 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_port.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_port.h @@ -8,6 +8,7 @@ #define HINIC_PORT_H #include +#include #include #include diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c index 908fefaa6b85..66776ba7bfb6 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2019 Intel Corporation. */ +#include #include #include "fm10k.h" diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index ceec649bdd13..103430400a8a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -11,6 +11,7 @@ #ifndef OTX2_COMMON_H #define OTX2_COMMON_H +#include #include #include #include diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 014ce8d3d97b..1c50d0f22199 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -36,6 +36,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_env.h b/drivers/net/ethernet/mellanox/mlxsw/core_env.h index 8e36a2634ef5..2b23f8a87862 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_env.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core_env.h @@ -4,6 +4,9 @@ #ifndef _MLXSW_CORE_ENV_H #define _MLXSW_CORE_ENV_H +struct ethtool_modinfo; +struct ethtool_eeprom; + int mlxsw_env_module_temp_thresholds_get(struct mlxsw_core *core, int module, int off, int *temp); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 74b3959b36d4..642099fee380 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -4,6 +4,7 @@ #ifndef _MLXSW_SPECTRUM_H #define _MLXSW_SPECTRUM_H +#include #include #include #include diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c index 5023d91269f4..40e2e79d4517 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c +++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index deabd813e3fe..0afec2fa572d 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2017 - 2019 Pensando Systems, Inc */ +#include #include #include #include diff --git a/drivers/net/ethernet/pensando/ionic/ionic_stats.c b/drivers/net/ethernet/pensando/ionic/ionic_stats.c index ff20a2ac4c2f..6ae75b771a15 
100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_stats.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_stats.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2017 - 2019 Pensando Systems, Inc */ +#include #include #include #include diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c index d58b51d277f1..ca1535ebb6e7 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include "rmnet_config.h" diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index ef9b5ea9073b..5523f069b9a5 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 261e6e55a907..d17bbc75f5e7 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index b22e47bcfeca..2c2b55c32a7a 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -6,6 +6,7 @@ * Haiyang Zhang * Hank Janssen */ +#include #include #include #include diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 60b7d93bb834..a707502a0c0f 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -2,6 +2,8 @@ /* Copyright (c) 2014 Mahesh Bandewar */ +#include + #include "ipvlan.h" static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c index afb119f38325..5e19a6839dea 100644 --- a/drivers/net/nlmon.c +++ b/drivers/net/nlmon.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index b4092127a92c..c19dac21c468 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -4,6 +4,7 @@ * Copyright (c) 2011 Jiri Pirko */ +#include #include #include #include diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index f2793ffde191..f8d711a84763 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -9,6 +9,7 @@ * Based on dummy, team and ipvlan drivers */ +#include #include #include #include diff --git a/drivers/net/vsockmon.c b/drivers/net/vsockmon.c index e8563acf98e8..b1bb1b04b664 100644 --- a/drivers/net/vsockmon.c +++ b/drivers/net/vsockmon.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index 6890bbe04a8c..08fb7d5d08b3 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -16,6 +16,8 @@ #include "bnx2fc.h" +#include + static struct list_head adapter_list; static struct list_head if_list; static u32 adapter_count; diff --git a/drivers/scsi/fcoe/fcoe_transport.c b/drivers/scsi/fcoe/fcoe_transport.c index 6e187d0e71fd..b927b3d84523 100644 --- a/drivers/scsi/fcoe/fcoe_transport.c +++ b/drivers/scsi/fcoe/fcoe_transport.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw-ethtool.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw-ethtool.c index ace4a6d28562..ad55cd738847 100644 --- 
a/drivers/staging/fsl-dpaa2/ethsw/ethsw-ethtool.c +++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw-ethtool.c @@ -7,6 +7,8 @@ * */ +#include + #include "ethsw.h" static struct { diff --git a/drivers/staging/wimax/i2400m/usb.c b/drivers/staging/wimax/i2400m/usb.c index 3b84dd7b5567..f250d03ce7c7 100644 --- a/drivers/staging/wimax/i2400m/usb.c +++ b/drivers/staging/wimax/i2400m/usb.c @@ -51,6 +51,7 @@ #include "i2400m-usb.h" #include "linux-wimax-i2400m.h" #include +#include #include #include diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 03433a4c929e..0049e8fe4905 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -34,7 +34,6 @@ #include #include -#include #include #ifdef CONFIG_DCB #include @@ -51,6 +50,7 @@ struct netpoll_info; struct device; +struct ethtool_ops; struct phy_device; struct dsa_port; struct ip_tunnel_parm; diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 57fb295ea41a..68d17a4fbf20 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -7,6 +7,7 @@ #ifndef _QED_IF_H #define _QED_IF_H +#include #include #include #include diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ab249ca5d5d1..78c763dfc99a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -10,6 +10,7 @@ * Copyright (C) 2018-2020 Intel Corporation */ +#include #include #include #include diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index b0e636ac6690..d808dc3d239e 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -7,6 +7,7 @@ #ifndef IB_ADDR_H #define IB_ADDR_H +#include #include #include #include diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9bf6c319a670..3883efd588aa 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -12,6 +12,7 @@ #ifndef IB_VERBS_H #define IB_VERBS_H +#include #include #include #include diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 62ebfaa7adcb..48a0ed836b46 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -46,6 +46,7 @@ * Copyright (C) 2011, */ +#include #include #include #include diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 2eaac2ff380f..459cc240eda9 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -50,6 +50,7 @@ * locredit = max_frame_size * (sendslope / port_transmit_rate) */ +#include #include #include #include diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index b0ad7687ee2c..26fb8a62996b 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -6,6 +6,7 @@ * */ +#include #include #include #include diff --git a/net/socket.c b/net/socket.c index 152b1dcf93c6..bfef11ba35b8 100644 --- a/net/socket.c +++ b/net/socket.c @@ -52,6 +52,7 @@ * Based upon Swansea University Computer Society NET3.039 */ +#include #include #include #include -- cgit From d549699048b4b5c22dd710455bcdb76966e55aa3 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 21 Nov 2020 08:28:17 +0200 Subject: net/packet: fix packet receive on L3 devices without visible hard header In the patchset merged by commit b9fcf0a0d826 ("Merge branch 'support-AF_PACKET-for-layer-3-devices'") L3 devices which did not have header_ops were given one for the purpose of protocol parsing on af_packet transmit path. That change made af_packet receive path regard these devices as having a visible L3 header and therefore aligned incoming skb->data to point to the skb's mac_header. 
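For intuition, the header_ops that the earlier series attached to such L3 devices carry only a protocol parser and no header builder, so the fix below can key off .create (a sketch modeled on the ip_tunnel helpers; abridged):

	/* L3 tunnel: parsing only, nothing to build */
	static const struct header_ops l3_tunnel_header_ops = {
		.parse_protocol	= ip_tunnel_parse_protocol,
		/* no .create(), so dev_has_header() is false */
	};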
Some devices, such as ipip, xfrmi, and others, do not reset their mac_header prior to ingress and therefore their incoming packets became malformed. Ideally these devices would reset their mac headers, or af_packet would be able to rely on dev->hard_header_len being 0 for such cases, but it seems this is not the case. Fix by changing af_packet RX ll visibility criteria to include the existence of a '.create()' header operation, which is used when creating a device hard header - via dev_hard_header() - by upper layers, and does not exist in these L3 devices. As this predicate may be useful in other situations, add it as a common dev_has_header() helper in netdevice.h. Fixes: b9fcf0a0d826 ("Merge branch 'support-AF_PACKET-for-layer-3-devices'") Signed-off-by: Eyal Birger Acked-by: Jason A. Donenfeld Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/20201121062817.3178900-1-eyal.birger@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++++ net/packet/af_packet.c | 18 +++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 964b494b0e8d..fa275a054f46 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3137,6 +3137,11 @@ static inline bool dev_validate_header(const struct net_device *dev, return false; } +static inline bool dev_has_header(const struct net_device *dev) +{ + return dev->header_ops && dev->header_ops->create; +} + typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, int len, int size); int register_gifconf(unsigned int family, gifconf_func_t *gifconf); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index cefbd50c1090..7a18ffff8551 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -93,8 +93,8 @@ /* Assumptions: - - If the device has no dev->header_ops, there is no LL header visible - above the device. In this case, its hard_header_len should be 0. + - If the device has no dev->header_ops->create, there is no LL header + visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. @@ -108,26 +108,26 @@ On receive: ----------- -Incoming, dev->header_ops != NULL +Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data -Outgoing, dev->header_ops != NULL +Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header -Incoming, dev->header_ops == NULL +Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data -Outgoing, dev->header_ops == NULL +Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume - If dev->header_ops == NULL we are unable to restore the ll header, + If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. @@ -2069,7 +2069,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; - if (dev->header_ops) { + if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. 
* @@ -2198,7 +2198,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; - if (dev->header_ops) { + if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { -- cgit From b7cab9be7c16128a0de21ed7ae67211838813437 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Wed, 4 Nov 2020 23:23:58 +0800 Subject: soundwire: SDCA: detect sdca_cascade interrupt The SoundWire 1.2 specification defines an "SDCA cascade" bit which handles a logical OR of all SDCA interrupt sources (up to 30 defined). Due to limitations of the addressing space, this bit is located in the SDW_DP0_INT register when DP0 is used, or alternatively in the DP0_SDCA_Support_INTSTAT register when DP0 is not used. To allow for both cases to be handled, this bit will be checked in the main device-level interrupt handling code. This will result in the register being read twice if DP0 is enabled, but it's not clear how to optimize this case. It's also more logical to deal with this interrupt at the device than the port level, this bit is really not DP0 specific and its location in the DP0_INTSTAT bit is only due to the lack of free space in SCP_INTSTAT_1. The SDCA_Cascade bit cannot be masked or cleared, so the interrupt handling only forwards the detection to the Slave driver, which will deal with reading the relevant SDCA status bits and clearing them. The bus driver only signals the detection. The communication with the Slave driver is based on the same interrupt callback, with only an extension to provide the status of the sdca_cascade bit. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Reviewed-by: Guennadi Liakhovetski Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20201104152358.9518-1-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/bus.c | 28 +++++++++++++++++++++++++++- include/linux/soundwire/sdw.h | 4 ++++ 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 8eaf31e76677..ffe4600fd95b 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -1424,6 +1424,7 @@ static int sdw_handle_slave_alerts(struct sdw_slave *slave) int port_num, stat, ret, count = 0; unsigned long port; bool slave_notify = false; + u8 sdca_cascade = 0; u8 buf, buf2[2], _buf, _buf2[2]; bool parity_check; bool parity_quirk; @@ -1453,6 +1454,16 @@ static int sdw_handle_slave_alerts(struct sdw_slave *slave) goto io_err; } + if (slave->prop.is_sdca) { + ret = sdw_read(slave, SDW_DP0_INT); + if (ret < 0) { + dev_err(slave->bus->dev, + "SDW_DP0_INT read failed:%d\n", ret); + goto io_err; + } + sdca_cascade = ret & SDW_DP0_SDCA_CASCADE; + } + do { /* * Check parity, bus clash and Slave (impl defined) @@ -1489,6 +1500,10 @@ static int sdw_handle_slave_alerts(struct sdw_slave *slave) clear |= SDW_SCP_INT1_IMPL_DEF; } + /* the SDCA interrupts are cleared in the codec driver .interrupt_callback() */ + if (sdca_cascade) + slave_notify = true; + /* Check port 0 - 3 interrupts */ port = buf & SDW_SCP_INT1_PORT0_3; @@ -1526,6 +1541,7 @@ static int sdw_handle_slave_alerts(struct sdw_slave *slave) /* Update the Slave driver */ if (slave_notify && slave->ops && slave->ops->interrupt_callback) { + slave_intr.sdca_cascade = sdca_cascade; slave_intr.control_port = clear; memcpy(slave_intr.port, &port_status, sizeof(slave_intr.port)); @@ -1563,11 
+1579,21 @@ static int sdw_handle_slave_alerts(struct sdw_slave *slave) goto io_err; } + if (slave->prop.is_sdca) { + ret = sdw_read(slave, SDW_DP0_INT); + if (ret < 0) { + dev_err(slave->bus->dev, + "SDW_DP0_INT read failed:%d\n", ret); + goto io_err; + } + sdca_cascade = ret & SDW_DP0_SDCA_CASCADE; + } + /* Make sure no interrupts are pending */ buf &= _buf; buf2[0] &= _buf2[0]; buf2[1] &= _buf2[1]; - stat = buf || buf2[0] || buf2[1]; + stat = buf || buf2[0] || buf2[1] || sdca_cascade; /* * Exit loop if Slave is continuously in ALERT state even diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 41cc1192f9aa..f0b01b728640 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -359,6 +359,7 @@ struct sdw_dpn_prop { * @sink_dpn_prop: Sink Data Port N properties * @scp_int1_mask: SCP_INT1_MASK desired settings * @quirks: bitmask identifying deltas from the MIPI specification + * @is_sdca: the Slave supports the SDCA specification */ struct sdw_slave_prop { u32 mipi_revision; @@ -382,6 +383,7 @@ struct sdw_slave_prop { struct sdw_dpn_prop *sink_dpn_prop; u8 scp_int1_mask; u32 quirks; + bool is_sdca; }; #define SDW_SLAVE_QUIRKS_INVALID_INITIAL_PARITY BIT(0) @@ -479,10 +481,12 @@ struct sdw_slave_id { /** * struct sdw_slave_intr_status - Slave interrupt status + * @sdca_cascade: set if the Slave device reports an SDCA interrupt * @control_port: control port status * @port: data port status */ struct sdw_slave_intr_status { + bool sdca_cascade; u8 control_port; u8 port[15]; }; -- cgit From 74d862b682f51e45d25b95b1ecf212428a4967b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:42 +0100 Subject: sched: Make migrate_disable/enable() independent of RT Now that the scheduler can deal with migrate disable properly, there is no real compelling reason to make it only available for RT. There are quite a few code paths which needlessly disable preemption in order to prevent migration, and some constructs like kmap_atomic() enforce it implicitly. Making it available independent of RT allows us to provide a preemptible variant of kmap_atomic() and makes the code more consistent in general.
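As a rough illustration (hypothetical code, not part of this patch): with migrate_disable() available on every SMP configuration, a section that needs a stable CPU but may sleep no longer has to fall back to preempt_disable():

	/* Sketch only; assumes a kernel with this patch applied. */
	static void show_pinned_cpu(void)
	{
		migrate_disable();	/* preemptible, but pinned to this CPU */
		/*
		 * smp_processor_id() is stable here, and sleeping
		 * operations remain legal because preemption stays on.
		 */
		pr_info("pinned to CPU%d\n", smp_processor_id());
		migrate_enable();
	}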
Signed-off-by: Thomas Gleixner Grudgingly-Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.269943012@linutronix.de --- include/linux/kernel.h | 21 ++++++++++++++------- include/linux/preempt.h | 38 +++----------------------------------- include/linux/sched.h | 2 +- kernel/sched/core.c | 45 +++++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 4 ++-- lib/smp_processor_id.c | 2 +- 6 files changed, 56 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2f05e9128201..665837f9a831 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -204,6 +204,7 @@ extern int _cond_resched(void); extern void ___might_sleep(const char *file, int line, int preempt_offset); extern void __might_sleep(const char *file, int line, int preempt_offset); extern void __cant_sleep(const char *file, int line, int preempt_offset); +extern void __cant_migrate(const char *file, int line); /** * might_sleep - annotation for functions that can sleep @@ -227,6 +228,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); # define cant_sleep() \ do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) # define sched_annotate_sleep() (current->task_state_change = 0) + +/** + * cant_migrate - annotation for functions that cannot migrate + * + * Will print a stack trace if executed in code which is migratable + */ +# define cant_migrate() \ + do { \ + if (IS_ENABLED(CONFIG_SMP)) \ + __cant_migrate(__FILE__, __LINE__); \ + } while (0) + /** * non_block_start - annotate the start of section where sleeping is prohibited * @@ -251,6 +264,7 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) # define cant_sleep() do { } while (0) +# define cant_migrate() do { } while (0) # define sched_annotate_sleep() do { } while (0) # define non_block_start() do { } while (0) # define non_block_end() do { } while (0) @@ -258,13 +272,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) -#ifndef CONFIG_PREEMPT_RT -# define cant_migrate() cant_sleep() -#else - /* Placeholder for now */ -# define cant_migrate() do { } while (0) -#endif - /** * abs - return absolute value of an argument * @x: the value. If it is unsigned type, it is converted to signed type first. diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 8b43922e65df..6df63cbe8bb0 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -322,7 +322,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #endif -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_SMP /* * Migrate-Disable and why it is undesired. @@ -382,43 +382,11 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, extern void migrate_disable(void); extern void migrate_enable(void); -#elif defined(CONFIG_PREEMPT_RT) +#else static inline void migrate_disable(void) { } static inline void migrate_enable(void) { } -#else /* !CONFIG_PREEMPT_RT */ - -/** - * migrate_disable - Prevent migration of the current task - * - * Maps to preempt_disable() which also disables preemption. Use - * migrate_disable() to annotate that the intent is to prevent migration, - * but not necessarily preemption. 
- * - * Can be invoked nested like preempt_disable() and needs the corresponding - * number of migrate_enable() invocations. - */ -static __always_inline void migrate_disable(void) -{ - preempt_disable(); -} - -/** - * migrate_enable - Allow migration of the current task - * - * Counterpart to migrate_disable(). - * - * As migrate_disable() can be invoked nested, only the outermost invocation - * reenables migration. - * - * Currently mapped to preempt_enable(). - */ -static __always_inline void migrate_enable(void) -{ - preempt_enable(); -} - -#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */ +#endif /* CONFIG_SMP */ #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3af9d52fe093..a33f35f68060 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -715,7 +715,7 @@ struct task_struct { const cpumask_t *cpus_ptr; cpumask_t cpus_mask; void *migration_pending; -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_SMP unsigned short migration_disabled; #endif unsigned short migration_flags; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e6473ecaab3c..c962922784d1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1728,8 +1728,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPT_RT - static void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); @@ -1800,8 +1798,6 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) return rq->nr_pinned; } -#endif - /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). @@ -2882,7 +2878,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else +#else /* CONFIG_SMP */ static inline int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, @@ -2891,10 +2887,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, return set_cpus_allowed_ptr(p, new_mask); } -#endif /* CONFIG_SMP */ - -#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT) - static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } static inline bool rq_has_pinned_tasks(struct rq *rq) @@ -2902,7 +2894,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) return false; } -#endif +#endif /* !CONFIG_SMP */ static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) @@ -7924,6 +7916,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_sleep); + +#ifdef CONFIG_SMP +void __cant_migrate(const char *file, int line) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (is_migration_disabled(current)) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > 0) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), is_migration_disabled(current), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_migrate); +#endif #endif #ifdef CONFIG_MAGIC_SYSRQ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 590e6f27068c..f5acb6c5ce49 100644 --- 
a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1056,7 +1056,7 @@ struct rq { struct cpuidle_state *idle_state; #endif -#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP unsigned int nr_pinned; #endif unsigned int push_busy; @@ -1092,7 +1092,7 @@ static inline int cpu_of(struct rq *rq) static inline bool is_migration_disabled(struct task_struct *p) { -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_SMP return p->migration_disabled; #else return false; diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index faaa927ac2c8..1c1dbd300325 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -26,7 +26,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) if (current->nr_cpus_allowed == 1) goto out; -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_SMP if (current->migration_disabled) goto out; #endif -- cgit From 5fbda3ecd14a5343644979c98d6eb65b7e7de9d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:43 +0100 Subject: sched: highmem: Store local kmaps in task struct Instead of storing the map per CPU, provide and use per-task storage. That prepares for local kmaps which are preemptible. The context switch code is preparatory and not yet in use because kmap_atomic() runs with preemption disabled. It will be made usable in the next step. The context switch logic is safe even when an interrupt happens after clearing or before restoring the kmaps. The kmap index in the task struct is not modified, so any nested kmap in an interrupt will use unused indices, and on return the counter is the same as before. Also add an assert into the return to user space code. Going back to user space with an active local kmap is a no-no.
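To make the nesting argument concrete, a hypothetical interleaving (the names refer to the implementation in the diff below):

	task A holds two local kmaps           -> A's kmap_ctrl.idx == 2
	schedule() -> __kmap_local_sched_out() -> clears both PTEs, idx stays 2
	IRQ takes a nested kmap                -> pushes slot 2, pops back to idx 2
	task A scheduled back in               -> __kmap_local_sched_in() restores
	                                          slots 0 and 1 from kmap_ctrl.pteval[]

Since only the owning task ever modifies its index, the nested user cannot collide with the cleared but still accounted slots.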
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.372935758@linutronix.de --- include/linux/highmem-internal.h | 10 ++++ include/linux/sched.h | 9 ++++ kernel/entry/common.c | 2 + kernel/fork.c | 1 + kernel/sched/core.c | 25 ++++++++++ mm/highmem.c | 99 ++++++++++++++++++++++++++++++++++++---- 6 files changed, 136 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 6ceed907b14e..c5a22177db85 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -9,6 +9,16 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); void kunmap_local_indexed(void *vaddr); +void kmap_local_fork(struct task_struct *tsk); +void __kmap_local_sched_out(void); +void __kmap_local_sched_in(void); +static inline void kmap_assert_nomap(void) +{ + DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); +} +#else +static inline void kmap_local_fork(struct task_struct *tsk) { } +static inline void kmap_assert_nomap(void) { } #endif #ifdef CONFIG_HIGHMEM diff --git a/include/linux/sched.h b/include/linux/sched.h index a33f35f68060..8946911cee9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -629,6 +630,13 @@ struct wake_q_node { struct wake_q_node *next; }; +struct kmap_ctrl { +#ifdef CONFIG_KMAP_LOCAL + int idx; + pte_t pteval[KM_MAX_IDX]; +#endif +}; + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -1294,6 +1302,7 @@ struct task_struct { unsigned int sequential_io; unsigned int sequential_io_avg; #endif + struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 2b8366693d5c..4ae1fe0898e9 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -194,6 +195,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) /* Ensure that the address limit is intact and no locks are held */ addr_limit_user_check(); + kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..17dcd1817799 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -930,6 +930,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) account_kernel_stack(tsk, 1); kcov_task_init(tsk); + kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c962922784d1..953abdbe1472 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4094,6 +4094,22 @@ static inline void finish_lock_switch(struct rq *rq) # define finish_arch_post_lock_switch() do { } while (0) #endif +static inline void kmap_local_sched_out(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_out(); +#endif +} + +static inline void kmap_local_sched_in(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_in(); +#endif +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -4116,6 +4132,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, perf_event_task_sched_out(prev, next); 
rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); + kmap_local_sched_out(); prepare_task(next); prepare_arch_switch(next); } @@ -4182,6 +4199,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) finish_lock_switch(rq); finish_arch_post_lock_switch(); kcov_finish_switch(current); + /* + * kmap_local_sched_out() is invoked with rq::lock held and + * interrupts disabled. There is no requirement for that, but the + * sched out code does not have an interrupt enabled section. + * Restoring the maps on sched in does not require interrupts being + * disabled either. + */ + kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); /* diff --git a/mm/highmem.c b/mm/highmem.c index 39aaca1a1ece..d1ef06aa6de6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -365,8 +365,6 @@ EXPORT_SYMBOL(kunmap_high); #include -static DEFINE_PER_CPU(int, __kmap_local_idx); - /* * With DEBUG_KMAP_LOCAL the stack depth is doubled and every second * slot is unused which acts as a guard page @@ -379,23 +377,21 @@ static DEFINE_PER_CPU(int, __kmap_local_idx); static inline int kmap_local_idx_push(void) { - int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1; - WARN_ON_ONCE(in_irq() && !irqs_disabled()); - BUG_ON(idx >= KM_MAX_IDX); - return idx; + current->kmap_ctrl.idx += KM_INCR; + BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); + return current->kmap_ctrl.idx - 1; } static inline int kmap_local_idx(void) { - return __this_cpu_read(__kmap_local_idx) - 1; + return current->kmap_ctrl.idx - 1; } static inline void kmap_local_idx_pop(void) { - int idx = __this_cpu_sub_return(__kmap_local_idx, KM_INCR); - - BUG_ON(idx < 0); + current->kmap_ctrl.idx -= KM_INCR; + BUG_ON(current->kmap_ctrl.idx < 0); } #ifndef arch_kmap_local_post_map @@ -464,6 +460,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) pteval = pfn_pte(pfn, prot); set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); arch_kmap_local_post_map(vaddr, pteval); + current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; preempt_enable(); return (void *)vaddr; @@ -522,10 +519,92 @@ void kunmap_local_indexed(void *vaddr) arch_kmap_local_pre_unmap(addr); pte_clear(&init_mm, addr, kmap_pte - idx); arch_kmap_local_post_unmap(addr); + current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); kmap_local_idx_pop(); preempt_enable(); } EXPORT_SYMBOL(kunmap_local_indexed); + +/* + * Invoked before switch_to(). This is safe even when during or after + * clearing the maps an interrupt which needs a kmap_local happens because + * the task::kmap_ctrl.idx is not modified by the unmapping code so a + * nested kmap_local will use the next unused index and restore the index + * on unmap. The already cleared kmaps of the outgoing task are irrelevant + * because the interrupt context does not know about them. The same applies + * when scheduling back in for an interrupt which happens before the + * restore is complete. + */ +void __kmap_local_sched_out(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Clear kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* + * This is a horrible hack for XTENSA to calculate the + * coloured PTE index. 
Uses the PFN encoded into the pteval + * and the map index calculation because the actual mapped + * virtual address is not stored in task::kmap_ctrl. + * For any sane architecture this is optimized out. + */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); + } +} + +void __kmap_local_sched_in(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Restore kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* See comment in __kmap_local_sched_out() */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte_at(&init_mm, addr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(addr, pteval); + } +} + +void kmap_local_fork(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->kmap_ctrl.idx)) + memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl)); +} + #endif #if defined(HASHED_PAGE_VIRTUAL) -- cgit From f3ba3c710ac5a30cd058615a9eb62d2ad95bb782 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:44 +0100 Subject: mm/highmem: Provide kmap_local* Now that the kmap atomic index is stored in task struct provide a preemptible variant. On context switch the maps of an outgoing task are removed and the map of the incoming task are restored. That's obviously slow, but highmem is slow anyway. The kmap_local.*() functions can be invoked from both preemptible and atomic context. kmap local sections disable migration to keep the resulting virtual mapping address correct, but disable neither pagefaults nor preemption. A wholesale conversion of kmap_atomic to be fully preemptible is not possible because some of the usage sites might rely on the preemption disable for serialization or on the implicit pagefault disable. Needs to be done on a case by case basis. 
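A short usage sketch (hypothetical caller, consistent with the API introduced below): the local variants may be called from preemptible context and may nest, as long as the unmaps are done in reverse order of the maps:

	/* Sketch only; kmap_local_page()/kunmap_local() as added below. */
	static void copy_highpage_sketch(struct page *dst, struct page *src)
	{
		void *d = kmap_local_page(dst);
		void *s = kmap_local_page(src);

		memcpy(d, s, PAGE_SIZE);	/* preemption stays enabled */

		kunmap_local(s);		/* reverse order is mandatory */
		kunmap_local(d);
	}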
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.468533059@linutronix.de --- include/linux/highmem-internal.h | 48 ++++++++++++++++++++++++++++++++++++++++ include/linux/highmem.h | 43 +++++++++++++++++++++-------------- mm/highmem.c | 6 +++++ 3 files changed, 81 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index c5a22177db85..1bbe96dc8be6 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -68,6 +68,26 @@ static inline void kmap_flush_unused(void) __kmap_flush_unused(); } +static inline void *kmap_local_page(struct page *page) +{ + return __kmap_local_page_prot(page, kmap_prot); +} + +static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + return __kmap_local_page_prot(page, prot); +} + +static inline void *kmap_local_pfn(unsigned long pfn) +{ + return __kmap_local_pfn_prot(pfn, kmap_prot); +} + +static inline void __kunmap_local(void *vaddr) +{ + kunmap_local_indexed(vaddr); +} + static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { preempt_disable(); @@ -140,6 +160,28 @@ static inline void kunmap(struct page *page) #endif } +static inline void *kmap_local_page(struct page *page) +{ + return page_address(page); +} + +static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + return kmap_local_page(page); +} + +static inline void *kmap_local_pfn(unsigned long pfn) +{ + return kmap_local_page(pfn_to_page(pfn)); +} + +static inline void __kunmap_local(void *addr) +{ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); +#endif +} + static inline void *kmap_atomic(struct page *page) { preempt_disable(); @@ -181,4 +223,10 @@ do { \ __kunmap_atomic(__addr); \ } while (0) +#define kunmap_local(__addr) \ +do { \ + BUILD_BUG_ON(__same_type((__addr), struct page *)); \ + __kunmap_local(__addr); \ +} while (0) + #endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 7d098bd621f6..f597830f26b4 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -60,24 +60,22 @@ static inline struct page *kmap_to_page(void *addr); static inline void kmap_flush_unused(void); /** - * kmap_atomic - Atomically map a page for temporary usage + * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * - * Side effect: On return pagefaults and preemption are disabled. - * * Can be invoked from any context. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * - * addr1 = kmap_atomic(page1); - * addr2 = kmap_atomic(page2); + * addr1 = kmap_local_page(page1); + * addr2 = kmap_local_page(page2); * ... - * kunmap_atomic(addr2); - * kunmap_atomic(addr1); + * kunmap_local(addr2); + * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * @@ -88,10 +86,26 @@ static inline void kmap_flush_unused(void); * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * - * While it is significantly faster than kmap() it comes with restrictions - * about the pointer validity and the side effects of disabling page faults - * and preemption. Use it only when absolutely necessary, e.g. from non - * preemptible contexts. 
+ * While it is significantly faster than kmap() for the highmem case it + * comes with restrictions about the pointer validity. Only use when really + * necessary. + * + * On HIGHMEM enabled systems mapping a highmem page has the side effect of + * disabling migration in order to keep the virtual address stable across + * preemption. No caller of kmap_local_page() can rely on this side effect. + */ +static inline void *kmap_local_page(struct page *page); + +/** + * kmap_atomic - Atomically map a page for temporary usage - Deprecated! + * @page: Pointer to the page to be mapped + * + * Returns: The virtual address of the mapping + * + * Effectively a wrapper around kmap_local_page() which disables pagefaults + * and preemption. + * + * Do not use in new code. Use kmap_local_page() instead. */ static inline void *kmap_atomic(struct page *page); @@ -101,12 +115,9 @@ static inline void *kmap_atomic(struct page *page); * * Counterpart to kmap_atomic(). * - * Undoes the side effects of kmap_atomic(), i.e. reenabling pagefaults and + * Effectively a wrapper around kunmap_local() which additionally undoes + * the side effects of kmap_atomic(), i.e. reenabling pagefaults and * preemption. - * - * Other than that a NOOP for CONFIG_HIGHMEM=n and for mappings of pages - * in the low memory area. For real highmen pages the mapping which was - * established with kmap_atomic() is destroyed. */ /* Highmem related interfaces for management code */ diff --git a/mm/highmem.c b/mm/highmem.c index d1ef06aa6de6..83f9660f168f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -453,6 +453,11 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) unsigned long vaddr; int idx; + /* + * Disable migration so resulting virtual address is stable + * across preemption. + */ + migrate_disable(); preempt_disable(); idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -522,6 +527,7 @@ void kunmap_local_indexed(void *vaddr) current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); kmap_local_idx_pop(); preempt_enable(); + migrate_enable(); } EXPORT_SYMBOL(kunmap_local_indexed); -- cgit From e66f6e095486f0210fcf3c5eb3ecf13fa348be4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:45 +0100 Subject: io-mapping: Provide iomap_local variant Similar to kmap_local, provide an iomap_local variant which only disables migration, but neither disables pagefaults nor preemption. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.561220818@linutronix.de --- Documentation/driver-api/io-mapping.rst | 74 ++++++++++++++++++++------------- include/linux/io-mapping.h | 30 ++++++++++++- 2 files changed, 73 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/io-mapping.rst b/Documentation/driver-api/io-mapping.rst index e33b88268554..a0cfb15988df 100644 --- a/Documentation/driver-api/io-mapping.rst +++ b/Documentation/driver-api/io-mapping.rst @@ -20,55 +20,71 @@ A mapping object is created during driver initialization using:: mappable, while 'size' indicates how large a mapping region to enable. Both are in bytes. -This _wc variant provides a mapping which may only be used -with the io_mapping_map_atomic_wc or io_mapping_map_wc. +This _wc variant provides a mapping which may only be used with +io_mapping_map_atomic_wc(), io_mapping_map_local_wc() or +io_mapping_map_wc().
-With this mapping object, individual pages can be mapped either atomically -or not, depending on the necessary scheduling environment. Of course, atomic -maps are more efficient:: +With this mapping object, individual pages can be mapped either temporarily +or long term, depending on the requirements. Of course, temporary maps are +more efficient. They come in two flavours:: + + void *io_mapping_map_local_wc(struct io_mapping *mapping, + unsigned long offset) void *io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) -'offset' is the offset within the defined mapping region. -Accessing addresses beyond the region specified in the -creation function yields undefined results. Using an offset -which is not page aligned yields an undefined result. The -return value points to a single page in CPU address space. +'offset' is the offset within the defined mapping region. Accessing +addresses beyond the region specified in the creation function yields +undefined results. Using an offset which is not page aligned yields an +undefined result. The return value points to a single page in CPU address +space. -This _wc variant returns a write-combining map to the -page and may only be used with mappings created by -io_mapping_create_wc +This _wc variant returns a write-combining map to the page and may only be +used with mappings created by io_mapping_create_wc() -Note that the task may not sleep while holding this page -mapped. +Temporary mappings are only valid in the context of the caller. The mapping +is not guaranteed to be globally visible. -:: +io_mapping_map_local_wc() has a side effect on X86 32bit as it disables +migration to make the mapping code work. No caller can rely on this side +effect. - void io_mapping_unmap_atomic(void *vaddr) +io_mapping_map_atomic_wc() has the side effect of disabling preemption and +pagefaults. Don't use in new code. Use io_mapping_map_local_wc() instead. -'vaddr' must be the value returned by the last -io_mapping_map_atomic_wc call. This unmaps the specified -page and allows the task to sleep once again. +Nested mappings need to be undone in reverse order because the mapping +code uses a stack for keeping track of them:: -If you need to sleep while holding the lock, you can use the non-atomic -variant, although they may be significantly slower. + addr1 = io_mapping_map_local_wc(map1, offset1); + addr2 = io_mapping_map_local_wc(map2, offset2); + ... + io_mapping_unmap_local(addr2); + io_mapping_unmap_local(addr1); -:: +The mappings are released with:: + + void io_mapping_unmap_local(void *vaddr) + void io_mapping_unmap_atomic(void *vaddr) + +'vaddr' must be the value returned by the last io_mapping_map_local_wc() or +io_mapping_map_atomic_wc() call. This unmaps the specified mapping and +undoes the side effects of the mapping functions. + +If you need to sleep while holding a mapping, you can use the regular +variant, although this may be significantly slower:: void *io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) -This works like io_mapping_map_atomic_wc except it allows -the task to sleep while holding the page mapped. - +This works like io_mapping_map_atomic/local_wc() except it has no side +effects and the pointer is globally visible. -:: +The mappings are released with:: void io_mapping_unmap(void *vaddr) -This works like io_mapping_unmap_atomic, except it is used -for pages mapped with io_mapping_map_wc. +Use for pages mapped with io_mapping_map_wc().
At driver close time, the io_mapping object must be freed:: diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 60e7c83e4904..c093e81310a9 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -82,6 +82,21 @@ io_mapping_unmap_atomic(void __iomem *vaddr) preempt_enable(); } +static inline void __iomem * +io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) +{ + resource_size_t phys_addr; + + BUG_ON(offset >= mapping->size); + phys_addr = mapping->base + offset; + return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); +} + +static inline void io_mapping_unmap_local(void __iomem *vaddr) +{ + kunmap_local_indexed((void __force *)vaddr); +} + static inline void __iomem * io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset, @@ -101,7 +116,7 @@ io_mapping_unmap(void __iomem *vaddr) iounmap(vaddr); } -#else +#else /* HAVE_ATOMIC_IOMAP */ #include @@ -166,7 +181,18 @@ io_mapping_unmap_atomic(void __iomem *vaddr) preempt_enable(); } -#endif /* HAVE_ATOMIC_IOMAP */ +static inline void __iomem * +io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) +{ + return io_mapping_map_wc(mapping, offset, PAGE_SIZE); +} + +static inline void io_mapping_unmap_local(void __iomem *vaddr) +{ + io_mapping_unmap(vaddr); +} + +#endif /* !HAVE_ATOMIC_IOMAP */ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, -- cgit From acfdd18591eaac25446e976a0c0d190f8b3dbfb1 Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Mon, 23 Nov 2020 21:52:41 -0800 Subject: firmware: xilinx: Use hash-table for api feature check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently an array of fixed length PM_API_MAX is used to cache the pm_api version (valid or invalid). However, ATF-based PM API values are much higher than PM_API_MAX. So, to also cover ATF-based PM APIs, use a hash table to store the pm_api version status.
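The replacement uses the standard <linux/hashtable.h> idiom; stripped to its core, the lookup-or-insert pattern in the diff below looks roughly like this (hypothetical, simplified sketch):

	static DEFINE_HASHTABLE(demo_map, 7);	/* 1 << 7 = 128 buckets */

	struct demo_entry {
		u32 key;
		int val;
		struct hlist_node hentry;
	};

	static int demo_lookup_or_add(u32 key, int val)
	{
		struct demo_entry *e;

		/* only walks the one bucket that 'key' hashes into */
		hash_for_each_possible(demo_map, e, hentry, key)
			if (e->key == key)
				return e->val;	/* cached result */

		e = kmalloc(sizeof(*e), GFP_KERNEL);
		if (!e)
			return -ENOMEM;
		e->key = key;
		e->val = val;
		hash_add(demo_map, &e->hentry, key);
		return val;
	}

Sparse keys (such as the ATF SMC ranges) thus need no dense array sized by the largest possible id.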
Signed-off-by: Amit Sunil Dhamne Reported-by: Arnd Bergmann  Signed-off-by: Ravi Patel Signed-off-by: Rajan Vaja Reviewed-by: Arnd Bergmann Tested-by: Michal Simek Fixes: f3217d6f2f7a ("firmware: xilinx: fix out-of-bounds access") Cc: stable Link: https://lore.kernel.org/r/1606197161-25976-1-git-send-email-rajan.vaja@xilinx.com Signed-off-by: Michal Simek --- drivers/firmware/xilinx/zynqmp.c | 63 ++++++++++++++++++++++++++++-------- include/linux/firmware/xlnx-zynqmp.h | 4 --- 2 files changed, 49 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 5828302c7c5c..d08ac824c993 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -20,12 +20,28 @@ #include #include #include +#include #include #include "zynqmp-debug.h" +/* Max HashMap Order for PM API feature check (1<<7 = 128) */ +#define PM_API_FEATURE_CHECK_MAX_ORDER 7 + static bool feature_check_enabled; -static u32 zynqmp_pm_features[PM_API_MAX]; +DEFINE_HASHTABLE(pm_api_features_map, PM_API_FEATURE_CHECK_MAX_ORDER); + +/** + * struct pm_api_feature_data - PM API Feature data + * @pm_api_id: PM API Id, used as key to index into hashmap + * @feature_status: status of PM API feature: valid, invalid + * @hentry: hlist_node that hooks this entry into hashtable + */ +struct pm_api_feature_data { + u32 pm_api_id; + int feature_status; + struct hlist_node hentry; +}; static const struct mfd_cell firmware_devs[] = { { @@ -142,29 +158,37 @@ static int zynqmp_pm_feature(u32 api_id) int ret; u32 ret_payload[PAYLOAD_ARG_CNT]; u64 smc_arg[2]; + struct pm_api_feature_data *feature_data; if (!feature_check_enabled) return 0; - /* Return value if feature is already checked */ - if (api_id > ARRAY_SIZE(zynqmp_pm_features)) - return PM_FEATURE_INVALID; + /* Check for existing entry in hash table for given api */ + hash_for_each_possible(pm_api_features_map, feature_data, hentry, + api_id) { + if (feature_data->pm_api_id == api_id) + return feature_data->feature_status; + } - if (zynqmp_pm_features[api_id] != PM_FEATURE_UNCHECKED) - return zynqmp_pm_features[api_id]; + /* Add new entry if not present */ + feature_data = kmalloc(sizeof(*feature_data), GFP_KERNEL); + if (!feature_data) + return -ENOMEM; + feature_data->pm_api_id = api_id; smc_arg[0] = PM_SIP_SVC | PM_FEATURE_CHECK; smc_arg[1] = api_id; ret = do_fw_call(smc_arg[0], smc_arg[1], 0, ret_payload); - if (ret) { - zynqmp_pm_features[api_id] = PM_FEATURE_INVALID; - return PM_FEATURE_INVALID; - } + if (ret) + ret = -EOPNOTSUPP; + else + ret = ret_payload[1]; - zynqmp_pm_features[api_id] = ret_payload[1]; + feature_data->feature_status = ret; + hash_add(pm_api_features_map, &feature_data->hentry, api_id); - return zynqmp_pm_features[api_id]; + return ret; } /** @@ -200,9 +224,12 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, * Make sure to stay in x0 register */ u64 smc_arg[4]; + int ret; - if (zynqmp_pm_feature(pm_api_id) == PM_FEATURE_INVALID) - return -ENOTSUPP; + /* Check if feature is supported or not */ + ret = zynqmp_pm_feature(pm_api_id); + if (ret < 0) + return ret; smc_arg[0] = PM_SIP_SVC | pm_api_id; smc_arg[1] = ((u64)arg1 << 32) | arg0; @@ -1252,9 +1279,17 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) static int zynqmp_firmware_remove(struct platform_device *pdev) { + struct pm_api_feature_data *feature_data; + int i; + mfd_remove_devices(&pdev->dev); zynqmp_pm_api_debugfs_exit(); + hash_for_each(pm_api_features_map, i, 
feature_data, hentry) { + hash_del(&feature_data->hentry); + kfree(feature_data); + } + return 0; } diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 5968df82b991..41a1bab98b7e 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -50,10 +50,6 @@ #define ZYNQMP_PM_CAPABILITY_WAKEUP 0x4U #define ZYNQMP_PM_CAPABILITY_UNUSABLE 0x8U -/* Feature check status */ -#define PM_FEATURE_INVALID -1 -#define PM_FEATURE_UNCHECKED 0 - /* * Firmware FPGA Manager flags * XILINX_ZYNQMP_PM_FPGA_FULL: FPGA full reconfiguration -- cgit From 7a9f50a05843fee8366bd3a65addbebaa7cf7f07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2020 11:51:29 +0200 Subject: irq_work: Cleanup Get rid of the __call_single_node union and clean up the API a little to avoid external code relying on the structure layout as much. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- drivers/gpu/drm/i915/i915_request.c | 4 ++-- include/linux/irq_work.h | 33 +++++++++++++++++++++------------ include/linux/irqflags.h | 4 ++-- kernel/bpf/stackmap.c | 2 +- kernel/irq_work.c | 18 +++++++++--------- kernel/printk/printk.c | 6 ++---- kernel/rcu/tree.c | 3 +-- kernel/time/tick-sched.c | 6 ++---- kernel/trace/bpf_trace.c | 2 +- 9 files changed, 41 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 0e813819b041..5385b081a376 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -197,7 +197,7 @@ __notify_execute_cb(struct i915_request *rq, bool (*fn)(struct irq_work *wrk)) llist_for_each_entry_safe(cb, cn, llist_del_all(&rq->execute_cb), - work.llnode) + work.node.llist) fn(&cb->work); } @@ -460,7 +460,7 @@ __await_execution(struct i915_request *rq, * callback first, then checking the ACTIVE bit, we serialise with * the completed/retired request. 
*/ - if (llist_add(&cb->work.llnode, &signal->execute_cb)) { + if (llist_add(&cb->work.node.llist, &signal->execute_cb)) { if (i915_request_is_active(signal) || __request_in_flight(signal)) __notify_execute_cb_imm(signal); diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 30823780c192..ec2a47a81e42 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -14,28 +14,37 @@ */ struct irq_work { - union { - struct __call_single_node node; - struct { - struct llist_node llnode; - atomic_t flags; - }; - }; + struct __call_single_node node; void (*func)(struct irq_work *); }; +#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ + .node = { .u_flags = (_flags), }, \ + .func = (_func), \ +} + +#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) +#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY) +#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ) + +#define DEFINE_IRQ_WORK(name, _f) \ + struct irq_work name = IRQ_WORK_INIT(_f) + static inline void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) { - atomic_set(&work->flags, 0); - work->func = func; + *work = IRQ_WORK_INIT(func); } -#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \ - .flags = ATOMIC_INIT(0), \ - .func = (_f) \ +static inline bool irq_work_is_pending(struct irq_work *work) +{ + return atomic_read(&work->node.a_flags) & IRQ_WORK_PENDING; } +static inline bool irq_work_is_busy(struct irq_work *work) +{ + return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; +} bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 3ed4e8771b64..fef2d43a7a1d 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -109,12 +109,12 @@ do { \ # define lockdep_irq_work_enter(__work) \ do { \ - if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\ + if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\ current->irq_config = 1; \ } while (0) # define lockdep_irq_work_exit(__work) \ do { \ - if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\ + if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\ current->irq_config = 0; \ } while (0) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 06065fa27124..599041cd0c8a 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -298,7 +298,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, if (irqs_disabled()) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { work = this_cpu_ptr(&up_read_work); - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) { + if (irq_work_is_busy(&work->irq_work)) { /* cannot queue more up_read, fallback */ irq_work_busy = true; } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index eca83965b631..fbff25adb574 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work) { int oflags; - oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); /* * If the work is already pending, no need to raise the IPI. 
* The pairing atomic_fetch_andnot() in irq_work_run() makes sure @@ -53,12 +53,12 @@ void __weak arch_irq_work_raise(void) static void __irq_work_queue_local(struct irq_work *work) { /* If the work is "lazy", handle it from next tick if any */ - if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && + if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { + if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); } else { - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) + if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) arch_irq_work_raise(); } } @@ -102,7 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); - __smp_call_single_queue(cpu, &work->llnode); + __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } @@ -142,7 +142,7 @@ void irq_work_single(void *arg) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->node.a_flags); lockdep_irq_work_enter(work); work->func(work); @@ -152,7 +152,7 @@ void irq_work_single(void *arg) * no-one else claimed it meanwhile. */ flags &= ~IRQ_WORK_PENDING; - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); + (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); } static void irq_work_run_list(struct llist_head *list) @@ -166,7 +166,7 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry_safe(work, tmp, llnode, llnode) + llist_for_each_entry_safe(work, tmp, llnode, node.llist) irq_work_single(work); } @@ -198,7 +198,7 @@ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); - while (atomic_read(&work->flags) & IRQ_WORK_BUSY) + while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fe64a49344bf..9ef23d4b07c7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3025,10 +3025,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) wake_up_interruptible(&log_wait); } -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - .func = wake_up_klogd_work_func, - .flags = ATOMIC_INIT(IRQ_WORK_LAZY), -}; +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = + IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); void wake_up_klogd(void) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06895ef85d69..a41e84f1b55a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1311,8 +1311,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { - init_irq_work(&rdp->rcu_iw, rcu_iw_handler); - atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); rdp->rcu_iw_pending = true; rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); @@ -3964,6 +3962,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->cpu_no_qs.b.norm = true; rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; + rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler); rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, 
rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 81632cd5e3b7..1b734070f028 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -243,10 +243,8 @@ static void nohz_full_kick_func(struct irq_work *work) /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } -static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { - .func = nohz_full_kick_func, - .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ), -}; +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = + IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..a6903912f7a0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1086,7 +1086,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) return -EINVAL; work = this_cpu_ptr(&send_signal_work); - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) + if (irq_work_is_busy(&work->irq_work)) return -EBUSY; /* Add the current task, which is the target of sending signal, -- cgit From 545b8c8df41f9ecbaf806332d4095bc4bc7c14e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2020 11:29:31 +0200 Subject: smp: Cleanup smp_call_function*() Get rid of the __call_single_node union and cleanup the API a little to avoid external code relying on the structure layout as much. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- arch/mips/kernel/process.c | 5 ++- arch/mips/kernel/smp.c | 25 +++---------- arch/s390/pci/pci_irq.c | 4 +- arch/x86/kernel/cpuid.c | 7 ++-- arch/x86/lib/msr-smp.c | 7 ++-- block/blk-mq.c | 4 +- drivers/cpuidle/coupled.c | 3 +- drivers/net/ethernet/cavium/liquidio/lio_core.c | 9 +---- include/linux/smp.h | 19 +++++----- kernel/debug/debug_core.c | 6 +-- kernel/sched/core.c | 12 +----- kernel/smp.c | 50 ++++++++++++------------- net/core/dev.c | 3 +- 13 files changed, 60 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 75ebd8d7bd5d..d7e288f3a1e7 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -702,7 +702,6 @@ unsigned long arch_align_stack(unsigned long sp) return sp & ALMASK; } -static DEFINE_PER_CPU(call_single_data_t, backtrace_csd); static struct cpumask backtrace_csd_busy; static void handle_backtrace(void *info) @@ -711,6 +710,9 @@ static void handle_backtrace(void *info) cpumask_clear_cpu(smp_processor_id(), &backtrace_csd_busy); } +static DEFINE_PER_CPU(call_single_data_t, backtrace_csd) = + CSD_INIT(handle_backtrace, NULL); + static void raise_backtrace(cpumask_t *mask) { call_single_data_t *csd; @@ -730,7 +732,6 @@ static void raise_backtrace(cpumask_t *mask) } csd = &per_cpu(backtrace_csd, cpu); - csd->func = handle_backtrace; smp_call_function_single_async(cpu, csd); } } diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 48d84d5fcc36..74b9102fd06e 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -687,36 +687,23 @@ EXPORT_SYMBOL(flush_tlb_one); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST -static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd); - -void tick_broadcast(const struct cpumask *mask) -{ - call_single_data_t *csd; - int cpu; - - for_each_cpu(cpu, mask) { - csd = &per_cpu(tick_broadcast_csd, cpu); - smp_call_function_single_async(cpu, csd); - } -} - static void 
tick_broadcast_callee(void *info) { tick_receive_broadcast(); } -static int __init tick_broadcast_init(void) +static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd) = + CSD_INIT(tick_broadcast_callee, NULL); + +void tick_broadcast(const struct cpumask *mask) { call_single_data_t *csd; int cpu; - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_cpu(cpu, mask) { csd = &per_cpu(tick_broadcast_csd, cpu); - csd->func = tick_broadcast_callee; + smp_call_function_single_async(cpu, csd); } - - return 0; } -early_initcall(tick_broadcast_init); #endif /* CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 743f257cf2cb..1311b6f9d6dd 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -178,9 +178,7 @@ static void zpci_handle_fallback_irq(void) if (atomic_inc_return(&cpu_data->scheduled) > 1) continue; - cpu_data->csd.func = zpci_handle_remote_irq; - cpu_data->csd.info = &cpu_data->scheduled; - cpu_data->csd.flags = 0; + INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled); smp_call_function_single_async(cpu, &cpu_data->csd); } } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 3492aa36bf09..6f7b8cc1bc9f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, init_completion(&cmd.done); for (; count; count -= 16) { - call_single_data_t csd = { - .func = cpuid_smp_cpuid, - .info = &cmd, - }; + call_single_data_t csd; + + INIT_CSD(&csd, cpuid_smp_cpuid, &cmd); cmd.regs.eax = pos; cmd.regs.ecx = pos >> 32; diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index fee8b9c0520c..75a0915b0d01 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -169,12 +169,11 @@ static void __wrmsr_safe_on_cpu(void *info) int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { struct msr_info_completion rv; - call_single_data_t csd = { - .func = __rdmsr_safe_on_cpu, - .info = &rv, - }; + call_single_data_t csd; int err; + INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv); + memset(&rv, 0, sizeof(rv)); init_completion(&rv.done); rv.msr.msr_no = msr_no; diff --git a/block/blk-mq.c b/block/blk-mq.c index 55bcee5dc032..d35b3c0c876a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -671,9 +671,7 @@ bool blk_mq_complete_request_remote(struct request *rq) return false; if (blk_mq_complete_need_ipi(rq)) { - rq->csd.func = __blk_mq_complete_request_remote; - rq->csd.info = rq; - rq->csd.flags = 0; + INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); } else { if (rq->q->nr_hw_queues > 1) diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index 04003b90dc49..74068742cef3 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -674,8 +674,7 @@ have_coupled: coupled->refcnt++; csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu); - csd->func = cpuidle_coupled_handle_poke; - csd->info = (void *)(unsigned long)dev->cpu; + INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu); return 0; } diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c index 9ef172976b35..37d064193f0f 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_core.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c @@ -729,13 +729,8 @@ static void liquidio_napi_drv_callback(void *arg) droq->cpu_id == this_cpu) { napi_schedule_irqoff(&droq->napi); } else { 
- call_single_data_t *csd = &droq->csd; - - csd->func = napi_schedule_wrapper; - csd->info = &droq->napi; - csd->flags = 0; - - smp_call_function_single_async(droq->cpu_id, csd); + INIT_CSD(&droq->csd, napi_schedule_wrapper, &droq->napi); + smp_call_function_single_async(droq->cpu_id, &droq->csd); } } diff --git a/include/linux/smp.h b/include/linux/smp.h index 9f13966d3d92..70c6f6284dcf 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -21,24 +21,23 @@ typedef bool (*smp_cond_func_t)(int cpu, void *info); * structure shares (partial) layout with struct irq_work */ struct __call_single_data { - union { - struct __call_single_node node; - struct { - struct llist_node llist; - unsigned int flags; -#ifdef CONFIG_64BIT - u16 src, dst; -#endif - }; - }; + struct __call_single_node node; smp_call_func_t func; void *info; }; +#define CSD_INIT(_func, _info) \ + (struct __call_single_data){ .func = (_func), .info = (_info), } + /* Use __aligned() to avoid to use 2 cache lines for 1 csd */ typedef struct __call_single_data call_single_data_t __aligned(sizeof(struct __call_single_data)); +#define INIT_CSD(_csd, _func, _info) \ +do { \ + *(_csd) = CSD_INIT((_func), (_info)); \ +} while (0) + /* * Enqueue a llist_node on the call_single_queue; be very careful, read * flush_smp_call_function_queue() in detail. diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1e75a8923a8d..af6e8b4fb359 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -225,8 +225,6 @@ NOKPROBE_SYMBOL(kgdb_skipexception); * Default (weak) implementation for kgdb_roundup_cpus */ -static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd); - void __weak kgdb_call_nmi_hook(void *ignored) { /* @@ -241,6 +239,9 @@ void __weak kgdb_call_nmi_hook(void *ignored) } NOKPROBE_SYMBOL(kgdb_call_nmi_hook); +static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) = + CSD_INIT(kgdb_call_nmi_hook, NULL); + void __weak kgdb_roundup_cpus(void) { call_single_data_t *csd; @@ -267,7 +268,6 @@ void __weak kgdb_roundup_cpus(void) continue; kgdb_info[cpu].rounding_up = true; - csd->func = kgdb_call_nmi_hook; ret = smp_call_function_single_async(cpu, csd); if (ret) kgdb_info[cpu].rounding_up = false; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c962922784d1..b943b459b77a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -320,14 +320,6 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } -static inline void -rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) -{ - csd->flags = 0; - csd->func = func; - csd->info = rq; -} - #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. 
@@ -428,7 +420,7 @@ void hrtick_start(struct rq *rq, u64 delay) static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); + INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; @@ -7774,7 +7766,7 @@ void __init sched_init(void) rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); - rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); #endif #ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); diff --git a/kernel/smp.c b/kernel/smp.c index 4d17501433be..faf1a3ace6a9 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -27,7 +27,7 @@ #include "smpboot.h" #include "sched/smp.h" -#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK) +#define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; @@ -146,7 +146,7 @@ static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 t bool firsttime; u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; - unsigned int flags = READ_ONCE(csd->flags); + unsigned int flags = READ_ONCE(csd->node.u_flags); if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) @@ -224,14 +224,14 @@ static void csd_lock_record(call_single_data_t *csd) static __always_inline void csd_lock_wait(call_single_data_t *csd) { - smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); + smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); - csd->flags |= CSD_FLAG_LOCK; + csd->node.u_flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment @@ -243,12 +243,12 @@ static __always_inline void csd_lock(call_single_data_t *csd) static __always_inline void csd_unlock(call_single_data_t *csd) { - WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); + WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ - smp_store_release(&csd->flags, 0); + smp_store_release(&csd->node.u_flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); @@ -300,7 +300,7 @@ static int generic_exec_single(int cpu, call_single_data_t *csd) return -ENXIO; } - __smp_call_single_queue(cpu, &csd->llist); + __smp_call_single_queue(cpu, &csd->node.llist); return 0; } @@ -353,7 +353,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ - llist_for_each_entry(csd, entry, llist) { + llist_for_each_entry(csd, entry, node.llist) { switch (CSD_TYPE(csd)) { case CSD_TYPE_ASYNC: case CSD_TYPE_SYNC: @@ -378,16 +378,16 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * First; run all SYNC callbacks, people are waiting for us. */ prev = NULL; - llist_for_each_entry_safe(csd, csd_next, entry, llist) { + llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { /* Do we wait until *after* callback? 
*/ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { smp_call_func_t func = csd->func; void *info = csd->info; if (prev) { - prev->next = &csd_next->llist; + prev->next = &csd_next->node.llist; } else { - entry = &csd_next->llist; + entry = &csd_next->node.llist; } csd_lock_record(csd); @@ -395,7 +395,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd_unlock(csd); csd_lock_record(NULL); } else { - prev = &csd->llist; + prev = &csd->node.llist; } } @@ -406,14 +406,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * Second; run all !SYNC callbacks. */ prev = NULL; - llist_for_each_entry_safe(csd, csd_next, entry, llist) { + llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { int type = CSD_TYPE(csd); if (type != CSD_TYPE_TTWU) { if (prev) { - prev->next = &csd_next->llist; + prev->next = &csd_next->node.llist; } else { - entry = &csd_next->llist; + entry = &csd_next->node.llist; } if (type == CSD_TYPE_ASYNC) { @@ -429,7 +429,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } } else { - prev = &csd->llist; + prev = &csd->node.llist; } } @@ -465,7 +465,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, { call_single_data_t *csd; call_single_data_t csd_stack = { - .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, + .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, }; int this_cpu; int err; @@ -502,8 +502,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - csd->src = smp_processor_id(); - csd->dst = cpu; + csd->node.src = smp_processor_id(); + csd->node.dst = cpu; #endif err = generic_exec_single(cpu, csd); @@ -544,12 +544,12 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) preempt_disable(); - if (csd->flags & CSD_FLAG_LOCK) { + if (csd->node.u_flags & CSD_FLAG_LOCK) { err = -EBUSY; goto out; } - csd->flags = CSD_FLAG_LOCK; + csd->node.u_flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd); @@ -667,14 +667,14 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd_lock(csd); if (wait) - csd->flags |= CSD_TYPE_SYNC; + csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - csd->src = smp_processor_id(); - csd->dst = cpu; + csd->node.src = smp_processor_id(); + csd->node.dst = cpu; #endif - if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) + if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } diff --git a/net/core/dev.c b/net/core/dev.c index 82dc6b48e45f..57352605f82c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11165,8 +11165,7 @@ static int __init net_dev_init(void) INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS - sd->csd.func = rps_trigger_softirq; - sd->csd.info = sd; + INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif -- cgit From 2914b0ba61a9d253535e51af16c7122a8148995d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jun 2020 22:28:37 +0200 Subject: irq_work: Optimize irq_work_single() Trade one atomic op for a full memory barrier. 
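A minimal sketch of the idea (illustrative only, reusing the flag and field
names from kernel/irq_work.c; not part of the patch): because the PENDING bit
acts as a lock owned by the CPU running the work, it can be cleared with a
plain read/modify/store plus one explicit full barrier, instead of the atomic
RMW used before:

#include <linux/irq_work.h>
#include <linux/atomic.h>
#include <linux/smp.h>

/* before: one atomic RMW, which also implied full ordering */
static void demo_clear_pending_old(struct irq_work *work)
{
	atomic_fetch_andnot(IRQ_WORK_PENDING, &work->node.a_flags);
}

/* after: plain read + store; we own PENDING, nobody else writes it */
static void demo_clear_pending_new(struct irq_work *work)
{
	int flags = atomic_read(&work->node.a_flags);

	flags &= ~IRQ_WORK_PENDING;
	atomic_set(&work->node.a_flags, flags);

	/* pairs with the atomic_fetch_or() in irq_work_claim() */
	smp_mb();
}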
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- include/linux/irqflags.h | 8 ++++---- kernel/irq_work.c | 29 +++++++++++++++++------------ 2 files changed, 21 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index fef2d43a7a1d..8de0e1373de7 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -107,14 +107,14 @@ do { \ current->irq_config = 0; \ } while (0) -# define lockdep_irq_work_enter(__work) \ +# define lockdep_irq_work_enter(_flags) \ do { \ - if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\ + if (!((_flags) & IRQ_WORK_HARD_IRQ)) \ current->irq_config = 1; \ } while (0) -# define lockdep_irq_work_exit(__work) \ +# define lockdep_irq_work_exit(_flags) \ do { \ - if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\ + if (!((_flags) & IRQ_WORK_HARD_IRQ)) \ current->irq_config = 0; \ } while (0) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index fbff25adb574..e8da1e71583a 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_work *work) oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); /* * If the work is already pending, no need to raise the IPI. - * The pairing atomic_fetch_andnot() in irq_work_run() makes sure + * The pairing smp_mb() in irq_work_single() makes sure * everything we did before is visible. */ if (oflags & IRQ_WORK_PENDING) @@ -136,22 +136,27 @@ void irq_work_single(void *arg) int flags; /* - * Clear the PENDING bit, after this point the @work - * can be re-used. - * Make it immediately visible so that other CPUs trying - * to claim that work don't rely on us to handle their data - * while we are in the middle of the func. + * Clear the PENDING bit, after this point the @work can be re-used. + * The PENDING bit acts as a lock, and we own it, so we can clear it + * without atomic ops. */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->node.a_flags); + flags = atomic_read(&work->node.a_flags); + flags &= ~IRQ_WORK_PENDING; + atomic_set(&work->node.a_flags, flags); + + /* + * See irq_work_claim(). + */ + smp_mb(); - lockdep_irq_work_enter(work); + lockdep_irq_work_enter(flags); work->func(work); - lockdep_irq_work_exit(work); + lockdep_irq_work_exit(flags); + /* - * Clear the BUSY bit and return to the free state if - * no-one else claimed it meanwhile. + * Clear the BUSY bit, if set, and return to the free state if no-one + * else claimed it meanwhile. */ - flags &= ~IRQ_WORK_PENDING; (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); } -- cgit From 6bef038011a023db41f1b33f0776224729d52344 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 20 Nov 2020 14:42:38 -0700 Subject: rpmsg: Introduce __rpmsg{16|32|64} types Introduce __rpmsg{16|32|64} types along with byte order conversion functions based on an rpmsg_device operation as a foundation to make RPMSG modular and transport agnostic. 
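For illustration, a hypothetical receive callback (demo_hdr, demo_cb and the
field layout are invented for this sketch, not part of the patch) that parses
a wire-endian header with the new helpers; rpdev->little_endian selects the
conversion at runtime, so the same driver code works on little- and big-endian
transports:

#include <linux/rpmsg.h>

/* hypothetical on-wire header, stored in the transport's byte order */
struct demo_hdr {
	__rpmsg32 addr;
	__rpmsg16 len;
} __packed;

static int demo_cb(struct rpmsg_device *rpdev, void *data, int len,
		   void *priv, u32 src)
{
	struct demo_hdr *hdr = data;

	if (len < sizeof(*hdr))
		return -EINVAL;

	/* conversion helpers pick le/be based on the rpmsg device */
	dev_dbg(&rpdev->dev, "addr 0x%x len %u\n",
		rpmsg32_to_cpu(rpdev, hdr->addr),
		rpmsg16_to_cpu(rpdev, hdr->len));

	return 0;
}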
Tested-by: Guennadi Liakhovetski Suggested-by: Guennadi Liakhovetski Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Reviewed-by: Guennadi Liakhovetski Link: https://lore.kernel.org/r/20201120214245.172963-2-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/rpmsg.h | 51 ++++++++++++++++++++++++++++++ include/linux/rpmsg/byteorder.h | 67 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/rpmsg_types.h | 11 +++++++ 3 files changed, 129 insertions(+) create mode 100644 include/linux/rpmsg/byteorder.h create mode 100644 include/uapi/linux/rpmsg_types.h (limited to 'include/linux') diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index 9fe156d1c018..faf2daff6238 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -17,6 +17,7 @@ #include #include #include +#include #define RPMSG_ADDR_ANY 0xFFFFFFFF @@ -46,6 +47,7 @@ struct rpmsg_channel_info { * @dst: destination address * @ept: the rpmsg endpoint of this channel * @announce: if set, rpmsg will announce the creation/removal of this channel + * @little_endian: True if transport is using little endian byte representation */ struct rpmsg_device { struct device dev; @@ -55,6 +57,7 @@ struct rpmsg_device { u32 dst; struct rpmsg_endpoint *ept; bool announce; + bool little_endian; const struct rpmsg_device_ops *ops; }; @@ -111,6 +114,54 @@ struct rpmsg_driver { int (*callback)(struct rpmsg_device *, void *, int, void *, u32); }; +static inline u16 rpmsg16_to_cpu(struct rpmsg_device *rpdev, __rpmsg16 val) +{ + if (!rpdev) + return __rpmsg16_to_cpu(rpmsg_is_little_endian(), val); + else + return __rpmsg16_to_cpu(rpdev->little_endian, val); +} + +static inline __rpmsg16 cpu_to_rpmsg16(struct rpmsg_device *rpdev, u16 val) +{ + if (!rpdev) + return __cpu_to_rpmsg16(rpmsg_is_little_endian(), val); + else + return __cpu_to_rpmsg16(rpdev->little_endian, val); +} + +static inline u32 rpmsg32_to_cpu(struct rpmsg_device *rpdev, __rpmsg32 val) +{ + if (!rpdev) + return __rpmsg32_to_cpu(rpmsg_is_little_endian(), val); + else + return __rpmsg32_to_cpu(rpdev->little_endian, val); +} + +static inline __rpmsg32 cpu_to_rpmsg32(struct rpmsg_device *rpdev, u32 val) +{ + if (!rpdev) + return __cpu_to_rpmsg32(rpmsg_is_little_endian(), val); + else + return __cpu_to_rpmsg32(rpdev->little_endian, val); +} + +static inline u64 rpmsg64_to_cpu(struct rpmsg_device *rpdev, __rpmsg64 val) +{ + if (!rpdev) + return __rpmsg64_to_cpu(rpmsg_is_little_endian(), val); + else + return __rpmsg64_to_cpu(rpdev->little_endian, val); +} + +static inline __rpmsg64 cpu_to_rpmsg64(struct rpmsg_device *rpdev, u64 val) +{ + if (!rpdev) + return __cpu_to_rpmsg64(rpmsg_is_little_endian(), val); + else + return __cpu_to_rpmsg64(rpdev->little_endian, val); +} + #if IS_ENABLED(CONFIG_RPMSG) int register_rpmsg_device(struct rpmsg_device *dev); diff --git a/include/linux/rpmsg/byteorder.h b/include/linux/rpmsg/byteorder.h new file mode 100644 index 000000000000..c0f565dbad6d --- /dev/null +++ b/include/linux/rpmsg/byteorder.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Follows implementation found in linux/virtio_byteorder.h + */ +#ifndef _LINUX_RPMSG_BYTEORDER_H +#define _LINUX_RPMSG_BYTEORDER_H +#include +#include + +static inline bool rpmsg_is_little_endian(void) +{ +#ifdef __LITTLE_ENDIAN + return true; +#else + return false; +#endif +} + +static inline u16 __rpmsg16_to_cpu(bool little_endian, __rpmsg16 val) +{ + if (little_endian) + return le16_to_cpu((__force __le16)val); + else + return be16_to_cpu((__force 
__be16)val); +} + +static inline __rpmsg16 __cpu_to_rpmsg16(bool little_endian, u16 val) +{ + if (little_endian) + return (__force __rpmsg16)cpu_to_le16(val); + else + return (__force __rpmsg16)cpu_to_be16(val); +} + +static inline u32 __rpmsg32_to_cpu(bool little_endian, __rpmsg32 val) +{ + if (little_endian) + return le32_to_cpu((__force __le32)val); + else + return be32_to_cpu((__force __be32)val); +} + +static inline __rpmsg32 __cpu_to_rpmsg32(bool little_endian, u32 val) +{ + if (little_endian) + return (__force __rpmsg32)cpu_to_le32(val); + else + return (__force __rpmsg32)cpu_to_be32(val); +} + +static inline u64 __rpmsg64_to_cpu(bool little_endian, __rpmsg64 val) +{ + if (little_endian) + return le64_to_cpu((__force __le64)val); + else + return be64_to_cpu((__force __be64)val); +} + +static inline __rpmsg64 __cpu_to_rpmsg64(bool little_endian, u64 val) +{ + if (little_endian) + return (__force __rpmsg64)cpu_to_le64(val); + else + return (__force __rpmsg64)cpu_to_be64(val); +} + +#endif /* _LINUX_RPMSG_BYTEORDER_H */ diff --git a/include/uapi/linux/rpmsg_types.h b/include/uapi/linux/rpmsg_types.h new file mode 100644 index 000000000000..36e3b9404391 --- /dev/null +++ b/include/uapi/linux/rpmsg_types.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_RPMSG_TYPES_H +#define _UAPI_LINUX_RPMSG_TYPES_H + +#include + +typedef __u16 __bitwise __rpmsg16; +typedef __u32 __bitwise __rpmsg32; +typedef __u64 __bitwise __rpmsg64; + +#endif /* _UAPI_LINUX_RPMSG_TYPES_H */ -- cgit From c435a04189de372ba9ae72076b18185a884108d6 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 20 Nov 2020 14:42:40 -0700 Subject: rpmsg: Move structure rpmsg_ns_msg to header file Move structure rpmsg_ns_msg to its own header file so that it can be used by other entities. Tested-by: Guennadi Liakhovetski Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Reviewed-by: Guennadi Liakhovetski Link: https://lore.kernel.org/r/20201120214245.172963-4-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/rpmsg/virtio_rpmsg_bus.c | 32 +----------------------------- include/linux/rpmsg/ns.h | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 31 deletions(-) create mode 100644 include/linux/rpmsg/ns.h (limited to 'include/linux') diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 5259fbbc8e68..20d0cf909bea 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -93,34 +94,6 @@ struct rpmsg_hdr { u8 data[]; } __packed; -/** - * struct rpmsg_ns_msg - dynamic name service announcement message - * @name: name of remote service that is published - * @addr: address of remote service that is published - * @flags: indicates whether service is created or destroyed - * - * This message is sent across to publish a new service, or announce - * about its removal. When we receive these messages, an appropriate - * rpmsg channel (i.e device) is created/destroyed. In turn, the ->probe() - * or ->remove() handler of the appropriate rpmsg driver will be invoked - * (if/as-soon-as one is registered). 
- */
-struct rpmsg_ns_msg {
-	char name[RPMSG_NAME_SIZE];
-	__rpmsg32 addr;
-	__rpmsg32 flags;
-} __packed;
-
-/**
- * enum rpmsg_ns_flags - dynamic name service announcement flags
- *
- * @RPMSG_NS_CREATE: a new remote service was just created
- * @RPMSG_NS_DESTROY: a known remote service was just destroyed
- */
-enum rpmsg_ns_flags {
-	RPMSG_NS_CREATE		= 0,
-	RPMSG_NS_DESTROY	= 1,
-};

 /**
  * struct virtio_rpmsg_channel - rpmsg channel descriptor
@@ -167,9 +140,6 @@ struct virtio_rpmsg_channel {
  */
 #define RPMSG_RESERVED_ADDRESSES	(1024)

-/* Address 53 is reserved for advertising remote services */
-#define RPMSG_NS_ADDR			(53)
-
 static void virtio_rpmsg_destroy_ept(struct rpmsg_endpoint *ept);
 static int virtio_rpmsg_send(struct rpmsg_endpoint *ept, void *data, int len);
 static int virtio_rpmsg_sendto(struct rpmsg_endpoint *ept, void *data, int len,
diff --git a/include/linux/rpmsg/ns.h b/include/linux/rpmsg/ns.h
new file mode 100644
index 000000000000..73ecc91dc26f
--- /dev/null
+++ b/include/linux/rpmsg/ns.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_RPMSG_NS_H
+#define _LINUX_RPMSG_NS_H
+
+#include
+#include
+#include
+
+/**
+ * struct rpmsg_ns_msg - dynamic name service announcement message
+ * @name: name of remote service that is published
+ * @addr: address of remote service that is published
+ * @flags: indicates whether service is created or destroyed
+ *
+ * This message is sent across to publish a new service, or announce
+ * about its removal. When we receive these messages, an appropriate
+ * rpmsg channel (i.e device) is created/destroyed. In turn, the ->probe()
+ * or ->remove() handler of the appropriate rpmsg driver will be invoked
+ * (if/as-soon-as one is registered).
+ */
+struct rpmsg_ns_msg {
+	char name[RPMSG_NAME_SIZE];
+	__rpmsg32 addr;
+	__rpmsg32 flags;
+} __packed;
+
+/**
+ * enum rpmsg_ns_flags - dynamic name service announcement flags
+ *
+ * @RPMSG_NS_CREATE: a new remote service was just created
+ * @RPMSG_NS_DESTROY: a known remote service was just destroyed
+ */
+enum rpmsg_ns_flags {
+	RPMSG_NS_CREATE		= 0,
+	RPMSG_NS_DESTROY	= 1,
+};
+
+/* Address 53 is reserved for advertising remote services */
+#define RPMSG_NS_ADDR		(53)
+
+#endif
-- cgit

From 55488110acc1560b4599e3d509b13f2731a6fee1 Mon Sep 17 00:00:00 2001
From: Mathieu Poirier
Date: Fri, 20 Nov 2020 14:42:44 -0700
Subject: rpmsg: Make rpmsg_{register|unregister}_device() public

Make the rpmsg_register_device() and rpmsg_unregister_device() functions
public so that they can be used by other clients. While doing so, get rid
of two obsolete functions, i.e. register_rpmsg_device() and
unregister_rpmsg_device(), to prevent confusion.
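A short sketch of the now-public API from a transport's point of view (the
channel name and destination address below are hypothetical): registration
takes a prepared rpmsg_device, while removal is keyed by the parent device
plus a struct rpmsg_channel_info rather than by an rpmsg_device pointer:

#include <linux/rpmsg.h>

static int demo_add_channel(struct rpmsg_device *rpdev)
{
	/* rpdev->dev, ops, id.name, src and dst are assumed prepared */
	return rpmsg_register_device(rpdev);
}

static int demo_remove_channel(struct device *parent)
{
	struct rpmsg_channel_info chinfo = {
		.name	= "demo-service",	/* hypothetical name */
		.src	= RPMSG_ADDR_ANY,
		.dst	= 0x400,		/* hypothetical address */
	};

	/* was unregister_rpmsg_device(rpdev); now keyed by parent + chinfo */
	return rpmsg_unregister_device(parent, &chinfo);
}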
Tested-by: Guennadi Liakhovetski
Signed-off-by: Mathieu Poirier
Reviewed-by: Arnaud Pouliquen
Reviewed-by: Guennadi Liakhovetski
Link: https://lore.kernel.org/r/20201120214245.172963-8-mathieu.poirier@linaro.org
Signed-off-by: Bjorn Andersson
---
 drivers/rpmsg/rpmsg_internal.h | 4 ----
 include/linux/rpmsg.h          | 12 ++++++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rpmsg/rpmsg_internal.h b/drivers/rpmsg/rpmsg_internal.h
index f1de73e0f2d6..a76c344253bf 100644
--- a/drivers/rpmsg/rpmsg_internal.h
+++ b/drivers/rpmsg/rpmsg_internal.h
@@ -74,10 +74,6 @@ struct rpmsg_endpoint_ops {
 			poll_table *wait);
 };

-int rpmsg_register_device(struct rpmsg_device *rpdev);
-int rpmsg_unregister_device(struct device *parent,
-			    struct rpmsg_channel_info *chinfo);
-
 struct device *rpmsg_find_device(struct device *parent,
 				 struct rpmsg_channel_info *chinfo);

diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index faf2daff6238..a5db828b2420 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -164,8 +164,9 @@ static inline __rpmsg64 cpu_to_rpmsg64(struct rpmsg_device *rpdev, u64 val)

 #if IS_ENABLED(CONFIG_RPMSG)

-int register_rpmsg_device(struct rpmsg_device *dev);
-void unregister_rpmsg_device(struct rpmsg_device *dev);
+int rpmsg_register_device(struct rpmsg_device *rpdev);
+int rpmsg_unregister_device(struct device *parent,
+			    struct rpmsg_channel_info *chinfo);
 int __register_rpmsg_driver(struct rpmsg_driver *drv, struct module *owner);
 void unregister_rpmsg_driver(struct rpmsg_driver *drv);
 void rpmsg_destroy_ept(struct rpmsg_endpoint *);
@@ -188,15 +189,18 @@ __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp,

 #else

-static inline int register_rpmsg_device(struct rpmsg_device *dev)
+static inline int rpmsg_register_device(struct rpmsg_device *rpdev)
 {
 	return -ENXIO;
 }

-static inline void unregister_rpmsg_device(struct rpmsg_device *dev)
+static inline int rpmsg_unregister_device(struct device *parent,
+					  struct rpmsg_channel_info *chinfo)
 {
 	/* This shouldn't be possible */
 	WARN_ON(1);
+
+	return -ENXIO;
 }

 static inline int __register_rpmsg_driver(struct rpmsg_driver *drv,
-- cgit

From 950a7388f02bf775515d13dc508cb9d749bd6d91 Mon Sep 17 00:00:00 2001
From: Arnaud Pouliquen
Date: Fri, 20 Nov 2020 14:42:45 -0700
Subject: rpmsg: Turn name service into a stand alone driver

Make the RPMSG name service announcement a stand-alone driver so that it
can be reused by other subsystems. It is also the first step in making
the functionality transport independent, i.e. not tied to virtIO.

Reviewed-by: Guennadi Liakhovetski
Tested-by: Guennadi Liakhovetski
Co-developed-by: Mathieu Poirier
Signed-off-by: Mathieu Poirier
Co-developed-by: Guennadi Liakhovetski
Signed-off-by: Guennadi Liakhovetski
Signed-off-by: Arnaud Pouliquen
Link: https://lore.kernel.org/r/20201120214245.172963-9-mathieu.poirier@linaro.org
Signed-off-by: Bjorn Andersson
---
 drivers/rpmsg/Kconfig            | 9 +++
 drivers/rpmsg/Makefile           | 1 +
 drivers/rpmsg/rpmsg_ns.c         | 126 +++++++++++++++++++++++++++++++++++++++
 drivers/rpmsg/virtio_rpmsg_bus.c | 87 +++++++--------------------
 include/linux/rpmsg/ns.h         | 3 +
 5 files changed, 159 insertions(+), 67 deletions(-)
 create mode 100644 drivers/rpmsg/rpmsg_ns.c

(limited to 'include/linux')

diff --git a/drivers/rpmsg/Kconfig b/drivers/rpmsg/Kconfig
index f96716893c2a..0b4407abdf13 100644
--- a/drivers/rpmsg/Kconfig
+++ b/drivers/rpmsg/Kconfig
@@ -15,6 +15,14 @@ config RPMSG_CHAR
 	  in /dev.
They make it possible for user-space programs to send and receive rpmsg packets. +config RPMSG_NS + tristate "RPMSG name service announcement" + depends on RPMSG + help + Say Y here to enable the support of the name service announcement + channel that probes the associated RPMsg device on remote endpoint + service announcement. + config RPMSG_MTK_SCP tristate "MediaTek SCP" depends on MTK_SCP @@ -62,6 +70,7 @@ config RPMSG_VIRTIO tristate "Virtio RPMSG bus driver" depends on HAS_DMA select RPMSG + select RPMSG_NS select VIRTIO endmenu diff --git a/drivers/rpmsg/Makefile b/drivers/rpmsg/Makefile index ffe932ef6050..8d452656f0ee 100644 --- a/drivers/rpmsg/Makefile +++ b/drivers/rpmsg/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_RPMSG) += rpmsg_core.o obj-$(CONFIG_RPMSG_CHAR) += rpmsg_char.o +obj-$(CONFIG_RPMSG_NS) += rpmsg_ns.o obj-$(CONFIG_RPMSG_MTK_SCP) += mtk_rpmsg.o qcom_glink-objs := qcom_glink_native.o qcom_glink_ssr.o obj-$(CONFIG_RPMSG_QCOM_GLINK) += qcom_glink.o diff --git a/drivers/rpmsg/rpmsg_ns.c b/drivers/rpmsg/rpmsg_ns.c new file mode 100644 index 000000000000..762ff1ae279f --- /dev/null +++ b/drivers/rpmsg/rpmsg_ns.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) STMicroelectronics 2020 - All Rights Reserved + */ +#include +#include +#include +#include +#include +#include + +#include "rpmsg_internal.h" + +/** + * rpmsg_ns_register_device() - register name service device based on rpdev + * @rpdev: prepared rpdev to be used for creating endpoints + * + * This function wraps rpmsg_register_device() preparing the rpdev for use as + * basis for the rpmsg name service device. + */ +int rpmsg_ns_register_device(struct rpmsg_device *rpdev) +{ + strcpy(rpdev->id.name, "rpmsg_ns"); + rpdev->driver_override = "rpmsg_ns"; + rpdev->src = RPMSG_NS_ADDR; + rpdev->dst = RPMSG_NS_ADDR; + + return rpmsg_register_device(rpdev); +} +EXPORT_SYMBOL(rpmsg_ns_register_device); + +/* invoked when a name service announcement arrives */ +static int rpmsg_ns_cb(struct rpmsg_device *rpdev, void *data, int len, + void *priv, u32 src) +{ + struct rpmsg_ns_msg *msg = data; + struct rpmsg_device *newch; + struct rpmsg_channel_info chinfo; + struct device *dev = rpdev->dev.parent; + int ret; + +#if defined(CONFIG_DYNAMIC_DEBUG) + dynamic_hex_dump("NS announcement: ", DUMP_PREFIX_NONE, 16, 1, + data, len, true); +#endif + + if (len != sizeof(*msg)) { + dev_err(dev, "malformed ns msg (%d)\n", len); + return -EINVAL; + } + + /* don't trust the remote processor for null terminating the name */ + msg->name[RPMSG_NAME_SIZE - 1] = '\0'; + + strncpy(chinfo.name, msg->name, sizeof(chinfo.name)); + chinfo.src = RPMSG_ADDR_ANY; + chinfo.dst = rpmsg32_to_cpu(rpdev, msg->addr); + + dev_info(dev, "%sing channel %s addr 0x%x\n", + rpmsg32_to_cpu(rpdev, msg->flags) & RPMSG_NS_DESTROY ? + "destroy" : "creat", msg->name, chinfo.dst); + + if (rpmsg32_to_cpu(rpdev, msg->flags) & RPMSG_NS_DESTROY) { + ret = rpmsg_release_channel(rpdev, &chinfo); + if (ret) + dev_err(dev, "rpmsg_destroy_channel failed: %d\n", ret); + } else { + newch = rpmsg_create_channel(rpdev, &chinfo); + if (!newch) + dev_err(dev, "rpmsg_create_channel failed\n"); + } + + return 0; +} + +static int rpmsg_ns_probe(struct rpmsg_device *rpdev) +{ + struct rpmsg_endpoint *ns_ept; + struct rpmsg_channel_info ns_chinfo = { + .src = RPMSG_NS_ADDR, + .dst = RPMSG_NS_ADDR, + .name = "name_service", + }; + + /* + * Create the NS announcement service endpoint associated to the RPMsg + * device. 
The endpoint will be automatically destroyed when the RPMsg + * device will be deleted. + */ + ns_ept = rpmsg_create_ept(rpdev, rpmsg_ns_cb, NULL, ns_chinfo); + if (!ns_ept) { + dev_err(&rpdev->dev, "failed to create the ns ept\n"); + return -ENOMEM; + } + rpdev->ept = ns_ept; + + return 0; +} + +static struct rpmsg_driver rpmsg_ns_driver = { + .drv.name = KBUILD_MODNAME, + .probe = rpmsg_ns_probe, +}; + +static int rpmsg_ns_init(void) +{ + int ret; + + ret = register_rpmsg_driver(&rpmsg_ns_driver); + if (ret < 0) + pr_err("%s: Failed to register rpmsg driver\n", __func__); + + return ret; +} +postcore_initcall(rpmsg_ns_init); + +static void rpmsg_ns_exit(void) +{ + unregister_rpmsg_driver(&rpmsg_ns_driver); +} +module_exit(rpmsg_ns_exit); + +MODULE_DESCRIPTION("Name service announcement rpmsg driver"); +MODULE_AUTHOR("Arnaud Pouliquen "); +MODULE_ALIAS("rpmsg:" KBUILD_MODNAME); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 6ec299f7f790..e87d4cf926eb 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -49,7 +49,6 @@ * @endpoints_lock: lock of the endpoints set * @sendq: wait queue of sending contexts waiting for a tx buffers * @sleepers: number of senders that are waiting for a tx buffer - * @ns_ept: the bus's name service endpoint * * This structure stores the rpmsg state of a given virtio remote processor * device (there might be several virtio proc devices for each physical @@ -68,7 +67,6 @@ struct virtproc_info { struct mutex endpoints_lock; wait_queue_head_t sendq; atomic_t sleepers; - struct rpmsg_endpoint *ns_ept; }; /* The feature bitmap for virtio rpmsg */ @@ -815,69 +813,14 @@ static void rpmsg_xmit_done(struct virtqueue *svq) wake_up_interruptible(&vrp->sendq); } -/* invoked when a name service announcement arrives */ -static int rpmsg_ns_cb(struct rpmsg_device *rpdev, void *data, int len, - void *priv, u32 src) -{ - struct rpmsg_ns_msg *msg = data; - struct rpmsg_device *newch; - struct rpmsg_channel_info chinfo; - struct virtproc_info *vrp = priv; - struct device *dev = &vrp->vdev->dev; - bool little_endian = virtio_is_little_endian(vrp->vdev); - int ret; - -#if defined(CONFIG_DYNAMIC_DEBUG) - dynamic_hex_dump("NS announcement: ", DUMP_PREFIX_NONE, 16, 1, - data, len, true); -#endif - - if (len != sizeof(*msg)) { - dev_err(dev, "malformed ns msg (%d)\n", len); - return -EINVAL; - } - - /* - * the name service ept does _not_ belong to a real rpmsg channel, - * and is handled by the rpmsg bus itself. - * for sanity reasons, make sure a valid rpdev has _not_ sneaked - * in somehow. - */ - if (rpdev) { - dev_err(dev, "anomaly: ns ept has an rpdev handle\n"); - return -EINVAL; - } - - /* don't trust the remote processor for null terminating the name */ - msg->name[RPMSG_NAME_SIZE - 1] = '\0'; - - strncpy(chinfo.name, msg->name, sizeof(chinfo.name)); - chinfo.src = RPMSG_ADDR_ANY; - chinfo.dst = __rpmsg32_to_cpu(little_endian, msg->addr); - - dev_info(dev, "%sing channel %s addr 0x%x\n", - __rpmsg32_to_cpu(little_endian, msg->flags) & RPMSG_NS_DESTROY ? 
- "destroy" : "creat", msg->name, chinfo.dst); - - if (__rpmsg32_to_cpu(little_endian, msg->flags) & RPMSG_NS_DESTROY) { - ret = rpmsg_unregister_device(&vrp->vdev->dev, &chinfo); - if (ret) - dev_err(dev, "rpmsg_destroy_channel failed: %d\n", ret); - } else { - newch = __rpmsg_create_channel(vrp, &chinfo); - if (!newch) - dev_err(dev, "rpmsg_create_channel failed\n"); - } - - return 0; -} - static int rpmsg_probe(struct virtio_device *vdev) { vq_callback_t *vq_cbs[] = { rpmsg_recv_done, rpmsg_xmit_done }; static const char * const names[] = { "input", "output" }; struct virtqueue *vqs[2]; struct virtproc_info *vrp; + struct virtio_rpmsg_channel *vch; + struct rpmsg_device *rpdev_ns; void *bufs_va; int err = 0, i; size_t total_buf_space; @@ -953,14 +896,26 @@ static int rpmsg_probe(struct virtio_device *vdev) /* if supported by the remote processor, enable the name service */ if (virtio_has_feature(vdev, VIRTIO_RPMSG_F_NS)) { - /* a dedicated endpoint handles the name service msgs */ - vrp->ns_ept = __rpmsg_create_ept(vrp, NULL, rpmsg_ns_cb, - vrp, RPMSG_NS_ADDR); - if (!vrp->ns_ept) { - dev_err(&vdev->dev, "failed to create the ns ept\n"); + vch = kzalloc(sizeof(*vch), GFP_KERNEL); + if (!vch) { err = -ENOMEM; goto free_coherent; } + + /* Link the channel to our vrp */ + vch->vrp = vrp; + + /* Assign public information to the rpmsg_device */ + rpdev_ns = &vch->rpdev; + rpdev_ns->ops = &virtio_rpmsg_ops; + rpdev_ns->little_endian = virtio_is_little_endian(vrp->vdev); + + rpdev_ns->dev.parent = &vrp->vdev->dev; + rpdev_ns->dev.release = virtio_rpmsg_release_device; + + err = rpmsg_ns_register_device(rpdev_ns); + if (err) + goto free_coherent; } /* @@ -985,6 +940,7 @@ static int rpmsg_probe(struct virtio_device *vdev) return 0; free_coherent: + kfree(vch); dma_free_coherent(vdev->dev.parent, total_buf_space, bufs_va, vrp->bufs_dma); vqs_del: @@ -1013,9 +969,6 @@ static void rpmsg_remove(struct virtio_device *vdev) if (ret) dev_warn(&vdev->dev, "can't remove rpmsg device: %d\n", ret); - if (vrp->ns_ept) - __rpmsg_destroy_ept(vrp, vrp->ns_ept); - idr_destroy(&vrp->endpoints); vdev->config->del_vqs(vrp->vdev); diff --git a/include/linux/rpmsg/ns.h b/include/linux/rpmsg/ns.h index 73ecc91dc26f..a7804edd6d58 100644 --- a/include/linux/rpmsg/ns.h +++ b/include/linux/rpmsg/ns.h @@ -4,6 +4,7 @@ #define _LINUX_RPMSG_NS_H #include +#include #include #include @@ -39,4 +40,6 @@ enum rpmsg_ns_flags { /* Address 53 is reserved for advertising remote services */ #define RPMSG_NS_ADDR (53) +int rpmsg_ns_register_device(struct rpmsg_device *rpdev); + #endif -- cgit From e7bbb7acabf47d74672e0e314bed4d452d2097b4 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Mon, 9 Nov 2020 14:24:49 +0530 Subject: dmaengine: add peripheral configuration Some complex dmaengine controllers have capability to program the peripheral device, so pass on the peripheral configuration as part of dma_slave_config Link: https://lore.kernel.org/r/20201109085450.24843-3-vkoul@kernel.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index dd357a747780..493a047ed0a2 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -418,6 +418,9 @@ enum dma_slave_buswidth { * @slave_id: Slave requester id. Only valid for slave channels. The dma * slave peripheral will have unique id as dma requester which need to be * pass as slave config. 
+ * @peripheral_config: peripheral configuration for programming peripheral
+ * for dmaengine transfer
+ * @peripheral_size: peripheral configuration buffer size
  *
  * This struct is passed in as configuration data to a DMA engine
  * in order to set up a certain channel for DMA transport at runtime.
@@ -443,6 +446,8 @@ struct dma_slave_config {
 	u32 dst_port_window_size;
 	bool device_fc;
 	unsigned int slave_id;
+	void *peripheral_config;
+	size_t peripheral_size;
 };

 /**
-- cgit

From 5d0c3533a19f48e5e7e73806a3e4b29cd4364130 Mon Sep 17 00:00:00 2001
From: Vinod Koul
Date: Mon, 9 Nov 2020 14:24:50 +0530
Subject: dmaengine: qcom: Add GPI dma driver

This controller provides DMAengine capabilities for a variety of
peripheral buses such as I2C, UART, and SPI. By using the GPI dmaengine
driver, bus drivers can use a standardized, protocol-independent
interface to transfer data between memory and the peripheral.

Link: https://lore.kernel.org/r/20201109085450.24843-4-vkoul@kernel.org
Signed-off-by: Vinod Koul
---
 drivers/dma/qcom/Kconfig         | 12 +
 drivers/dma/qcom/Makefile        | 1 +
 drivers/dma/qcom/gpi.c           | 2303 ++++++++++++++++++++++++++++++++++++++
 include/linux/dma/qcom-gpi-dma.h | 83 ++
 4 files changed, 2399 insertions(+)
 create mode 100644 drivers/dma/qcom/gpi.c
 create mode 100644 include/linux/dma/qcom-gpi-dma.h

(limited to 'include/linux')

diff --git a/drivers/dma/qcom/Kconfig b/drivers/dma/qcom/Kconfig
index 0389d60d2604..365f94eb3b08 100644
--- a/drivers/dma/qcom/Kconfig
+++ b/drivers/dma/qcom/Kconfig
@@ -19,6 +19,18 @@ config QCOM_BAM_DMA
 	  Enable support for the QCOM BAM DMA controller. This controller
 	  provides DMA capabilities for a variety of on-chip devices.

+config QCOM_GPI_DMA
+	tristate "Qualcomm Technologies GPI DMA support"
+	depends on ARCH_QCOM
+	select DMA_ENGINE
+	select DMA_VIRTUAL_CHANNELS
+	help
+	  Enable support for the QCOM GPI DMA controller. This controller
+	  provides DMA capabilities for a variety of peripheral buses such
+	  as I2C, UART, and SPI. By using the GPI dmaengine driver, bus
+	  drivers can use a standardized, protocol-independent interface to
+	  transfer data between DDR and peripheral.
+
 config QCOM_HIDMA_MGMT
 	tristate "Qualcomm Technologies HIDMA Management support"
 	select DMA_ENGINE
diff --git a/drivers/dma/qcom/Makefile b/drivers/dma/qcom/Makefile
index 346e643fbb6d..50f1e7014693 100644
--- a/drivers/dma/qcom/Makefile
+++ b/drivers/dma/qcom/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_QCOM_ADM) += qcom_adm.o
 obj-$(CONFIG_QCOM_BAM_DMA) += bam_dma.o
+obj-$(CONFIG_QCOM_GPI_DMA) += gpi.o
 obj-$(CONFIG_QCOM_HIDMA_MGMT) += hdma_mgmt.o
 hdma_mgmt-objs	 := hidma_mgmt.o hidma_mgmt_sys.o
 obj-$(CONFIG_QCOM_HIDMA) += hdma.o
diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
new file mode 100644
index 000000000000..d2334f535de2
--- /dev/null
+++ b/drivers/dma/qcom/gpi.c
@@ -0,0 +1,2303 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2017-2020, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2020, Linaro Limited + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../dmaengine.h" +#include "../virt-dma.h" + +#define TRE_TYPE_DMA 0x10 +#define TRE_TYPE_GO 0x20 +#define TRE_TYPE_CONFIG0 0x22 + +/* TRE flags */ +#define TRE_FLAGS_CHAIN BIT(0) +#define TRE_FLAGS_IEOB BIT(8) +#define TRE_FLAGS_IEOT BIT(9) +#define TRE_FLAGS_BEI BIT(10) +#define TRE_FLAGS_LINK BIT(11) +#define TRE_FLAGS_TYPE GENMASK(23, 16) + +/* SPI CONFIG0 WD0 */ +#define TRE_SPI_C0_WORD_SZ GENMASK(4, 0) +#define TRE_SPI_C0_LOOPBACK BIT(8) +#define TRE_SPI_C0_CS BIT(11) +#define TRE_SPI_C0_CPHA BIT(12) +#define TRE_SPI_C0_CPOL BIT(13) +#define TRE_SPI_C0_TX_PACK BIT(24) +#define TRE_SPI_C0_RX_PACK BIT(25) + +/* CONFIG0 WD2 */ +#define TRE_C0_CLK_DIV GENMASK(11, 0) +#define TRE_C0_CLK_SRC GENMASK(19, 16) + +/* SPI GO WD0 */ +#define TRE_SPI_GO_CMD GENMASK(4, 0) +#define TRE_SPI_GO_CS GENMASK(10, 8) +#define TRE_SPI_GO_FRAG BIT(26) + +/* GO WD2 */ +#define TRE_RX_LEN GENMASK(23, 0) + +/* I2C Config0 WD0 */ +#define TRE_I2C_C0_TLOW GENMASK(7, 0) +#define TRE_I2C_C0_THIGH GENMASK(15, 8) +#define TRE_I2C_C0_TCYL GENMASK(23, 16) +#define TRE_I2C_C0_TX_PACK BIT(24) +#define TRE_I2C_C0_RX_PACK BIT(25) + +/* I2C GO WD0 */ +#define TRE_I2C_GO_CMD GENMASK(4, 0) +#define TRE_I2C_GO_ADDR GENMASK(14, 8) +#define TRE_I2C_GO_STRETCH BIT(26) + +/* DMA TRE */ +#define TRE_DMA_LEN GENMASK(23, 0) + +/* Register offsets from gpi-top */ +#define GPII_n_CH_k_CNTXT_0_OFFS(n, k) (0x20000 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_CH_k_CNTXT_0_EL_SIZE GENMASK(31, 24) +#define GPII_n_CH_k_CNTXT_0_CHSTATE GENMASK(23, 20) +#define GPII_n_CH_k_CNTXT_0_ERIDX GENMASK(18, 14) +#define GPII_n_CH_k_CNTXT_0_DIR BIT(3) +#define GPII_n_CH_k_CNTXT_0_PROTO GENMASK(2, 0) + +#define GPII_n_CH_k_CNTXT_0(el_size, erindex, dir, chtype_proto) \ + (FIELD_PREP(GPII_n_CH_k_CNTXT_0_EL_SIZE, el_size) | \ + FIELD_PREP(GPII_n_CH_k_CNTXT_0_ERIDX, erindex) | \ + FIELD_PREP(GPII_n_CH_k_CNTXT_0_DIR, dir) | \ + FIELD_PREP(GPII_n_CH_k_CNTXT_0_PROTO, chtype_proto)) + +#define GPI_CHTYPE_DIR_IN (0) +#define GPI_CHTYPE_DIR_OUT (1) + +#define GPI_CHTYPE_PROTO_GPI (0x2) + +#define GPII_n_CH_k_DOORBELL_0_OFFS(n, k) (0x22000 + (0x4000 * (n)) + (0x8 * (k))) +#define GPII_n_CH_CMD_OFFS(n) (0x23008 + (0x4000 * (n))) +#define GPII_n_CH_CMD_OPCODE GENMASK(31, 24) +#define GPII_n_CH_CMD_CHID GENMASK(7, 0) +#define GPII_n_CH_CMD(opcode, chid) \ + (FIELD_PREP(GPII_n_CH_CMD_OPCODE, opcode) | \ + FIELD_PREP(GPII_n_CH_CMD_CHID, chid)) + +#define GPII_n_CH_CMD_ALLOCATE (0) +#define GPII_n_CH_CMD_START (1) +#define GPII_n_CH_CMD_STOP (2) +#define GPII_n_CH_CMD_RESET (9) +#define GPII_n_CH_CMD_DE_ALLOC (10) +#define GPII_n_CH_CMD_UART_SW_STALE (32) +#define GPII_n_CH_CMD_UART_RFR_READY (33) +#define GPII_n_CH_CMD_UART_RFR_NOT_READY (34) + +/* EV Context Array */ +#define GPII_n_EV_CH_k_CNTXT_0_OFFS(n, k) (0x21000 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_EV_k_CNTXT_0_EL_SIZE GENMASK(31, 24) +#define GPII_n_EV_k_CNTXT_0_CHSTATE GENMASK(23, 20) +#define GPII_n_EV_k_CNTXT_0_INTYPE BIT(16) +#define GPII_n_EV_k_CNTXT_0_CHTYPE GENMASK(3, 0) + +#define GPII_n_EV_k_CNTXT_0(el_size, inttype, chtype) \ + (FIELD_PREP(GPII_n_EV_k_CNTXT_0_EL_SIZE, el_size) | \ + FIELD_PREP(GPII_n_EV_k_CNTXT_0_INTYPE, inttype) | \ + FIELD_PREP(GPII_n_EV_k_CNTXT_0_CHTYPE, chtype)) + +#define GPI_INTTYPE_IRQ (1) +#define GPI_CHTYPE_GPI_EV (0x2) + +enum CNTXT_OFFS { + CNTXT_0_CONFIG = 0x0, + CNTXT_1_R_LENGTH = 0x4, + 
CNTXT_2_RING_BASE_LSB = 0x8, + CNTXT_3_RING_BASE_MSB = 0xC, + CNTXT_4_RING_RP_LSB = 0x10, + CNTXT_5_RING_RP_MSB = 0x14, + CNTXT_6_RING_WP_LSB = 0x18, + CNTXT_7_RING_WP_MSB = 0x1C, + CNTXT_8_RING_INT_MOD = 0x20, + CNTXT_9_RING_INTVEC = 0x24, + CNTXT_10_RING_MSI_LSB = 0x28, + CNTXT_11_RING_MSI_MSB = 0x2C, + CNTXT_12_RING_RP_UPDATE_LSB = 0x30, + CNTXT_13_RING_RP_UPDATE_MSB = 0x34, +}; + +#define GPII_n_EV_CH_k_DOORBELL_0_OFFS(n, k) (0x22100 + (0x4000 * (n)) + (0x8 * (k))) +#define GPII_n_EV_CH_CMD_OFFS(n) (0x23010 + (0x4000 * (n))) +#define GPII_n_EV_CMD_OPCODE GENMASK(31, 24) +#define GPII_n_EV_CMD_CHID GENMASK(7, 0) +#define GPII_n_EV_CMD(opcode, chid) \ + (FIELD_PREP(GPII_n_EV_CMD_OPCODE, opcode) | \ + FIELD_PREP(GPII_n_EV_CMD_CHID, chid)) + +#define GPII_n_EV_CH_CMD_ALLOCATE (0x00) +#define GPII_n_EV_CH_CMD_RESET (0x09) +#define GPII_n_EV_CH_CMD_DE_ALLOC (0x0A) + +#define GPII_n_CNTXT_TYPE_IRQ_OFFS(n) (0x23080 + (0x4000 * (n))) + +/* mask type register */ +#define GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(n) (0x23088 + (0x4000 * (n))) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK GENMASK(6, 0) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_GENERAL BIT(6) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB BIT(3) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB BIT(2) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL BIT(1) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL BIT(0) + +#define GPII_n_CNTXT_SRC_GPII_CH_IRQ_OFFS(n) (0x23090 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_OFFS(n) (0x23094 + (0x4000 * (n))) + +/* Mask channel control interrupt register */ +#define GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(n) (0x23098 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK GENMASK(1, 0) + +/* Mask event control interrupt register */ +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(n) (0x2309C + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK BIT(0) + +#define GPII_n_CNTXT_SRC_CH_IRQ_CLR_OFFS(n) (0x230A0 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_CLR_OFFS(n) (0x230A4 + (0x4000 * (n))) + +/* Mask event interrupt register */ +#define GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(n) (0x230B8 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK BIT(0) + +#define GPII_n_CNTXT_SRC_IEOB_IRQ_CLR_OFFS(n) (0x230C0 + (0x4000 * (n))) +#define GPII_n_CNTXT_GLOB_IRQ_STTS_OFFS(n) (0x23100 + (0x4000 * (n))) +#define GPI_GLOB_IRQ_ERROR_INT_MSK BIT(0) + +/* GPII specific Global - Enable bit register */ +#define GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(n) (0x23108 + (0x4000 * (n))) +#define GPII_n_CNTXT_GLOB_IRQ_CLR_OFFS(n) (0x23110 + (0x4000 * (n))) +#define GPII_n_CNTXT_GPII_IRQ_STTS_OFFS(n) (0x23118 + (0x4000 * (n))) + +/* GPII general interrupt - Enable bit register */ +#define GPII_n_CNTXT_GPII_IRQ_EN_OFFS(n) (0x23120 + (0x4000 * (n))) +#define GPII_n_CNTXT_GPII_IRQ_EN_BMSK GENMASK(3, 0) + +#define GPII_n_CNTXT_GPII_IRQ_CLR_OFFS(n) (0x23128 + (0x4000 * (n))) + +/* GPII Interrupt Type register */ +#define GPII_n_CNTXT_INTSET_OFFS(n) (0x23180 + (0x4000 * (n))) +#define GPII_n_CNTXT_INTSET_BMSK BIT(0) + +#define GPII_n_CNTXT_MSI_BASE_LSB_OFFS(n) (0x23188 + (0x4000 * (n))) +#define GPII_n_CNTXT_MSI_BASE_MSB_OFFS(n) (0x2318C + (0x4000 * (n))) +#define GPII_n_CNTXT_SCRATCH_0_OFFS(n) (0x23400 + (0x4000 * (n))) +#define GPII_n_CNTXT_SCRATCH_1_OFFS(n) (0x23404 + (0x4000 * (n))) + +#define GPII_n_ERROR_LOG_OFFS(n) (0x23200 + (0x4000 * (n))) + +/* QOS Registers */ +#define GPII_n_CH_k_QOS_OFFS(n, k) (0x2005C + (0x4000 * (n)) + (0x80 * (k))) + +/* Scratch registers */ +#define GPII_n_CH_k_SCRATCH_0_OFFS(n, k) (0x20060 + (0x4000 * (n)) + (0x80 * (k))) +#define 
GPII_n_CH_k_SCRATCH_0_SEID GENMASK(2, 0) +#define GPII_n_CH_k_SCRATCH_0_PROTO GENMASK(7, 4) +#define GPII_n_CH_k_SCRATCH_0_PAIR GENMASK(20, 16) +#define GPII_n_CH_k_SCRATCH_0(pair, proto, seid) \ + (FIELD_PREP(GPII_n_CH_k_SCRATCH_0_PAIR, pair) | \ + FIELD_PREP(GPII_n_CH_k_SCRATCH_0_PROTO, proto) | \ + FIELD_PREP(GPII_n_CH_k_SCRATCH_0_SEID, seid)) +#define GPII_n_CH_k_SCRATCH_1_OFFS(n, k) (0x20064 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_CH_k_SCRATCH_2_OFFS(n, k) (0x20068 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_CH_k_SCRATCH_3_OFFS(n, k) (0x2006C + (0x4000 * (n)) + (0x80 * (k))) + +struct __packed gpi_tre { + u32 dword[4]; +}; + +enum msm_gpi_tce_code { + MSM_GPI_TCE_SUCCESS = 1, + MSM_GPI_TCE_EOT = 2, + MSM_GPI_TCE_EOB = 4, + MSM_GPI_TCE_UNEXP_ERR = 16, +}; + +#define CMD_TIMEOUT_MS (250) + +#define MAX_CHANNELS_PER_GPII (2) +#define GPI_TX_CHAN (0) +#define GPI_RX_CHAN (1) +#define STATE_IGNORE (U32_MAX) +#define EV_FACTOR (2) +#define REQ_OF_DMA_ARGS (5) /* # of arguments required from client */ +#define CHAN_TRES 64 + +struct __packed xfer_compl_event { + u64 ptr; + u32 length:24; + u8 code; + u16 status; + u8 type; + u8 chid; +}; + +struct __packed immediate_data_event { + u8 data_bytes[8]; + u8 length:4; + u8 resvd:4; + u16 tre_index; + u8 code; + u16 status; + u8 type; + u8 chid; +}; + +struct __packed qup_notif_event { + u32 status; + u32 time; + u32 count:24; + u8 resvd; + u16 resvd1; + u8 type; + u8 chid; +}; + +struct __packed gpi_ere { + u32 dword[4]; +}; + +enum GPI_EV_TYPE { + XFER_COMPLETE_EV_TYPE = 0x22, + IMMEDIATE_DATA_EV_TYPE = 0x30, + QUP_NOTIF_EV_TYPE = 0x31, + STALE_EV_TYPE = 0xFF, +}; + +union __packed gpi_event { + struct __packed xfer_compl_event xfer_compl_event; + struct __packed immediate_data_event immediate_data_event; + struct __packed qup_notif_event qup_notif_event; + struct __packed gpi_ere gpi_ere; +}; + +enum gpii_irq_settings { + DEFAULT_IRQ_SETTINGS, + MASK_IEOB_SETTINGS, +}; + +enum gpi_ev_state { + DEFAULT_EV_CH_STATE = 0, + EV_STATE_NOT_ALLOCATED = DEFAULT_EV_CH_STATE, + EV_STATE_ALLOCATED, + MAX_EV_STATES +}; + +static const char *const gpi_ev_state_str[MAX_EV_STATES] = { + [EV_STATE_NOT_ALLOCATED] = "NOT ALLOCATED", + [EV_STATE_ALLOCATED] = "ALLOCATED", +}; + +#define TO_GPI_EV_STATE_STR(_state) (((_state) >= MAX_EV_STATES) ? 
\ + "INVALID" : gpi_ev_state_str[(_state)]) + +enum gpi_ch_state { + DEFAULT_CH_STATE = 0x0, + CH_STATE_NOT_ALLOCATED = DEFAULT_CH_STATE, + CH_STATE_ALLOCATED = 0x1, + CH_STATE_STARTED = 0x2, + CH_STATE_STOPPED = 0x3, + CH_STATE_STOP_IN_PROC = 0x4, + CH_STATE_ERROR = 0xf, + MAX_CH_STATES +}; + +enum gpi_cmd { + GPI_CH_CMD_BEGIN, + GPI_CH_CMD_ALLOCATE = GPI_CH_CMD_BEGIN, + GPI_CH_CMD_START, + GPI_CH_CMD_STOP, + GPI_CH_CMD_RESET, + GPI_CH_CMD_DE_ALLOC, + GPI_CH_CMD_UART_SW_STALE, + GPI_CH_CMD_UART_RFR_READY, + GPI_CH_CMD_UART_RFR_NOT_READY, + GPI_CH_CMD_END = GPI_CH_CMD_UART_RFR_NOT_READY, + GPI_EV_CMD_BEGIN, + GPI_EV_CMD_ALLOCATE = GPI_EV_CMD_BEGIN, + GPI_EV_CMD_RESET, + GPI_EV_CMD_DEALLOC, + GPI_EV_CMD_END = GPI_EV_CMD_DEALLOC, + GPI_MAX_CMD, +}; + +#define IS_CHAN_CMD(_cmd) ((_cmd) <= GPI_CH_CMD_END) + +static const char *const gpi_cmd_str[GPI_MAX_CMD] = { + [GPI_CH_CMD_ALLOCATE] = "CH ALLOCATE", + [GPI_CH_CMD_START] = "CH START", + [GPI_CH_CMD_STOP] = "CH STOP", + [GPI_CH_CMD_RESET] = "CH_RESET", + [GPI_CH_CMD_DE_ALLOC] = "DE ALLOC", + [GPI_CH_CMD_UART_SW_STALE] = "UART SW STALE", + [GPI_CH_CMD_UART_RFR_READY] = "UART RFR READY", + [GPI_CH_CMD_UART_RFR_NOT_READY] = "UART RFR NOT READY", + [GPI_EV_CMD_ALLOCATE] = "EV ALLOCATE", + [GPI_EV_CMD_RESET] = "EV RESET", + [GPI_EV_CMD_DEALLOC] = "EV DEALLOC", +}; + +#define TO_GPI_CMD_STR(_cmd) (((_cmd) >= GPI_MAX_CMD) ? "INVALID" : \ + gpi_cmd_str[(_cmd)]) + +/* + * @DISABLE_STATE: no register access allowed + * @CONFIG_STATE: client has configured the channel + * @PREP_HARDWARE: register access is allowed + * however, no processing EVENTS + * @ACTIVE_STATE: channels are fully operational + * @PREPARE_TERMINATE: graceful termination of channels + * register access is allowed + * @PAUSE_STATE: channels are active, but not processing any events + */ +enum gpi_pm_state { + DISABLE_STATE, + CONFIG_STATE, + PREPARE_HARDWARE, + ACTIVE_STATE, + PREPARE_TERMINATE, + PAUSE_STATE, + MAX_PM_STATE +}; + +#define REG_ACCESS_VALID(_pm_state) ((_pm_state) >= PREPARE_HARDWARE) + +static const char *const gpi_pm_state_str[MAX_PM_STATE] = { + [DISABLE_STATE] = "DISABLE", + [CONFIG_STATE] = "CONFIG", + [PREPARE_HARDWARE] = "PREPARE HARDWARE", + [ACTIVE_STATE] = "ACTIVE", + [PREPARE_TERMINATE] = "PREPARE TERMINATE", + [PAUSE_STATE] = "PAUSE", +}; + +#define TO_GPI_PM_STR(_state) (((_state) >= MAX_PM_STATE) ? 
\ + "INVALID" : gpi_pm_state_str[(_state)]) + +static const struct { + enum gpi_cmd gpi_cmd; + u32 opcode; + u32 state; +} gpi_cmd_info[GPI_MAX_CMD] = { + { + GPI_CH_CMD_ALLOCATE, + GPII_n_CH_CMD_ALLOCATE, + CH_STATE_ALLOCATED, + }, + { + GPI_CH_CMD_START, + GPII_n_CH_CMD_START, + CH_STATE_STARTED, + }, + { + GPI_CH_CMD_STOP, + GPII_n_CH_CMD_STOP, + CH_STATE_STOPPED, + }, + { + GPI_CH_CMD_RESET, + GPII_n_CH_CMD_RESET, + CH_STATE_ALLOCATED, + }, + { + GPI_CH_CMD_DE_ALLOC, + GPII_n_CH_CMD_DE_ALLOC, + CH_STATE_NOT_ALLOCATED, + }, + { + GPI_CH_CMD_UART_SW_STALE, + GPII_n_CH_CMD_UART_SW_STALE, + STATE_IGNORE, + }, + { + GPI_CH_CMD_UART_RFR_READY, + GPII_n_CH_CMD_UART_RFR_READY, + STATE_IGNORE, + }, + { + GPI_CH_CMD_UART_RFR_NOT_READY, + GPII_n_CH_CMD_UART_RFR_NOT_READY, + STATE_IGNORE, + }, + { + GPI_EV_CMD_ALLOCATE, + GPII_n_EV_CH_CMD_ALLOCATE, + EV_STATE_ALLOCATED, + }, + { + GPI_EV_CMD_RESET, + GPII_n_EV_CH_CMD_RESET, + EV_STATE_ALLOCATED, + }, + { + GPI_EV_CMD_DEALLOC, + GPII_n_EV_CH_CMD_DE_ALLOC, + EV_STATE_NOT_ALLOCATED, + }, +}; + +struct gpi_ring { + void *pre_aligned; + size_t alloc_size; + phys_addr_t phys_addr; + dma_addr_t dma_handle; + void *base; + void *wp; + void *rp; + u32 len; + u32 el_size; + u32 elements; + bool configured; +}; + +struct gpi_dev { + struct dma_device dma_device; + struct device *dev; + struct resource *res; + void __iomem *regs; + void __iomem *ee_base; /*ee register base address*/ + u32 max_gpii; /* maximum # of gpii instances available per gpi block */ + u32 gpii_mask; /* gpii instances available for apps */ + u32 ev_factor; /* ev ring length factor */ + struct gpii *gpiis; +}; + +struct reg_info { + char *name; + u32 offset; + u32 val; +}; + +struct gchan { + struct virt_dma_chan vc; + u32 chid; + u32 seid; + u32 protocol; + struct gpii *gpii; + enum gpi_ch_state ch_state; + enum gpi_pm_state pm_state; + void __iomem *ch_cntxt_base_reg; + void __iomem *ch_cntxt_db_reg; + void __iomem *ch_cmd_reg; + u32 dir; + struct gpi_ring ch_ring; + void *config; +}; + +struct gpii { + u32 gpii_id; + struct gchan gchan[MAX_CHANNELS_PER_GPII]; + struct gpi_dev *gpi_dev; + int irq; + void __iomem *regs; /* points to gpi top */ + void __iomem *ev_cntxt_base_reg; + void __iomem *ev_cntxt_db_reg; + void __iomem *ev_ring_rp_lsb_reg; + void __iomem *ev_cmd_reg; + void __iomem *ieob_clr_reg; + struct mutex ctrl_lock; + enum gpi_ev_state ev_state; + bool configured_irq; + enum gpi_pm_state pm_state; + rwlock_t pm_lock; + struct gpi_ring ev_ring; + struct tasklet_struct ev_task; /* event processing tasklet */ + struct completion cmd_completion; + enum gpi_cmd gpi_cmd; + u32 cntxt_type_irq_msk; + bool ieob_set; +}; + +#define MAX_TRE 3 + +struct gpi_desc { + struct virt_dma_desc vd; + size_t len; + void *db; /* DB register to program */ + struct gchan *gchan; + struct gpi_tre tre[MAX_TRE]; + u32 num_tre; +}; + +static const u32 GPII_CHAN_DIR[MAX_CHANNELS_PER_GPII] = { + GPI_CHTYPE_DIR_OUT, GPI_CHTYPE_DIR_IN +}; + +static irqreturn_t gpi_handle_irq(int irq, void *data); +static void gpi_ring_recycle_ev_element(struct gpi_ring *ring); +static int gpi_ring_add_element(struct gpi_ring *ring, void **wp); +static void gpi_process_events(struct gpii *gpii); + +static inline struct gchan *to_gchan(struct dma_chan *dma_chan) +{ + return container_of(dma_chan, struct gchan, vc.chan); +} + +static inline struct gpi_desc *to_gpi_desc(struct virt_dma_desc *vd) +{ + return container_of(vd, struct gpi_desc, vd); +} + +static inline phys_addr_t to_physical(const struct gpi_ring *const ring, + 
void *addr) +{ + return ring->phys_addr + (addr - ring->base); +} + +static inline void *to_virtual(const struct gpi_ring *const ring, phys_addr_t addr) +{ + return ring->base + (addr - ring->phys_addr); +} + +static inline u32 gpi_read_reg(struct gpii *gpii, void __iomem *addr) +{ + return readl_relaxed(addr); +} + +static inline void gpi_write_reg(struct gpii *gpii, void __iomem *addr, u32 val) +{ + writel_relaxed(val, addr); +} + +/* gpi_write_reg_field - write to specific bit field */ +static inline void gpi_write_reg_field(struct gpii *gpii, void __iomem *addr, + u32 mask, u32 shift, u32 val) +{ + u32 tmp = gpi_read_reg(gpii, addr); + + tmp &= ~mask; + val = tmp | ((val << shift) & mask); + gpi_write_reg(gpii, addr, val); +} + +static inline void +gpi_update_reg(struct gpii *gpii, u32 offset, u32 mask, u32 val) +{ + void __iomem *addr = gpii->regs + offset; + u32 tmp = gpi_read_reg(gpii, addr); + + tmp &= ~mask; + tmp |= u32_encode_bits(val, mask); + + gpi_write_reg(gpii, addr, tmp); +} + +static void gpi_disable_interrupts(struct gpii *gpii) +{ + gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_GPII_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_INTSET_OFFS(gpii->gpii_id), + GPII_n_CNTXT_INTSET_BMSK, 0); + + gpii->cntxt_type_irq_msk = 0; + devm_free_irq(gpii->gpi_dev->dev, gpii->irq, gpii); + gpii->configured_irq = false; +} + +/* configure and enable interrupts */ +static int gpi_config_interrupts(struct gpii *gpii, enum gpii_irq_settings settings, bool mask) +{ + const u32 enable = (GPII_n_CNTXT_TYPE_IRQ_MSK_GENERAL | + GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB | + GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB | + GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL | + GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL); + int ret; + + if (!gpii->configured_irq) { + ret = devm_request_irq(gpii->gpi_dev->dev, gpii->irq, + gpi_handle_irq, IRQF_TRIGGER_HIGH, + "gpi-dma", gpii); + if (ret < 0) { + dev_err(gpii->gpi_dev->dev, "error request irq:%d ret:%d\n", + gpii->irq, ret); + return ret; + } + } + + if (settings == MASK_IEOB_SETTINGS) { + /* + * GPII only uses one EV ring per gpii so we can globally + * enable/disable IEOB interrupt + */ + if (mask) + gpii->cntxt_type_irq_msk |= GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB; + else + gpii->cntxt_type_irq_msk &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB); + gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, gpii->cntxt_type_irq_msk); + } else { + gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, enable); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK, + GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK, + GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK, + 
GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, + GPII_n_CNTXT_GPII_IRQ_EN_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_GPII_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, GPII_n_CNTXT_GPII_IRQ_EN_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_MSI_BASE_LSB_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_MSI_BASE_MSB_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SCRATCH_0_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SCRATCH_1_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_INTSET_OFFS(gpii->gpii_id), + GPII_n_CNTXT_INTSET_BMSK, 1); + gpi_update_reg(gpii, GPII_n_ERROR_LOG_OFFS(gpii->gpii_id), U32_MAX, 0); + + gpii->cntxt_type_irq_msk = enable; + } + + gpii->configured_irq = true; + return 0; +} + +/* Sends gpii event or channel command */ +static int gpi_send_cmd(struct gpii *gpii, struct gchan *gchan, + enum gpi_cmd gpi_cmd) +{ + u32 chid = MAX_CHANNELS_PER_GPII; + unsigned long timeout; + void __iomem *cmd_reg; + u32 cmd; + + if (gpi_cmd >= GPI_MAX_CMD) + return -EINVAL; + if (IS_CHAN_CMD(gpi_cmd)) + chid = gchan->chid; + + dev_dbg(gpii->gpi_dev->dev, + "sending cmd: %s:%u\n", TO_GPI_CMD_STR(gpi_cmd), chid); + + /* send opcode and wait for completion */ + reinit_completion(&gpii->cmd_completion); + gpii->gpi_cmd = gpi_cmd; + + cmd_reg = IS_CHAN_CMD(gpi_cmd) ? gchan->ch_cmd_reg : gpii->ev_cmd_reg; + cmd = IS_CHAN_CMD(gpi_cmd) ? GPII_n_CH_CMD(gpi_cmd_info[gpi_cmd].opcode, chid) : + GPII_n_EV_CMD(gpi_cmd_info[gpi_cmd].opcode, 0); + gpi_write_reg(gpii, cmd_reg, cmd); + timeout = wait_for_completion_timeout(&gpii->cmd_completion, + msecs_to_jiffies(CMD_TIMEOUT_MS)); + if (!timeout) { + dev_err(gpii->gpi_dev->dev, "cmd: %s completion timeout:%u\n", + TO_GPI_CMD_STR(gpi_cmd), chid); + return -EIO; + } + + /* confirm new ch state is correct , if the cmd is a state change cmd */ + if (gpi_cmd_info[gpi_cmd].state == STATE_IGNORE) + return 0; + + if (IS_CHAN_CMD(gpi_cmd) && gchan->ch_state == gpi_cmd_info[gpi_cmd].state) + return 0; + + if (!IS_CHAN_CMD(gpi_cmd) && gpii->ev_state == gpi_cmd_info[gpi_cmd].state) + return 0; + + return -EIO; +} + +/* program transfer ring DB register */ +static inline void gpi_write_ch_db(struct gchan *gchan, + struct gpi_ring *ring, void *wp) +{ + struct gpii *gpii = gchan->gpii; + phys_addr_t p_wp; + + p_wp = to_physical(ring, wp); + gpi_write_reg(gpii, gchan->ch_cntxt_db_reg, p_wp); +} + +/* program event ring DB register */ +static inline void gpi_write_ev_db(struct gpii *gpii, + struct gpi_ring *ring, void *wp) +{ + phys_addr_t p_wp; + + p_wp = ring->phys_addr + (wp - ring->base); + gpi_write_reg(gpii, gpii->ev_cntxt_db_reg, p_wp); +} + +/* process transfer completion interrupt */ +static void gpi_process_ieob(struct gpii *gpii) +{ + gpi_write_reg(gpii, gpii->ieob_clr_reg, BIT(0)); + + gpi_config_interrupts(gpii, MASK_IEOB_SETTINGS, 0); + tasklet_hi_schedule(&gpii->ev_task); +} + +/* process channel control interrupt */ +static void gpi_process_ch_ctrl_irq(struct gpii *gpii) +{ + u32 gpii_id = gpii->gpii_id; + u32 offset = GPII_n_CNTXT_SRC_GPII_CH_IRQ_OFFS(gpii_id); + u32 ch_irq = gpi_read_reg(gpii, gpii->regs + offset); + struct gchan *gchan; + u32 chid, state; + + /* clear the status */ + offset = GPII_n_CNTXT_SRC_CH_IRQ_CLR_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, (u32)ch_irq); + + for (chid = 0; chid < MAX_CHANNELS_PER_GPII; chid++) { + if (!(BIT(chid) & 
ch_irq)) + continue; + + gchan = &gpii->gchan[chid]; + state = gpi_read_reg(gpii, gchan->ch_cntxt_base_reg + + CNTXT_0_CONFIG); + state = FIELD_GET(GPII_n_CH_k_CNTXT_0_CHSTATE, state); + + /* + * CH_CMD_DEALLOC cmd always successful. However cmd does + * not change hardware status. So overwriting software state + * to default state. + */ + if (gpii->gpi_cmd == GPI_CH_CMD_DE_ALLOC) + state = DEFAULT_CH_STATE; + gchan->ch_state = state; + + /* + * Triggering complete all if ch_state is not a stop in process. + * Stop in process is a transition state and we will wait for + * stop interrupt before notifying. + */ + if (gchan->ch_state != CH_STATE_STOP_IN_PROC) + complete_all(&gpii->cmd_completion); + } +} + +/* processing gpi general error interrupts */ +static void gpi_process_gen_err_irq(struct gpii *gpii) +{ + u32 gpii_id = gpii->gpii_id; + u32 offset = GPII_n_CNTXT_GPII_IRQ_STTS_OFFS(gpii_id); + u32 irq_stts = gpi_read_reg(gpii, gpii->regs + offset); + + /* clear the status */ + dev_dbg(gpii->gpi_dev->dev, "irq_stts:0x%x\n", irq_stts); + + /* Clear the register */ + offset = GPII_n_CNTXT_GPII_IRQ_CLR_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, irq_stts); +} + +/* processing gpi level error interrupts */ +static void gpi_process_glob_err_irq(struct gpii *gpii) +{ + u32 gpii_id = gpii->gpii_id; + u32 offset = GPII_n_CNTXT_GLOB_IRQ_STTS_OFFS(gpii_id); + u32 irq_stts = gpi_read_reg(gpii, gpii->regs + offset); + + offset = GPII_n_CNTXT_GLOB_IRQ_CLR_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, irq_stts); + + /* only error interrupt should be set */ + if (irq_stts & ~GPI_GLOB_IRQ_ERROR_INT_MSK) { + dev_err(gpii->gpi_dev->dev, "invalid error status:0x%x\n", irq_stts); + return; + } + + offset = GPII_n_ERROR_LOG_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, 0); +} + +/* gpii interrupt handler */ +static irqreturn_t gpi_handle_irq(int irq, void *data) +{ + struct gpii *gpii = data; + u32 gpii_id = gpii->gpii_id; + u32 type, offset; + unsigned long flags; + + read_lock_irqsave(&gpii->pm_lock, flags); + + /* + * States are out of sync to receive interrupt + * while software state is in DISABLE state, bailing out. + */ + if (!REG_ACCESS_VALID(gpii->pm_state)) { + dev_err(gpii->gpi_dev->dev, "receive interrupt while in %s state\n", + TO_GPI_PM_STR(gpii->pm_state)); + goto exit_irq; + } + + offset = GPII_n_CNTXT_TYPE_IRQ_OFFS(gpii->gpii_id); + type = gpi_read_reg(gpii, gpii->regs + offset); + + do { + /* global gpii error */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB) { + gpi_process_glob_err_irq(gpii); + type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB); + } + + /* transfer complete interrupt */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB) { + gpi_process_ieob(gpii); + type &= ~GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB; + } + + /* event control irq */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL) { + u32 ev_state; + u32 ev_ch_irq; + + dev_dbg(gpii->gpi_dev->dev, + "processing EV CTRL interrupt\n"); + offset = GPII_n_CNTXT_SRC_EV_CH_IRQ_OFFS(gpii_id); + ev_ch_irq = gpi_read_reg(gpii, gpii->regs + offset); + + offset = GPII_n_CNTXT_SRC_EV_CH_IRQ_CLR_OFFS + (gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, ev_ch_irq); + ev_state = gpi_read_reg(gpii, gpii->ev_cntxt_base_reg + + CNTXT_0_CONFIG); + ev_state = FIELD_GET(GPII_n_EV_k_CNTXT_0_CHSTATE, ev_state); + + /* + * CMD EV_CMD_DEALLOC is always successful. However + * cmd does not change hardware status. So overwriting + * software state to default state. 
+ */ + if (gpii->gpi_cmd == GPI_EV_CMD_DEALLOC) + ev_state = DEFAULT_EV_CH_STATE; + + gpii->ev_state = ev_state; + dev_dbg(gpii->gpi_dev->dev, "setting EV state to %s\n", + TO_GPI_EV_STATE_STR(gpii->ev_state)); + complete_all(&gpii->cmd_completion); + type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL); + } + + /* channel control irq */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL) { + dev_dbg(gpii->gpi_dev->dev, "process CH CTRL interrupts\n"); + gpi_process_ch_ctrl_irq(gpii); + type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL); + } + + if (type) { + dev_err(gpii->gpi_dev->dev, "Unhandled interrupt status:0x%x\n", type); + gpi_process_gen_err_irq(gpii); + goto exit_irq; + } + + offset = GPII_n_CNTXT_TYPE_IRQ_OFFS(gpii->gpii_id); + type = gpi_read_reg(gpii, gpii->regs + offset); + } while (type); + +exit_irq: + read_unlock_irqrestore(&gpii->pm_lock, flags); + + return IRQ_HANDLED; +} + +/* process DMA Immediate completion data events */ +static void gpi_process_imed_data_event(struct gchan *gchan, + struct immediate_data_event *imed_event) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ch_ring = &gchan->ch_ring; + void *tre = ch_ring->base + (ch_ring->el_size * imed_event->tre_index); + struct dmaengine_result result; + struct gpi_desc *gpi_desc; + struct virt_dma_desc *vd; + unsigned long flags; + u32 chid; + + /* + * If channel not active don't process event + */ + if (gchan->pm_state != ACTIVE_STATE) { + dev_err(gpii->gpi_dev->dev, "skipping processing event because ch @ %s state\n", + TO_GPI_PM_STR(gchan->pm_state)); + return; + } + + spin_lock_irqsave(&gchan->vc.lock, flags); + vd = vchan_next_desc(&gchan->vc); + if (!vd) { + struct gpi_ere *gpi_ere; + struct gpi_tre *gpi_tre; + + spin_unlock_irqrestore(&gchan->vc.lock, flags); + dev_dbg(gpii->gpi_dev->dev, "event without a pending descriptor!\n"); + gpi_ere = (struct gpi_ere *)imed_event; + dev_dbg(gpii->gpi_dev->dev, + "Event: %08x %08x %08x %08x\n", + gpi_ere->dword[0], gpi_ere->dword[1], + gpi_ere->dword[2], gpi_ere->dword[3]); + gpi_tre = tre; + dev_dbg(gpii->gpi_dev->dev, + "Pending TRE: %08x %08x %08x %08x\n", + gpi_tre->dword[0], gpi_tre->dword[1], + gpi_tre->dword[2], gpi_tre->dword[3]); + return; + } + gpi_desc = to_gpi_desc(vd); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + + /* + * RP pointed by Event is to last TRE processed, + * we need to update ring rp to tre + 1 + */ + tre += ch_ring->el_size; + if (tre >= (ch_ring->base + ch_ring->len)) + tre = ch_ring->base; + ch_ring->rp = tre; + + /* make sure rp updates are immediately visible to all cores */ + smp_wmb(); + + chid = imed_event->chid; + if (imed_event->code == MSM_GPI_TCE_EOT && gpii->ieob_set) { + if (chid == GPI_RX_CHAN) + goto gpi_free_desc; + else + return; + } + + if (imed_event->code == MSM_GPI_TCE_UNEXP_ERR) + result.result = DMA_TRANS_ABORTED; + else + result.result = DMA_TRANS_NOERROR; + result.residue = gpi_desc->len - imed_event->length; + + dma_cookie_complete(&vd->tx); + dmaengine_desc_get_callback_invoke(&vd->tx, &result); + +gpi_free_desc: + spin_lock_irqsave(&gchan->vc.lock, flags); + list_del(&vd->node); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + kfree(gpi_desc); + gpi_desc = NULL; +} + +/* processing transfer completion events */ +static void gpi_process_xfer_compl_event(struct gchan *gchan, + struct xfer_compl_event *compl_event) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ch_ring = &gchan->ch_ring; + void *ev_rp = to_virtual(ch_ring, compl_event->ptr); + struct virt_dma_desc *vd; + struct gpi_desc *gpi_desc; + struct 
dmaengine_result result; + unsigned long flags; + u32 chid; + + /* only process events on active channel */ + if (unlikely(gchan->pm_state != ACTIVE_STATE)) { + dev_err(gpii->gpi_dev->dev, "skipping processing event because ch @ %s state\n", + TO_GPI_PM_STR(gchan->pm_state)); + return; + } + + spin_lock_irqsave(&gchan->vc.lock, flags); + vd = vchan_next_desc(&gchan->vc); + if (!vd) { + struct gpi_ere *gpi_ere; + + spin_unlock_irqrestore(&gchan->vc.lock, flags); + dev_err(gpii->gpi_dev->dev, "Event without a pending descriptor!\n"); + gpi_ere = (struct gpi_ere *)compl_event; + dev_err(gpii->gpi_dev->dev, + "Event: %08x %08x %08x %08x\n", + gpi_ere->dword[0], gpi_ere->dword[1], + gpi_ere->dword[2], gpi_ere->dword[3]); + return; + } + + gpi_desc = to_gpi_desc(vd); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + + /* + * RP pointed by Event is to last TRE processed, + * we need to update ring rp to ev_rp + 1 + */ + ev_rp += ch_ring->el_size; + if (ev_rp >= (ch_ring->base + ch_ring->len)) + ev_rp = ch_ring->base; + ch_ring->rp = ev_rp; + + /* update must be visible to other cores */ + smp_wmb(); + + chid = compl_event->chid; + if (compl_event->code == MSM_GPI_TCE_EOT && gpii->ieob_set) { + if (chid == GPI_RX_CHAN) + goto gpi_free_desc; + else + return; + } + + if (compl_event->code == MSM_GPI_TCE_UNEXP_ERR) { + dev_err(gpii->gpi_dev->dev, "Error in Transaction\n"); + result.result = DMA_TRANS_ABORTED; + } else { + dev_dbg(gpii->gpi_dev->dev, "Transaction Success\n"); + result.result = DMA_TRANS_NOERROR; + } + result.residue = gpi_desc->len - compl_event->length; + dev_dbg(gpii->gpi_dev->dev, "Residue %d\n", result.residue); + + dma_cookie_complete(&vd->tx); + dmaengine_desc_get_callback_invoke(&vd->tx, &result); + +gpi_free_desc: + spin_lock_irqsave(&gchan->vc.lock, flags); + list_del(&vd->node); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + kfree(gpi_desc); + gpi_desc = NULL; +} + +/* process all events */ +static void gpi_process_events(struct gpii *gpii) +{ + struct gpi_ring *ev_ring = &gpii->ev_ring; + phys_addr_t cntxt_rp; + void *rp; + union gpi_event *gpi_event; + struct gchan *gchan; + u32 chid, type; + + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + rp = to_virtual(ev_ring, cntxt_rp); + + do { + while (rp != ev_ring->rp) { + gpi_event = ev_ring->rp; + chid = gpi_event->xfer_compl_event.chid; + type = gpi_event->xfer_compl_event.type; + + dev_dbg(gpii->gpi_dev->dev, + "Event: CHID:%u, type:%x %08x %08x %08x %08x\n", + chid, type, gpi_event->gpi_ere.dword[0], + gpi_event->gpi_ere.dword[1], gpi_event->gpi_ere.dword[2], + gpi_event->gpi_ere.dword[3]); + + switch (type) { + case XFER_COMPLETE_EV_TYPE: + gchan = &gpii->gchan[chid]; + gpi_process_xfer_compl_event(gchan, + &gpi_event->xfer_compl_event); + break; + case STALE_EV_TYPE: + dev_dbg(gpii->gpi_dev->dev, "stale event, not processing\n"); + break; + case IMMEDIATE_DATA_EV_TYPE: + gchan = &gpii->gchan[chid]; + gpi_process_imed_data_event(gchan, + &gpi_event->immediate_data_event); + break; + case QUP_NOTIF_EV_TYPE: + dev_dbg(gpii->gpi_dev->dev, "QUP_NOTIF_EV_TYPE\n"); + break; + default: + dev_dbg(gpii->gpi_dev->dev, + "not supported event type:0x%x\n", type); + } + gpi_ring_recycle_ev_element(ev_ring); + } + gpi_write_ev_db(gpii, ev_ring, ev_ring->wp); + + /* clear pending IEOB events */ + gpi_write_reg(gpii, gpii->ieob_clr_reg, BIT(0)); + + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + rp = to_virtual(ev_ring, cntxt_rp); + + } while (rp != ev_ring->rp); +} + +/* processing events using tasklet */ 
+static void gpi_ev_tasklet(unsigned long data) +{ + struct gpii *gpii = (struct gpii *)data; + + read_lock_bh(&gpii->pm_lock); + if (!REG_ACCESS_VALID(gpii->pm_state)) { + read_unlock_bh(&gpii->pm_lock); + dev_err(gpii->gpi_dev->dev, "not processing any events, pm_state:%s\n", + TO_GPI_PM_STR(gpii->pm_state)); + return; + } + + /* process the events */ + gpi_process_events(gpii); + + /* enable IEOB, switching back to interrupts */ + gpi_config_interrupts(gpii, MASK_IEOB_SETTINGS, 1); + read_unlock_bh(&gpii->pm_lock); +} + +/* marks all pending events for the channel as stale */ +static void gpi_mark_stale_events(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ev_ring = &gpii->ev_ring; + u32 cntxt_rp, local_rp; + void *ev_rp; + + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + + ev_rp = ev_ring->rp; + local_rp = (u32)to_physical(ev_ring, ev_rp); + while (local_rp != cntxt_rp) { + union gpi_event *gpi_event = ev_rp; + u32 chid = gpi_event->xfer_compl_event.chid; + + if (chid == gchan->chid) + gpi_event->xfer_compl_event.type = STALE_EV_TYPE; + ev_rp += ev_ring->el_size; + if (ev_rp >= (ev_ring->base + ev_ring->len)) + ev_rp = ev_ring->base; + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + local_rp = (u32)to_physical(ev_ring, ev_rp); + } +} + +/* reset sw state and issue channel reset or de-alloc */ +static int gpi_reset_chan(struct gchan *gchan, enum gpi_cmd gpi_cmd) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ch_ring = &gchan->ch_ring; + unsigned long flags; + LIST_HEAD(list); + int ret; + + ret = gpi_send_cmd(gpii, gchan, gpi_cmd); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(gpi_cmd), ret); + return ret; + } + + /* initialize the local ring ptrs */ + ch_ring->rp = ch_ring->base; + ch_ring->wp = ch_ring->base; + + /* visible to other cores */ + smp_wmb(); + + /* check event ring for any stale events */ + write_lock_irq(&gpii->pm_lock); + gpi_mark_stale_events(gchan); + + /* remove all async descriptors */ + spin_lock_irqsave(&gchan->vc.lock, flags); + vchan_get_all_descriptors(&gchan->vc, &list); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + write_unlock_irq(&gpii->pm_lock); + vchan_dma_desc_free_list(&gchan->vc, &list); + + return 0; +} + +static int gpi_start_chan(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + int ret; + + ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_START); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_CH_CMD_START), ret); + return ret; + } + + /* gpii CH is active now */ + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = ACTIVE_STATE; + write_unlock_irq(&gpii->pm_lock); + + return 0; +} + +static int gpi_stop_chan(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + int ret; + + ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_STOP); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_CH_CMD_STOP), ret); + return ret; + } + + return 0; +} + +/* allocate and configure the transfer channel */ +static int gpi_alloc_chan(struct gchan *chan, bool send_alloc_cmd) +{ + struct gpii *gpii = chan->gpii; + struct gpi_ring *ring = &chan->ch_ring; + int ret; + u32 id = gpii->gpii_id; + u32 chid = chan->chid; + u32 pair_chid = !chid; + + if (send_alloc_cmd) { + ret = gpi_send_cmd(gpii, chan, GPI_CH_CMD_ALLOCATE); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_CH_CMD_ALLOCATE), ret); + return ret; + } + } + + 
gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_0_CONFIG, + GPII_n_CH_k_CNTXT_0(ring->el_size, 0, chan->dir, GPI_CHTYPE_PROTO_GPI)); + gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_1_R_LENGTH, ring->len); + gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_2_RING_BASE_LSB, ring->phys_addr); + gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_3_RING_BASE_MSB, + upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, chan->ch_cntxt_db_reg + CNTXT_5_RING_RP_MSB - CNTXT_4_RING_RP_LSB, + upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_0_OFFS(id, chid), + GPII_n_CH_k_SCRATCH_0(pair_chid, chan->protocol, chan->seid)); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_1_OFFS(id, chid), 0); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_2_OFFS(id, chid), 0); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_3_OFFS(id, chid), 0); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_QOS_OFFS(id, chid), 1); + + /* flush all the writes */ + wmb(); + return 0; +} + +/* allocate and configure event ring */ +static int gpi_alloc_ev_chan(struct gpii *gpii) +{ + struct gpi_ring *ring = &gpii->ev_ring; + void __iomem *base = gpii->ev_cntxt_base_reg; + int ret; + + ret = gpi_send_cmd(gpii, NULL, GPI_EV_CMD_ALLOCATE); + if (ret) { + dev_err(gpii->gpi_dev->dev, "error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_EV_CMD_ALLOCATE), ret); + return ret; + } + + /* program event context */ + gpi_write_reg(gpii, base + CNTXT_0_CONFIG, + GPII_n_EV_k_CNTXT_0(ring->el_size, GPI_INTTYPE_IRQ, GPI_CHTYPE_GPI_EV)); + gpi_write_reg(gpii, base + CNTXT_1_R_LENGTH, ring->len); + gpi_write_reg(gpii, base + CNTXT_2_RING_BASE_LSB, lower_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, base + CNTXT_3_RING_BASE_MSB, upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, gpii->ev_cntxt_db_reg + CNTXT_5_RING_RP_MSB - CNTXT_4_RING_RP_LSB, + upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, base + CNTXT_8_RING_INT_MOD, 0); + gpi_write_reg(gpii, base + CNTXT_10_RING_MSI_LSB, 0); + gpi_write_reg(gpii, base + CNTXT_11_RING_MSI_MSB, 0); + gpi_write_reg(gpii, base + CNTXT_8_RING_INT_MOD, 0); + gpi_write_reg(gpii, base + CNTXT_12_RING_RP_UPDATE_LSB, 0); + gpi_write_reg(gpii, base + CNTXT_13_RING_RP_UPDATE_MSB, 0); + + /* add events to ring */ + ring->wp = (ring->base + ring->len - ring->el_size); + + /* flush all the writes */ + wmb(); + + /* gpii is active now */ + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = ACTIVE_STATE; + write_unlock_irq(&gpii->pm_lock); + gpi_write_ev_db(gpii, ring, ring->wp); + + return 0; +} + +/* calculate # of ERE/TRE available to queue */ +static int gpi_ring_num_elements_avail(const struct gpi_ring * const ring) +{ + int elements = 0; + + if (ring->wp < ring->rp) { + elements = ((ring->rp - ring->wp) / ring->el_size) - 1; + } else { + elements = (ring->rp - ring->base) / ring->el_size; + elements += ((ring->base + ring->len - ring->wp) / ring->el_size) - 1; + } + + return elements; +} + +static int gpi_ring_add_element(struct gpi_ring *ring, void **wp) +{ + if (gpi_ring_num_elements_avail(ring) <= 0) + return -ENOMEM; + + *wp = ring->wp; + ring->wp += ring->el_size; + if (ring->wp >= (ring->base + ring->len)) + ring->wp = ring->base; + + /* visible to other cores */ + smp_wmb(); + + return 0; +} + +static void gpi_ring_recycle_ev_element(struct gpi_ring *ring) +{ + /* Update the WP */ + ring->wp += ring->el_size; + if (ring->wp >= (ring->base + ring->len)) + ring->wp = ring->base; + + /* Update the RP */ + ring->rp += ring->el_size; + if 
(ring->rp >= (ring->base + ring->len)) + ring->rp = ring->base; + + /* visible to other cores */ + smp_wmb(); +} + +static void gpi_free_ring(struct gpi_ring *ring, + struct gpii *gpii) +{ + dma_free_coherent(gpii->gpi_dev->dev, ring->alloc_size, + ring->pre_aligned, ring->dma_handle); + memset(ring, 0, sizeof(*ring)); +} + +/* allocate memory for transfer and event rings */ +static int gpi_alloc_ring(struct gpi_ring *ring, u32 elements, + u32 el_size, struct gpii *gpii) +{ + u64 len = elements * el_size; + int bit; + + /* ring len must be power of 2 */ + bit = find_last_bit((unsigned long *)&len, 32); + if (((1 << bit) - 1) & len) + bit++; + len = 1 << bit; + ring->alloc_size = (len + (len - 1)); + dev_dbg(gpii->gpi_dev->dev, + "#el:%u el_size:%u len:%u actual_len:%llu alloc_size:%lu\n", + elements, el_size, (elements * el_size), len, + ring->alloc_size); + + ring->pre_aligned = dma_alloc_coherent(gpii->gpi_dev->dev, + ring->alloc_size, + &ring->dma_handle, GFP_KERNEL); + if (!ring->pre_aligned) { + dev_err(gpii->gpi_dev->dev, "could not alloc size:%lu mem for ring\n", + ring->alloc_size); + return -ENOMEM; + } + + /* align the physical mem */ + ring->phys_addr = (ring->dma_handle + (len - 1)) & ~(len - 1); + ring->base = ring->pre_aligned + (ring->phys_addr - ring->dma_handle); + ring->rp = ring->base; + ring->wp = ring->base; + ring->len = len; + ring->el_size = el_size; + ring->elements = ring->len / ring->el_size; + memset(ring->base, 0, ring->len); + ring->configured = true; + + /* update to other cores */ + smp_wmb(); + + dev_dbg(gpii->gpi_dev->dev, + "phy_pre:0x%0llx phy_alig:0x%0llx len:%u el_size:%u elements:%u\n", + ring->dma_handle, ring->phys_addr, ring->len, + ring->el_size, ring->elements); + + return 0; +} + +/* copy tre into transfer ring */ +static void gpi_queue_xfer(struct gpii *gpii, struct gchan *gchan, + struct gpi_tre *gpi_tre, void **wp) +{ + struct gpi_tre *ch_tre; + int ret; + + /* get next tre location we can copy */ + ret = gpi_ring_add_element(&gchan->ch_ring, (void **)&ch_tre); + if (unlikely(ret)) { + dev_err(gpii->gpi_dev->dev, "Error adding ring element to xfer ring\n"); + return; + } + + /* copy the tre info */ + memcpy(ch_tre, gpi_tre, sizeof(*ch_tre)); + *wp = ch_tre; +} + +/* reset and restart transfer channel */ +static int gpi_terminate_all(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int schid, echid, i; + int ret = 0; + + mutex_lock(&gpii->ctrl_lock); + + /* + * treat both channels as a group if its protocol is not UART + * STOP, RESET, or START needs to be in lockstep + */ + schid = (gchan->protocol == QCOM_GPI_UART) ? gchan->chid : 0; + echid = (gchan->protocol == QCOM_GPI_UART) ? 
schid + 1 : MAX_CHANNELS_PER_GPII; + + /* stop the channel */ + for (i = schid; i < echid; i++) { + gchan = &gpii->gchan[i]; + + /* disable ch state so no more TRE processing */ + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = PREPARE_TERMINATE; + write_unlock_irq(&gpii->pm_lock); + + /* send command to Stop the channel */ + ret = gpi_stop_chan(gchan); + } + + /* reset the channels (clears any pending tre) */ + for (i = schid; i < echid; i++) { + gchan = &gpii->gchan[i]; + + ret = gpi_reset_chan(gchan, GPI_CH_CMD_RESET); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error resetting channel ret:%d\n", ret); + goto terminate_exit; + } + + /* reprogram channel CNTXT */ + ret = gpi_alloc_chan(gchan, false); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error alloc_channel ret:%d\n", ret); + goto terminate_exit; + } + } + + /* restart the channels */ + for (i = schid; i < echid; i++) { + gchan = &gpii->gchan[i]; + + ret = gpi_start_chan(gchan); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error Starting Channel ret:%d\n", ret); + goto terminate_exit; + } + } + +terminate_exit: + mutex_unlock(&gpii->ctrl_lock); + return ret; +} + +/* pause dma transfer for all channels */ +static int gpi_pause(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int i, ret; + + mutex_lock(&gpii->ctrl_lock); + + /* + * pause/resume are per gpii not per channel, so + * client needs to call pause only once + */ + if (gpii->pm_state == PAUSE_STATE) { + dev_dbg(gpii->gpi_dev->dev, "channel is already paused\n"); + mutex_unlock(&gpii->ctrl_lock); + return 0; + } + + /* send stop command to stop the channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_stop_chan(&gpii->gchan[i]); + if (ret) { + mutex_unlock(&gpii->ctrl_lock); + return ret; + } + } + + disable_irq(gpii->irq); + + /* Wait for threads to complete out */ + tasklet_kill(&gpii->ev_task); + + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = PAUSE_STATE; + write_unlock_irq(&gpii->pm_lock); + mutex_unlock(&gpii->ctrl_lock); + + return 0; +} + +/* resume dma transfer */ +static int gpi_resume(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int i, ret; + + mutex_lock(&gpii->ctrl_lock); + if (gpii->pm_state == ACTIVE_STATE) { + dev_dbg(gpii->gpi_dev->dev, "channel is already active\n"); + mutex_unlock(&gpii->ctrl_lock); + return 0; + } + + enable_irq(gpii->irq); + + /* send start command to start the channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_send_cmd(gpii, &gpii->gchan[i], GPI_CH_CMD_START); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error starting chan, ret:%d\n", ret); + mutex_unlock(&gpii->ctrl_lock); + return ret; + } + } + + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = ACTIVE_STATE; + write_unlock_irq(&gpii->pm_lock); + mutex_unlock(&gpii->ctrl_lock); + + return 0; +} + +static void gpi_desc_free(struct virt_dma_desc *vd) +{ + struct gpi_desc *gpi_desc = to_gpi_desc(vd); + + kfree(gpi_desc); + gpi_desc = NULL; +} + +static int +gpi_peripheral_config(struct dma_chan *chan, struct dma_slave_config *config) +{ + struct gchan *gchan = to_gchan(chan); + + if (!config->peripheral_config) + return -EINVAL; + + gchan->config = krealloc(gchan->config, config->peripheral_size, GFP_NOWAIT); + if (!gchan->config) + return -ENOMEM; + + memcpy(gchan->config, config->peripheral_config, config->peripheral_size); + + return 0; +} + +static int gpi_create_i2c_tre(struct gchan *chan, struct gpi_desc *desc, + struct 
scatterlist *sgl, enum dma_transfer_direction direction) +{ + struct gpi_i2c_config *i2c = chan->config; + struct device *dev = chan->gpii->gpi_dev->dev; + unsigned int tre_idx = 0; + dma_addr_t address; + struct gpi_tre *tre; + unsigned int i; + + /* first create config tre if applicable */ + if (i2c->set_config) { + tre = &desc->tre[tre_idx]; + tre_idx++; + + tre->dword[0] = u32_encode_bits(i2c->low_count, TRE_I2C_C0_TLOW); + tre->dword[0] |= u32_encode_bits(i2c->high_count, TRE_I2C_C0_THIGH); + tre->dword[0] |= u32_encode_bits(i2c->cycle_count, TRE_I2C_C0_TCYL); + tre->dword[0] |= u32_encode_bits(i2c->pack_enable, TRE_I2C_C0_TX_PACK); + tre->dword[0] |= u32_encode_bits(i2c->pack_enable, TRE_I2C_C0_RX_PACK); + + tre->dword[1] = 0; + + tre->dword[2] = u32_encode_bits(i2c->clk_div, TRE_C0_CLK_DIV); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_CONFIG0, TRE_FLAGS_TYPE); + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + /* create the GO tre for Tx */ + if (i2c->op == I2C_WRITE) { + tre = &desc->tre[tre_idx]; + tre_idx++; + + if (i2c->multi_msg) + tre->dword[0] = u32_encode_bits(I2C_READ, TRE_I2C_GO_CMD); + else + tre->dword[0] = u32_encode_bits(i2c->op, TRE_I2C_GO_CMD); + + tre->dword[0] |= u32_encode_bits(i2c->addr, TRE_I2C_GO_ADDR); + tre->dword[0] |= u32_encode_bits(i2c->stretch, TRE_I2C_GO_STRETCH); + + tre->dword[1] = 0; + tre->dword[2] = u32_encode_bits(i2c->rx_len, TRE_RX_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_GO, TRE_FLAGS_TYPE); + + if (i2c->multi_msg) + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_LINK); + else + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + if (i2c->op == I2C_READ || i2c->multi_msg == false) { + /* create the DMA TRE */ + tre = &desc->tre[tre_idx]; + tre_idx++; + + address = sg_dma_address(sgl); + tre->dword[0] = lower_32_bits(address); + tre->dword[1] = upper_32_bits(address); + + tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE); + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT); + }; + + for (i = 0; i < tre_idx; i++) + dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0], + desc->tre[i].dword[1], desc->tre[i].dword[2], desc->tre[i].dword[3]); + + return tre_idx; +} + +static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc, + struct scatterlist *sgl, enum dma_transfer_direction direction) +{ + struct gpi_spi_config *spi = chan->config; + struct device *dev = chan->gpii->gpi_dev->dev; + unsigned int tre_idx = 0; + dma_addr_t address; + struct gpi_tre *tre; + unsigned int i; + + /* first create config tre if applicable */ + if (direction == DMA_MEM_TO_DEV && spi->set_config) { + tre = &desc->tre[tre_idx]; + tre_idx++; + + tre->dword[0] = u32_encode_bits(spi->word_len, TRE_SPI_C0_WORD_SZ); + tre->dword[0] |= u32_encode_bits(spi->loopback_en, TRE_SPI_C0_LOOPBACK); + tre->dword[0] |= u32_encode_bits(spi->clock_pol_high, TRE_SPI_C0_CPOL); + tre->dword[0] |= u32_encode_bits(spi->data_pol_high, TRE_SPI_C0_CPHA); + tre->dword[0] |= u32_encode_bits(spi->pack_en, TRE_SPI_C0_TX_PACK); + tre->dword[0] |= u32_encode_bits(spi->pack_en, TRE_SPI_C0_RX_PACK); + + tre->dword[1] = 0; + + tre->dword[2] = u32_encode_bits(spi->clk_div, TRE_C0_CLK_DIV); + tre->dword[2] |= u32_encode_bits(spi->clk_src, TRE_C0_CLK_SRC); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_CONFIG0, TRE_FLAGS_TYPE); + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + /* create the GO tre for Tx */ + if (direction == DMA_MEM_TO_DEV) { + tre = 
&desc->tre[tre_idx]; + tre_idx++; + + tre->dword[0] = u32_encode_bits(spi->fragmentation, TRE_SPI_GO_FRAG); + tre->dword[0] |= u32_encode_bits(spi->cs, TRE_SPI_GO_CS); + tre->dword[0] |= u32_encode_bits(spi->cmd, TRE_SPI_GO_CMD); + + tre->dword[1] = 0; + + tre->dword[2] = u32_encode_bits(spi->rx_len, TRE_RX_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_GO, TRE_FLAGS_TYPE); + if (spi->cmd == SPI_RX) + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOB); + else + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + /* create the dma tre */ + tre = &desc->tre[tre_idx]; + tre_idx++; + + address = sg_dma_address(sgl); + tre->dword[0] = lower_32_bits(address); + tre->dword[1] = upper_32_bits(address); + + tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE); + if (direction == DMA_MEM_TO_DEV) + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT); + + for (i = 0; i < tre_idx; i++) + dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0], + desc->tre[i].dword[1], desc->tre[i].dword[2], desc->tre[i].dword[3]); + + return tre_idx; +} + +/* copy tre into transfer ring */ +static struct dma_async_tx_descriptor * +gpi_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + unsigned int sg_len, enum dma_transfer_direction direction, + unsigned long flags, void *context) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + struct device *dev = gpii->gpi_dev->dev; + struct gpi_ring *ch_ring = &gchan->ch_ring; + struct gpi_desc *gpi_desc; + u32 nr, nr_tre = 0; + u8 set_config; + int i; + + gpii->ieob_set = false; + if (!is_slave_direction(direction)) { + dev_err(gpii->gpi_dev->dev, "invalid dma direction: %d\n", direction); + return NULL; + } + + if (sg_len > 1) { + dev_err(dev, "Multi sg sent, we support only one atm: %d\n", sg_len); + return NULL; + } + + nr_tre = 3; + set_config = *(u32 *)gchan->config; + if (!set_config) + nr_tre = 2; + if (direction == DMA_DEV_TO_MEM) /* rx */ + nr_tre = 1; + + /* calculate # of elements required & available */ + nr = gpi_ring_num_elements_avail(ch_ring); + if (nr < nr_tre) { + dev_err(dev, "not enough space in ring, avail:%u required:%u\n", nr, nr_tre); + return NULL; + } + + gpi_desc = kzalloc(sizeof(*gpi_desc), GFP_NOWAIT); + if (!gpi_desc) + return NULL; + + /* create TREs for xfer */ + if (gchan->protocol == QCOM_GPI_SPI) { + i = gpi_create_spi_tre(gchan, gpi_desc, sgl, direction); + } else if (gchan->protocol == QCOM_GPI_I2C) { + i = gpi_create_i2c_tre(gchan, gpi_desc, sgl, direction); + } else { + dev_err(dev, "invalid peripheral: %d\n", gchan->protocol); + kfree(gpi_desc); + return NULL; + } + + /* set up the descriptor */ + gpi_desc->gchan = gchan; + gpi_desc->len = sg_dma_len(sgl); + gpi_desc->num_tre = i; + + return vchan_tx_prep(&gchan->vc, &gpi_desc->vd, flags); +} + +/* rings transfer ring db to begin transfer */ +static void gpi_issue_pending(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + unsigned long flags, pm_lock_flags; + struct virt_dma_desc *vd = NULL; + struct gpi_desc *gpi_desc; + struct gpi_ring *ch_ring = &gchan->ch_ring; + void *tre, *wp = NULL; + int i; + + read_lock_irqsave(&gpii->pm_lock, pm_lock_flags); + + /* move all submitted descriptors to the issued list */ + spin_lock_irqsave(&gchan->vc.lock, flags); + if (vchan_issue_pending(&gchan->vc)) + vd = list_last_entry(&gchan->vc.desc_issued, + struct virt_dma_desc, node); + 
spin_unlock_irqrestore(&gchan->vc.lock, flags); + + /* nothing to do, list is empty */ + if (!vd) { + read_unlock_irqrestore(&gpii->pm_lock, pm_lock_flags); + return; + } + + gpi_desc = to_gpi_desc(vd); + for (i = 0; i < gpi_desc->num_tre; i++) { + tre = &gpi_desc->tre[i]; + gpi_queue_xfer(gpii, gchan, tre, &wp); + } + + gpi_desc->db = ch_ring->wp; + gpi_write_ch_db(gchan, &gchan->ch_ring, gpi_desc->db); + read_unlock_irqrestore(&gpii->pm_lock, pm_lock_flags); +} + +static int gpi_ch_init(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + const int ev_factor = gpii->gpi_dev->ev_factor; + u32 elements; + int i = 0, ret = 0; + + gchan->pm_state = CONFIG_STATE; + + /* check if both channels are configured before continuing */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) + if (gpii->gchan[i].pm_state != CONFIG_STATE) + goto exit_gpi_init; + + /* protocol must be same for both channels */ + if (gpii->gchan[0].protocol != gpii->gchan[1].protocol) { + dev_err(gpii->gpi_dev->dev, "protocol did not match protocol %u != %u\n", + gpii->gchan[0].protocol, gpii->gchan[1].protocol); + ret = -EINVAL; + goto exit_gpi_init; + } + + /* allocate memory for event ring */ + elements = CHAN_TRES << ev_factor; + ret = gpi_alloc_ring(&gpii->ev_ring, elements, + sizeof(union gpi_event), gpii); + if (ret) + goto exit_gpi_init; + + /* configure interrupts */ + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = PREPARE_HARDWARE; + write_unlock_irq(&gpii->pm_lock); + ret = gpi_config_interrupts(gpii, DEFAULT_IRQ_SETTINGS, 0); + if (ret) { + dev_err(gpii->gpi_dev->dev, "error config. interrupts, ret:%d\n", ret); + goto error_config_int; + } + + /* allocate event rings */ + ret = gpi_alloc_ev_chan(gpii); + if (ret) { + dev_err(gpii->gpi_dev->dev, "error alloc_ev_chan:%d\n", ret); + goto error_alloc_ev_ring; + } + + /* Allocate all channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_alloc_chan(&gpii->gchan[i], true); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error allocating chan:%d\n", ret); + goto error_alloc_chan; + } + } + + /* start channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_start_chan(&gpii->gchan[i]); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error start chan:%d\n", ret); + goto error_start_chan; + } + } + return ret; + +error_start_chan: + for (i = i - 1; i >= 0; i--) { + gpi_stop_chan(&gpii->gchan[i]); + gpi_send_cmd(gpii, gchan, GPI_CH_CMD_RESET); + } + i = 2; +error_alloc_chan: + for (i = i - 1; i >= 0; i--) + gpi_reset_chan(gchan, GPI_CH_CMD_DE_ALLOC); +error_alloc_ev_ring: + gpi_disable_interrupts(gpii); +error_config_int: + gpi_free_ring(&gpii->ev_ring, gpii); +exit_gpi_init: + /* gpii->ctrl_lock is held and released by the caller */ + return ret; +} + +/* release all channel resources */ +static void gpi_free_chan_resources(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + enum gpi_pm_state cur_state; + int ret, i; + + mutex_lock(&gpii->ctrl_lock); + + cur_state = gchan->pm_state; + + /* disable ch state so no more TRE processing for this channel */ + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = PREPARE_TERMINATE; + write_unlock_irq(&gpii->pm_lock); + + /* attempt to do graceful hardware shutdown */ + if (cur_state == ACTIVE_STATE) { + gpi_stop_chan(gchan); + + ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_RESET); + if (ret) + dev_err(gpii->gpi_dev->dev, "error resetting channel:%d\n", ret); + + gpi_reset_chan(gchan, GPI_CH_CMD_DE_ALLOC); + } + + /* free all allocated memory */ + gpi_free_ring(&gchan->ch_ring, 
gpii); + vchan_free_chan_resources(&gchan->vc); + kfree(gchan->config); + + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = DISABLE_STATE; + write_unlock_irq(&gpii->pm_lock); + + /* if other rings are still active, exit */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) + if (gpii->gchan[i].ch_ring.configured) + goto exit_free; + + /* deallocate EV Ring */ + cur_state = gpii->pm_state; + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = PREPARE_TERMINATE; + write_unlock_irq(&gpii->pm_lock); + + /* wait for threads to complete out */ + tasklet_kill(&gpii->ev_task); + + /* send command to de-allocate event ring */ + if (cur_state == ACTIVE_STATE) + gpi_send_cmd(gpii, NULL, GPI_EV_CMD_DEALLOC); + + gpi_free_ring(&gpii->ev_ring, gpii); + + /* disable interrupts */ + if (cur_state == ACTIVE_STATE) + gpi_disable_interrupts(gpii); + + /* set final state to disable */ + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = DISABLE_STATE; + write_unlock_irq(&gpii->pm_lock); + +exit_free: + mutex_unlock(&gpii->ctrl_lock); +} + +/* allocate channel resources */ +static int gpi_alloc_chan_resources(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int ret; + + mutex_lock(&gpii->ctrl_lock); + + /* allocate memory for transfer ring */ + ret = gpi_alloc_ring(&gchan->ch_ring, CHAN_TRES, + sizeof(struct gpi_tre), gpii); + if (ret) + goto xfer_alloc_err; + + ret = gpi_ch_init(gchan); + + mutex_unlock(&gpii->ctrl_lock); + + return ret; +xfer_alloc_err: + mutex_unlock(&gpii->ctrl_lock); + + return ret; +} + +static int gpi_find_avail_gpii(struct gpi_dev *gpi_dev, u32 seid) +{ + struct gchan *tx_chan, *rx_chan; + unsigned int gpii; + + /* check if same seid is already configured for another chid */ + for (gpii = 0; gpii < gpi_dev->max_gpii; gpii++) { + if (!((1 << gpii) & gpi_dev->gpii_mask)) + continue; + + tx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_TX_CHAN]; + rx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_RX_CHAN]; + + if (rx_chan->vc.chan.client_count && rx_chan->seid == seid) + return gpii; + if (tx_chan->vc.chan.client_count && tx_chan->seid == seid) + return gpii; + } + + /* no channels configured with same seid, return next avail gpii */ + for (gpii = 0; gpii < gpi_dev->max_gpii; gpii++) { + if (!((1 << gpii) & gpi_dev->gpii_mask)) + continue; + + tx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_TX_CHAN]; + rx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_RX_CHAN]; + + /* check if gpii is configured */ + if (tx_chan->vc.chan.client_count || + rx_chan->vc.chan.client_count) + continue; + + /* found a free gpii */ + return gpii; + } + + /* no gpii instance available to use */ + return -EIO; +} + +/* gpi_of_dma_xlate: open client requested channel */ +static struct dma_chan *gpi_of_dma_xlate(struct of_phandle_args *args, + struct of_dma *of_dma) +{ + struct gpi_dev *gpi_dev = (struct gpi_dev *)of_dma->of_dma_data; + u32 seid, chid; + int gpii; + struct gchan *gchan; + + if (args->args_count < 3) { + dev_err(gpi_dev->dev, "gpii requires minimum 3 args, client passed:%d args\n", + args->args_count); + return NULL; + } + + chid = args->args[0]; + if (chid >= MAX_CHANNELS_PER_GPII) { + dev_err(gpi_dev->dev, "gpii channel:%d not valid\n", chid); + return NULL; + } + + seid = args->args[1]; + + /* find next available gpii to use */ + gpii = gpi_find_avail_gpii(gpi_dev, seid); + if (gpii < 0) { + dev_err(gpi_dev->dev, "no available gpii instances\n"); + return NULL; + } + + gchan = &gpi_dev->gpiis[gpii].gchan[chid]; + if (gchan->vc.chan.client_count) { + dev_err(gpi_dev->dev, 
"gpii:%d chid:%d seid:%d already configured\n", + gpii, chid, gchan->seid); + return NULL; + } + + gchan->seid = seid; + gchan->protocol = args->args[2]; + + return dma_get_slave_channel(&gchan->vc.chan); +} + +static int gpi_probe(struct platform_device *pdev) +{ + struct gpi_dev *gpi_dev; + unsigned int i; + int ret; + + gpi_dev = devm_kzalloc(&pdev->dev, sizeof(*gpi_dev), GFP_KERNEL); + if (!gpi_dev) + return -ENOMEM; + + gpi_dev->dev = &pdev->dev; + gpi_dev->res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + gpi_dev->regs = devm_ioremap_resource(gpi_dev->dev, gpi_dev->res); + if (IS_ERR(gpi_dev->regs)) + return PTR_ERR(gpi_dev->regs); + gpi_dev->ee_base = gpi_dev->regs; + + ret = of_property_read_u32(gpi_dev->dev->of_node, "dma-channels", + &gpi_dev->max_gpii); + if (ret) { + dev_err(gpi_dev->dev, "missing 'max-no-gpii' DT node\n"); + return ret; + } + + ret = of_property_read_u32(gpi_dev->dev->of_node, "dma-channel-mask", + &gpi_dev->gpii_mask); + if (ret) { + dev_err(gpi_dev->dev, "missing 'gpii-mask' DT node\n"); + return ret; + } + + gpi_dev->ev_factor = EV_FACTOR; + + ret = dma_set_mask(gpi_dev->dev, DMA_BIT_MASK(64)); + if (ret) { + dev_err(gpi_dev->dev, "Error setting dma_mask to 64, ret:%d\n", ret); + return ret; + } + + gpi_dev->gpiis = devm_kzalloc(gpi_dev->dev, sizeof(*gpi_dev->gpiis) * + gpi_dev->max_gpii, GFP_KERNEL); + if (!gpi_dev->gpiis) + return -ENOMEM; + + /* setup all the supported gpii */ + INIT_LIST_HEAD(&gpi_dev->dma_device.channels); + for (i = 0; i < gpi_dev->max_gpii; i++) { + struct gpii *gpii = &gpi_dev->gpiis[i]; + int chan; + + if (!((1 << i) & gpi_dev->gpii_mask)) + continue; + + /* set up ev cntxt register map */ + gpii->ev_cntxt_base_reg = gpi_dev->ee_base + GPII_n_EV_CH_k_CNTXT_0_OFFS(i, 0); + gpii->ev_cntxt_db_reg = gpi_dev->ee_base + GPII_n_EV_CH_k_DOORBELL_0_OFFS(i, 0); + gpii->ev_ring_rp_lsb_reg = gpii->ev_cntxt_base_reg + CNTXT_4_RING_RP_LSB; + gpii->ev_cmd_reg = gpi_dev->ee_base + GPII_n_EV_CH_CMD_OFFS(i); + gpii->ieob_clr_reg = gpi_dev->ee_base + GPII_n_CNTXT_SRC_IEOB_IRQ_CLR_OFFS(i); + + /* set up irq */ + ret = platform_get_irq(pdev, i); + if (ret < 0) { + dev_err(gpi_dev->dev, "platform_get_irq failed for %d:%d\n", i, ret); + return ret; + } + gpii->irq = ret; + + /* set up channel specific register info */ + for (chan = 0; chan < MAX_CHANNELS_PER_GPII; chan++) { + struct gchan *gchan = &gpii->gchan[chan]; + + /* set up ch cntxt register map */ + gchan->ch_cntxt_base_reg = gpi_dev->ee_base + + GPII_n_CH_k_CNTXT_0_OFFS(i, chan); + gchan->ch_cntxt_db_reg = gpi_dev->ee_base + + GPII_n_CH_k_DOORBELL_0_OFFS(i, chan); + gchan->ch_cmd_reg = gpi_dev->ee_base + GPII_n_CH_CMD_OFFS(i); + + /* vchan setup */ + vchan_init(&gchan->vc, &gpi_dev->dma_device); + gchan->vc.desc_free = gpi_desc_free; + gchan->chid = chan; + gchan->gpii = gpii; + gchan->dir = GPII_CHAN_DIR[chan]; + } + mutex_init(&gpii->ctrl_lock); + rwlock_init(&gpii->pm_lock); + tasklet_init(&gpii->ev_task, gpi_ev_tasklet, + (unsigned long)gpii); + init_completion(&gpii->cmd_completion); + gpii->gpii_id = i; + gpii->regs = gpi_dev->ee_base; + gpii->gpi_dev = gpi_dev; + } + + platform_set_drvdata(pdev, gpi_dev); + + /* clear and Set capabilities */ + dma_cap_zero(gpi_dev->dma_device.cap_mask); + dma_cap_set(DMA_SLAVE, gpi_dev->dma_device.cap_mask); + + /* configure dmaengine apis */ + gpi_dev->dma_device.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); + gpi_dev->dma_device.residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR; + gpi_dev->dma_device.src_addr_widths = 
DMA_SLAVE_BUSWIDTH_8_BYTES; + gpi_dev->dma_device.dst_addr_widths = DMA_SLAVE_BUSWIDTH_8_BYTES; + gpi_dev->dma_device.device_alloc_chan_resources = gpi_alloc_chan_resources; + gpi_dev->dma_device.device_free_chan_resources = gpi_free_chan_resources; + gpi_dev->dma_device.device_tx_status = dma_cookie_status; + gpi_dev->dma_device.device_issue_pending = gpi_issue_pending; + gpi_dev->dma_device.device_prep_slave_sg = gpi_prep_slave_sg; + gpi_dev->dma_device.device_config = gpi_peripheral_config; + gpi_dev->dma_device.device_terminate_all = gpi_terminate_all; + gpi_dev->dma_device.dev = gpi_dev->dev; + gpi_dev->dma_device.device_pause = gpi_pause; + gpi_dev->dma_device.device_resume = gpi_resume; + + /* register with dmaengine framework */ + ret = dma_async_device_register(&gpi_dev->dma_device); + if (ret) { + dev_err(gpi_dev->dev, "async_device_register failed ret:%d", ret); + return ret; + } + + ret = of_dma_controller_register(gpi_dev->dev->of_node, + gpi_of_dma_xlate, gpi_dev); + if (ret) { + dev_err(gpi_dev->dev, "of_dma_controller_reg failed ret:%d", ret); + return ret; + } + + return ret; +} + +static const struct of_device_id gpi_of_match[] = { + { .compatible = "qcom,sdm845-gpi-dma" }, + { }, +}; +MODULE_DEVICE_TABLE(of, gpi_of_match); + +static struct platform_driver gpi_driver = { + .probe = gpi_probe, + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = gpi_of_match, + }, +}; + +static int __init gpi_init(void) +{ + return platform_driver_register(&gpi_driver); +} +subsys_initcall(gpi_init) + +MODULE_DESCRIPTION("QCOM GPI DMA engine driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h new file mode 100644 index 000000000000..f46dc3372f11 --- /dev/null +++ b/include/linux/dma/qcom-gpi-dma.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Linaro Limited + */ + +#ifndef QCOM_GPI_DMA_H +#define QCOM_GPI_DMA_H + +/** + * enum spi_transfer_cmd - spi transfer commands + */ +enum spi_transfer_cmd { + SPI_TX = 1, + SPI_RX, + SPI_DUPLEX, +}; + +/** + * struct gpi_spi_config - spi config for peripheral + * + * @loopback_en: spi loopback enable when set + * @clock_pol_high: clock polarity + * @data_pol_high: data polarity + * @pack_en: process tx/rx buffers as packed + * @word_len: spi word length + * @clk_div: source clock divider + * @clk_src: serial clock + * @cmd: spi cmd + * @fragmentation: keep CS asserted at end of sequence + * @cs: chip select toggle + * @set_config: set peripheral config + * @rx_len: receive length for buffer + */ +struct gpi_spi_config { + u8 set_config; + u8 loopback_en; + u8 clock_pol_high; + u8 data_pol_high; + u8 pack_en; + u8 word_len; + u8 fragmentation; + u8 cs; + u32 clk_div; + u32 clk_src; + enum spi_transfer_cmd cmd; + u32 rx_len; +}; + +enum i2c_op { + I2C_WRITE = 1, + I2C_READ, +}; + +/** + * struct gpi_i2c_config - i2c config for peripheral + * + * @pack_enable: process tx/rx buffers as packed + * @cycle_count: clock cycles to be sent + * @high_count: high period of clock + * @low_count: low period of clock + * @clk_div: source clock divider + * @addr: i2c bus address + * @stretch: stretch the clock at eot + * @set_config: set peripheral config + * @rx_len: receive length for buffer + * @op: i2c cmd + * @multi_msg: is part of multi i2c r-w msgs + */ +struct gpi_i2c_config { + u8 set_config; + u8 pack_enable; + u8 cycle_count; + u8 high_count; + u8 low_count; + u8 addr; + u8 stretch; + u16 clk_div; + u32 rx_len; + enum i2c_op op; + bool multi_msg; 
+}; + +#endif /* QCOM_GPI_DMA_H */ -- cgit From 36ccdf85829a7dd6936dba5d02fa50138471f0d3 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 23 Nov 2020 18:56:00 +0100 Subject: net, xsk: Avoid taking multiple skbuff references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 642e450b6b59 ("xsk: Do not discard packet when NETDEV_TX_BUSY") addressed the problem that packets were discarded from the Tx AF_XDP ring, when the driver returned NETDEV_TX_BUSY. Part of the fix was bumping the skbuff reference count, so that the buffer would not be freed by dev_direct_xmit(). A reference count larger than one means that the skbuff is "shared", which is not the case. If the "shared" skbuff is sent to the generic XDP receive path, netif_receive_generic_xdp(), and pskb_expand_head() is entered the BUG_ON(skb_shared(skb)) will trigger. This patch adds a variant to dev_direct_xmit(), __dev_direct_xmit(), where a user can select the skbuff free policy. This allows AF_XDP to avoid bumping the reference count, but still keep the NETDEV_TX_BUSY behavior. Fixes: 642e450b6b59 ("xsk: Do not discard packet when NETDEV_TX_BUSY") Reported-by: Yonghong Song Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201123175600.146255-1-bjorn.topel@gmail.com --- include/linux/netdevice.h | 14 +++++++++++++- net/core/dev.c | 8 ++------ net/xdp/xsk.c | 8 +------- 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 964b494b0e8d..76775abf259d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2813,9 +2813,21 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); + int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); -int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); + +static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ + int ret; + + ret = __dev_direct_xmit(skb, queue_id); + if (!dev_xmit_complete(ret)) + kfree_skb(skb); + return ret; +} + int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); diff --git a/net/core/dev.c b/net/core/dev.c index 82dc6b48e45f..8588ade790cb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4180,7 +4180,7 @@ int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) } EXPORT_SYMBOL(dev_queue_xmit_accel); -int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; @@ -4210,17 +4210,13 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) dev_xmit_recursion_dec(); local_bh_enable(); - - if (!dev_xmit_complete(ret)) - kfree_skb(skb); - return ret; drop: atomic_long_inc(&dev->tx_dropped); kfree_skb_list(skb); return NET_XMIT_DROP; } -EXPORT_SYMBOL(dev_direct_xmit); +EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5a6cdf7b320d..b7b039bd9d03 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -411,11 +411,7 @@ static 
int xsk_generic_xmit(struct sock *sk) skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; skb->destructor = xsk_destruct_skb; - /* Hinder dev_direct_xmit from freeing the packet and - * therefore completing it in the destructor - */ - refcount_inc(&skb->users); - err = dev_direct_xmit(skb, xs->queue_id); + err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ skb->destructor = sock_wfree; @@ -429,12 +425,10 @@ static int xsk_generic_xmit(struct sock *sk) /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ - kfree_skb(skb); err = -EBUSY; goto out; } - consume_skb(skb); sent_frame = true; } -- cgit From 159e1de201b6fca10bfec50405a3b53a561096a8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 17 Nov 2020 23:56:05 -0800 Subject: fscrypt: add fscrypt_is_nokey_name() It's possible to create a duplicate filename in an encrypted directory by creating a file concurrently with adding the encryption key. Specifically, sys_open(O_CREAT) (or sys_mkdir(), sys_mknod(), or sys_symlink()) can lookup the target filename while the directory's encryption key hasn't been added yet, resulting in a negative no-key dentry. The VFS then calls ->create() (or ->mkdir(), ->mknod(), or ->symlink()) because the dentry is negative. Normally, ->create() would return -ENOKEY due to the directory's key being unavailable. However, if the key was added between the dentry lookup and ->create(), then the filesystem will go ahead and try to create the file. If the target filename happens to already exist as a normal name (not a no-key name), a duplicate filename may be added to the directory. In order to fix this, we need to fix the filesystems to prevent ->create(), ->mkdir(), ->mknod(), and ->symlink() on no-key names. (->rename() and ->link() need it too, but those are already handled correctly by fscrypt_prepare_rename() and fscrypt_prepare_link().) In preparation for this, add a helper function fscrypt_is_nokey_name() that filesystems can use to do this check. Use this helper function for the existing checks that fs/crypto/ does for rename and link. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201118075609.120337-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/hooks.c | 5 +++-- include/linux/fscrypt.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 20b0df47fe6a..061418be4b08 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -61,7 +61,7 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, return err; /* ... in case we looked up no-key name before key was added */ - if (dentry->d_flags & DCACHE_NOKEY_NAME) + if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; if (!fscrypt_has_permitted_context(dir, inode)) @@ -86,7 +86,8 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, return err; /* ... 
in case we looked up no-key name(s) before key was added */ - if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME) + if (fscrypt_is_nokey_name(old_dentry) || + fscrypt_is_nokey_name(new_dentry)) return -ENOKEY; if (old_dir != new_dir) { diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index a8f7a43f031b..8e1d31c959bf 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -111,6 +111,35 @@ static inline void fscrypt_handle_d_move(struct dentry *dentry) dentry->d_flags &= ~DCACHE_NOKEY_NAME; } +/** + * fscrypt_is_nokey_name() - test whether a dentry is a no-key name + * @dentry: the dentry to check + * + * This returns true if the dentry is a no-key dentry. A no-key dentry is a + * dentry that was created in an encrypted directory that hasn't had its + * encryption key added yet. Such dentries may be either positive or negative. + * + * When a filesystem is asked to create a new filename in an encrypted directory + * and the new filename's dentry is a no-key dentry, it must fail the operation + * with ENOKEY. This includes ->create(), ->mkdir(), ->mknod(), ->symlink(), + * ->rename(), and ->link(). (However, ->rename() and ->link() are already + * handled by fscrypt_prepare_rename() and fscrypt_prepare_link().) + * + * This is necessary because creating a filename requires the directory's + * encryption key, but just checking for the key on the directory inode during + * the final filesystem operation doesn't guarantee that the key was available + * during the preceding dentry lookup. And the key must have already been + * available during the dentry lookup in order for it to have been checked + * whether the filename already exists in the directory and for the new file's + * dentry not to be invalidated due to it incorrectly having the no-key flag. + * + * Return: %true if the dentry is a no-key name + */ +static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_NOKEY_NAME; +} + /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); @@ -244,6 +273,11 @@ static inline void fscrypt_handle_d_move(struct dentry *dentry) { } +static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) +{ + return false; +} + /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { -- cgit From 234f1b7f8daf112655c87f75ae8932564f871225 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 17 Nov 2020 23:56:09 -0800 Subject: fscrypt: remove unnecessary calls to fscrypt_require_key() In an encrypted directory, a regular dentry (one that doesn't have the no-key name flag) can only be created if the directory's encryption key is available. Therefore the calls to fscrypt_require_key() in __fscrypt_prepare_link() and __fscrypt_prepare_rename() are unnecessary, as these functions already check that the dentries they're given aren't no-key names. Remove these unnecessary calls to fscrypt_require_key(). 
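For illustration only, a minimal sketch of the invariant this relies on, as
it could look at a filesystem call site (the function name here is
hypothetical, not taken from any filesystem):

	static int example_fs_mkdir(struct inode *dir, struct dentry *dentry,
				    umode_t mode)
	{
		/* A no-key name means the key was absent at lookup time. */
		if (fscrypt_is_nokey_name(dentry))
			return -ENOKEY;
		/*
		 * Conversely, a regular dentry in an encrypted directory can
		 * only have been set up with the key present, so a separate
		 * fscrypt_require_key(dir) call here would be redundant.
		 */
		/* ... proceed with the actual mkdir work ... */
		return 0;
	}
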
Link: https://lore.kernel.org/r/20201118075609.120337-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/hooks.c | 26 ++++++++------------------ include/linux/fscrypt.h | 3 +-- 2 files changed, 9 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 061418be4b08..c582e2ddb39c 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -54,15 +54,12 @@ EXPORT_SYMBOL_GPL(fscrypt_file_open); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { - int err; - - err = fscrypt_require_key(dir); - if (err) - return err; - - /* ... in case we looked up no-key name before key was added */ if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; + /* + * We don't need to separately check that the directory inode's key is + * available, as it's implied by the dentry not being a no-key name. + */ if (!fscrypt_has_permitted_context(dir, inode)) return -EXDEV; @@ -75,20 +72,13 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - int err; - - err = fscrypt_require_key(old_dir); - if (err) - return err; - - err = fscrypt_require_key(new_dir); - if (err) - return err; - - /* ... in case we looked up no-key name(s) before key was added */ if (fscrypt_is_nokey_name(old_dentry) || fscrypt_is_nokey_name(new_dentry)) return -ENOKEY; + /* + * We don't need to separately check that the directory inodes' keys are + * available, as it's implied by the dentries not being no-key names. + */ if (old_dir != new_dir) { if (IS_ENCRYPTED(new_dir) && diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 8e1d31c959bf..0c9e64969b73 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -710,8 +710,7 @@ static inline int fscrypt_require_key(struct inode *inode) * * A new link can only be added to an encrypted directory if the directory's * encryption key is available --- since otherwise we'd have no way to encrypt - * the filename. Therefore, we first set up the directory's encryption key (if - * not already done) and return an error if it's unavailable. + * the filename. * * We also verify that the link will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. -- cgit From 5903f61e035320104394f721f74cd22171636f63 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 23 Nov 2020 10:54:58 -0500 Subject: entry: Fix boot for !CONFIG_GENERIC_ENTRY A copy-pasta mistake tries to set SYSCALL_WORK flags instead of TIF flags for !CONFIG_GENERIC_ENTRY. Also, add safeguards to catch this at compilation time. 
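In outline, the safeguard works by defining the SYSCALL_WORK_* constants
only under CONFIG_GENERIC_ENTRY, so that the broken mapping can no longer
compile (the actual hunks follow below):

	#ifdef CONFIG_GENERIC_ENTRY
	enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, /* ... */ };
	#define SYSCALL_WORK_SECCOMP	BIT(SYSCALL_WORK_BIT_SECCOMP)
	#endif

	/*
	 * The !CONFIG_GENERIC_ENTRY variants now map to TIF_* flags; a
	 * stray reference to SYSCALL_WORK_* there becomes a build error
	 * instead of a silently wrong flag value.
	 */
	#define set_syscall_work(fl) \
		set_ti_thread_flag(current_thread_info(), TIF_##fl)
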
Fixes: 3136b93c3fb2 ("entry: Expose helpers to migrate TIF to SYSCALL_WORK flags") Reported-by: Naresh Kamboju Suggested-by: Jann Horn Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/87a6v8qd9p.fsf_-_@collabora.com --- include/linux/thread_info.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 317363212ae9..ca80a214df09 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -35,6 +35,7 @@ enum { GOOD_STACK, }; +#ifdef CONFIG_GENERIC_ENTRY enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, @@ -48,6 +49,7 @@ enum syscall_work_bit { #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) +#endif #include @@ -129,11 +131,11 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) #else /* CONFIG_GENERIC_ENTRY */ #define set_syscall_work(fl) \ - set_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) + set_ti_thread_flag(current_thread_info(), TIF_##fl) #define test_syscall_work(fl) \ - test_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) + test_ti_thread_flag(current_thread_info(), TIF_##fl) #define clear_syscall_work(fl) \ - clear_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) + clear_ti_thread_flag(current_thread_info(), TIF_##fl) #define set_task_syscall_work(t, fl) \ set_ti_thread_flag(task_thread_info(t), TIF_##fl) -- cgit From 2fb94784952e4b290c392b74c2c67b4afa672523 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 24 Nov 2020 09:33:16 +0800 Subject: soundwire: registers: add definitions for clearable interrupt fields DP0 has reserved fields and the read-only SDCA_CASCADE bit. We should not try to write values in these fields, so add a formal definition for clearable interrupts to be used in DP0 interrupt handling. DPN also has reserved fields so add definitions for clearable interrupts as well. 
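For illustration, a DP0 interrupt handler can now ack only the clearable bits rather than writing the raw status back (a sketch assuming the usual sdw_read()/sdw_write() slave register helpers):

	int status, ret;

	status = sdw_read(slave, SDW_DP0_INT);
	if (status < 0)
		return status;

	/* Ack only the write-1-to-clear bits; reserved fields and the
	 * read-only SDCA_CASCADE bit are masked out. */
	ret = sdw_write(slave, SDW_DP0_INT, status & SDW_DP0_INTERRUPTS);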
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Guennadi Liakhovetski Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20201124013318.8963-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_registers.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_registers.h b/include/linux/soundwire/sdw_registers.h index f420e8059779..0cb1a22685b8 100644 --- a/include/linux/soundwire/sdw_registers.h +++ b/include/linux/soundwire/sdw_registers.h @@ -41,6 +41,12 @@ #define SDW_DP0_INT_IMPDEF1 BIT(5) #define SDW_DP0_INT_IMPDEF2 BIT(6) #define SDW_DP0_INT_IMPDEF3 BIT(7) +#define SDW_DP0_INTERRUPTS (SDW_DP0_INT_TEST_FAIL | \ + SDW_DP0_INT_PORT_READY | \ + SDW_DP0_INT_BRA_FAILURE | \ + SDW_DP0_INT_IMPDEF1 | \ + SDW_DP0_INT_IMPDEF2 | \ + SDW_DP0_INT_IMPDEF3) #define SDW_DP0_PORTCTRL_DATAMODE GENMASK(3, 2) #define SDW_DP0_PORTCTRL_NXTINVBANK BIT(4) @@ -241,6 +247,11 @@ #define SDW_DPN_INT_IMPDEF1 BIT(5) #define SDW_DPN_INT_IMPDEF2 BIT(6) #define SDW_DPN_INT_IMPDEF3 BIT(7) +#define SDW_DPN_INTERRUPTS (SDW_DPN_INT_TEST_FAIL | \ + SDW_DPN_INT_PORT_READY | \ + SDW_DPN_INT_IMPDEF1 | \ + SDW_DPN_INT_IMPDEF2 | \ + SDW_DPN_INT_IMPDEF3) #define SDW_DPN_PORTCTRL_FLOWMODE GENMASK(1, 0) #define SDW_DPN_PORTCTRL_DATAMODE GENMASK(3, 2) -- cgit From 2a2b8eaa5b25668a6f717f94b55f4e3aaf87629d Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Tue, 24 Nov 2020 16:20:51 +0800 Subject: iommu: Handle freelists when using deferred flushing in iommu drivers Allow the iommu_unmap_fast to return newly freed page table pages and pass the freelist to queue_iova in the dma-iommu ops path. This is useful for iommu drivers (in this case the intel iommu driver) which need to wait for the ioTLB to be flushed before newly free/unmapped page table pages can be freed. This way we can still batch ioTLB free operations and handle the freelists. 
Signed-off-by: Tom Murphy Signed-off-by: Lu Baolu Tested-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20201124082057.2614359-2-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/dma-iommu.c | 29 +++++++++++++++++------- drivers/iommu/intel/iommu.c | 55 +++++++++++++++++++++++++++++---------------- include/linux/iommu.h | 1 + 3 files changed, 58 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 0cbcd3fc3e7e..9c827a4d2207 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -49,6 +49,18 @@ struct iommu_dma_cookie { struct iommu_domain *fq_domain; }; +static void iommu_dma_entry_dtor(unsigned long data) +{ + struct page *freelist = (struct page *)data; + + while (freelist) { + unsigned long p = (unsigned long)page_address(freelist); + + freelist = freelist->freelist; + free_page(p); + } +} + static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie) { if (cookie->type == IOMMU_DMA_IOVA_COOKIE) @@ -343,7 +355,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, if (!cookie->fq_domain && !iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && attr) { if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, - NULL)) + iommu_dma_entry_dtor)) pr_warn("iova flush queue initialization failed\n"); else cookie->fq_domain = domain; @@ -440,7 +452,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, } static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, - dma_addr_t iova, size_t size) + dma_addr_t iova, size_t size, struct page *freelist) { struct iova_domain *iovad = &cookie->iovad; @@ -449,7 +461,8 @@ static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, cookie->msi_iova -= size; else if (cookie->fq_domain) /* non-strict mode */ queue_iova(iovad, iova_pfn(iovad, iova), - size >> iova_shift(iovad), 0); + size >> iova_shift(iovad), + (unsigned long)freelist); else free_iova_fast(iovad, iova_pfn(iovad, iova), size >> iova_shift(iovad)); @@ -474,7 +487,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, if (!cookie->fq_domain) iommu_iotlb_sync(domain, &iotlb_gather); - iommu_dma_free_iova(cookie, dma_addr, size); + iommu_dma_free_iova(cookie, dma_addr, size, iotlb_gather.freelist); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, @@ -496,7 +509,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, return DMA_MAPPING_ERROR; if (iommu_map_atomic(domain, iova, phys - iova_off, size, prot)) { - iommu_dma_free_iova(cookie, iova, size); + iommu_dma_free_iova(cookie, iova, size, NULL); return DMA_MAPPING_ERROR; } return iova + iova_off; @@ -649,7 +662,7 @@ out_unmap: out_free_sg: sg_free_table(&sgt); out_free_iova: - iommu_dma_free_iova(cookie, iova, size); + iommu_dma_free_iova(cookie, iova, size, NULL); out_free_pages: __iommu_dma_free_pages(pages, count); return NULL; @@ -900,7 +913,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, return __finalise_sg(dev, sg, nents, iova); out_free_iova: - iommu_dma_free_iova(cookie, iova, iova_len); + iommu_dma_free_iova(cookie, iova, iova_len, NULL); out_restore_sg: __invalidate_sg(sg, nents); return 0; @@ -1228,7 +1241,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return msi_page; out_free_iova: - iommu_dma_free_iova(cookie, iova, size); + iommu_dma_free_iova(cookie, iova, size, NULL); out_free_page: 
kfree(msi_page); return NULL; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7b32703c0b47..fed0b3f0e25b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1243,17 +1243,17 @@ next: pages can only be freed after the IOTLB flush has been done. */ static struct page *domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, - unsigned long last_pfn) + unsigned long last_pfn, + struct page *freelist) { - struct page *freelist; - BUG_ON(!domain_pfn_supported(domain, start_pfn)); BUG_ON(!domain_pfn_supported(domain, last_pfn)); BUG_ON(start_pfn > last_pfn); /* we don't need lock here; nobody else touches the iova range */ freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw), - domain->pgd, 0, start_pfn, last_pfn, NULL); + domain->pgd, 0, start_pfn, last_pfn, + freelist); /* free pgd */ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { @@ -2011,7 +2011,8 @@ static void domain_exit(struct dmar_domain *domain) if (domain->pgd) { struct page *freelist; - freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); + freelist = domain_unmap(domain, 0, + DOMAIN_MAX_PFN(domain->gaw), NULL); dma_free_pagelist(freelist); } @@ -3570,7 +3571,7 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) if (dev_is_pci(dev)) pdev = to_pci_dev(dev); - freelist = domain_unmap(domain, start_pfn, last_pfn); + freelist = domain_unmap(domain, start_pfn, last_pfn, NULL); if (intel_iommu_strict || (pdev && pdev->untrusted) || !has_iova_flush_queue(&domain->iovad)) { iommu_flush_iotlb_psi(iommu, domain, start_pfn, @@ -4636,7 +4637,8 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb, struct page *freelist; freelist = domain_unmap(si_domain, - start_vpfn, last_vpfn); + start_vpfn, last_vpfn, + NULL); rcu_read_lock(); for_each_active_iommu(iommu, drhd) @@ -5608,10 +5610,8 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct page *freelist = NULL; unsigned long start_pfn, last_pfn; - unsigned int npages; - int iommu_id, level = 0; + int level = 0; /* Cope with horrid API which requires us to unmap more than the size argument if it happens to be a large-page mapping. 
*/ @@ -5623,22 +5623,38 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, start_pfn = iova >> VTD_PAGE_SHIFT; last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; - freelist = domain_unmap(dmar_domain, start_pfn, last_pfn); - - npages = last_pfn - start_pfn + 1; - - for_each_domain_iommu(iommu_id, dmar_domain) - iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, - start_pfn, npages, !freelist, 0); - - dma_free_pagelist(freelist); + gather->freelist = domain_unmap(dmar_domain, start_pfn, + last_pfn, gather->freelist); if (dmar_domain->max_addr == iova + size) dmar_domain->max_addr = iova; + iommu_iotlb_gather_add_page(domain, gather, iova, size); + return size; } +static void intel_iommu_tlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long iova_pfn = IOVA_PFN(gather->start); + size_t size = gather->end - gather->start; + unsigned long start_pfn, last_pfn; + unsigned long nrpages; + int iommu_id; + + nrpages = aligned_nrpages(gather->start, size); + start_pfn = mm_to_dma_pfn(iova_pfn); + last_pfn = start_pfn + nrpages - 1; + + for_each_domain_iommu(iommu_id, dmar_domain) + iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, + start_pfn, nrpages, !gather->freelist, 0); + + dma_free_pagelist(gather->freelist); +} + static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { @@ -6098,6 +6114,7 @@ const struct iommu_ops intel_iommu_ops = { .aux_get_pasid = intel_iommu_aux_get_pasid, .map = intel_iommu_map, .unmap = intel_iommu_unmap, + .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, .probe_device = intel_iommu_probe_device, .probe_finalize = intel_iommu_probe_finalize, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b95a6f8db6ff..e56bae327e35 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -180,6 +180,7 @@ struct iommu_iotlb_gather { unsigned long start; unsigned long end; size_t pgsize; + struct page *freelist; }; /** -- cgit From 230309d08b871e439f8618db3610f2cc9b5f7c72 Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Tue, 24 Nov 2020 16:20:52 +0800 Subject: iommu: Add iommu_dma_free_cpu_cached_iovas() Add a iommu_dma_free_cpu_cached_iovas function to allow drivers which use the dma-iommu ops to free cached cpu iovas. 
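For illustration, a natural caller is a CPU hotplug teardown callback, so that a departing CPU's per-CPU IOVA caches are handed back to the shared pool (a sketch; the callback and the way the domain is obtained are hypothetical):

static int my_iommu_cpu_dead(unsigned int cpu)
{
	/* 'domain' lookup is driver-specific and omitted here */
	iommu_dma_free_cpu_cached_iovas(cpu, domain);
	return 0;
}

/* registered as the teardown handler of a cpuhp_setup_state() call */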
Signed-off-by: Tom Murphy Signed-off-by: Lu Baolu Tested-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20201124082057.2614359-3-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/dma-iommu.c | 9 +++++++++ include/linux/dma-iommu.h | 8 ++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 9c827a4d2207..de521e22bafb 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -49,6 +49,15 @@ struct iommu_dma_cookie { struct iommu_domain *fq_domain; }; +void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, + struct iommu_domain *domain) +{ + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + + free_cpu_cached_iovas(cpu, iovad); +} + static void iommu_dma_entry_dtor(unsigned long data) { struct page *freelist = (struct page *)data; diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 2112f21f73d8..706b68d1359b 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -37,6 +37,9 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc, void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); +void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, + struct iommu_domain *domain); + #else /* CONFIG_IOMMU_DMA */ struct iommu_domain; @@ -78,5 +81,10 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he { } +static inline void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, + struct iommu_domain *domain) +{ +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __DMA_IOMMU_H */ -- cgit From a7656ecf825ac0434a5e7bf108ec1a56b65ee5e4 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Wed, 25 Nov 2020 12:30:10 +0530 Subject: iommu/io-pgtable: Add a domain attribute for pagetable configuration Add a new iommu domain attribute DOMAIN_ATTR_IO_PGTABLE_CFG for pagetable configuration which initially will be used to set quirks like for system cache aka last level cache to be used by client drivers like GPU to set right attributes for caching the hardware pagetables into the system cache and later can be extended to include other page table configuration data. 
Signed-off-by: Sai Prakash Ranjan Link: https://lore.kernel.org/r/9190aa16f378fc0a7f8e57b2b9f60b033e7eeb4f.1606287059.git.saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- include/linux/io-pgtable.h | 4 ++++ include/linux/iommu.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 4cde111e425b..215fd9d69540 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -208,6 +208,10 @@ struct io_pgtable { #define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops) +struct io_pgtable_domain_attr { + unsigned long quirks; +}; + static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop) { iop->cfg.tlb->tlb_flush_all(iop->cookie); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b95a6f8db6ff..ffaa389ea128 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -118,6 +118,7 @@ enum iommu_attr { DOMAIN_ATTR_FSL_PAMUV1, DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, + DOMAIN_ATTR_IO_PGTABLE_CFG, DOMAIN_ATTR_MAX, }; -- cgit From e67890c97944b9962cf8c140a7f8077ed643b7d7 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Wed, 25 Nov 2020 12:30:11 +0530 Subject: iommu/io-pgtable-arm: Add support to use system cache Add a quirk IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to override the outer-cacheability attributes set in the TCR for a non-coherent page table walker when using system cache. Signed-off-by: Sai Prakash Ranjan Link: https://lore.kernel.org/r/f818676b4a2a9ad1edb92721947d47db41ed6a7c.1606287059.git.saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- drivers/iommu/io-pgtable-arm.c | 10 ++++++++-- include/linux/io-pgtable.h | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index a7a9bc08dcd1..7c9ea9d7874a 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -761,7 +761,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NON_STRICT | - IO_PGTABLE_QUIRK_ARM_TTBR1)) + IO_PGTABLE_QUIRK_ARM_TTBR1 | + IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) return NULL; data = arm_lpae_alloc_pgtable(cfg); @@ -773,10 +774,15 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) tcr->sh = ARM_LPAE_TCR_SH_IS; tcr->irgn = ARM_LPAE_TCR_RGN_WBWA; tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; + if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + goto out_free_data; } else { tcr->sh = ARM_LPAE_TCR_SH_OS; tcr->irgn = ARM_LPAE_TCR_RGN_NC; - tcr->orgn = ARM_LPAE_TCR_RGN_NC; + if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + tcr->orgn = ARM_LPAE_TCR_RGN_NC; + else + tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; } tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 215fd9d69540..fb4d5a763e0c 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -86,6 +86,9 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table * for use in the upper half of a split address space. + * + * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability + * attributes set in the TCR for a non-coherent page-table walker. 
*/ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -93,6 +96,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_MTK_EXT BIT(3) #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) + #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; -- cgit From 0801a0073f86e020987acbbd96b50f9c85d79de8 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:14 +0100 Subject: module: drop version-attribute alignment Commit 98562ad8cb03 ("module: explicitly align module_version_attribute structure") added an alignment attribute to the struct module_version_attribute type in order to fix an alignment issue on m68k where the structure is 2-byte aligned while MODULE_VERSION() forced the __modver section entries to be 4-byte aligned (sizeof(void *)). This was essentially an alternative fix to the problem addressed by b4bc842802db ("module: deal with alignment issues in built-in module versions") which used the array-of-pointer trick to prevent gcc from increasing alignment of the version attribute entries. And with the pointer indirection in place there's no need to increase the alignment of the type. Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 7ccdf87f376f..293250958512 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -66,7 +66,7 @@ struct module_version_attribute { struct module_attribute mattr; const char *module_name; const char *version; -} __attribute__ ((__aligned__(sizeof(void *)))); +}; extern ssize_t __modver_version_show(struct module_attribute *, struct module_kobject *, char *); -- cgit From b112082c8930e7aa72422484b2d31d3aa06f58bc Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:15 +0100 Subject: module: simplify version-attribute handling Instead of using the array-of-pointers trick to avoid having gcc mess up the built-in module-version array stride, specify type alignment when declaring entries to prevent gcc from increasing alignment. This is essentially an alternative (one-line) fix to the problem addressed by commit b4bc842802db ("module: deal with alignment issues in built-in module versions"). gcc can increase the alignment of larger objects with static extent as an optimisation, but this can be suppressed by using the aligned attribute when declaring variables. Note that we have been relying on this behaviour for kernel parameters for 16 years and it indeed hasn't changed since the introduction of the aligned attribute in gcc-3.1. 
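The underlying behaviour is easiest to see with a toy example (illustrative only):

struct foo { long a, b; };	/* __alignof__ is 8 on 64-bit */

/* gcc may align this to, say, 32 bytes as an optimisation, inserting
 * padding between consecutive entries of the linker section: */
static struct foo f1 __used __section("__foo");

/* pinning the alignment keeps entries exactly sizeof(struct foo)
 * apart, so __start___foo/__stop___foo iteration stays valid: */
static struct foo f2 __used __section("__foo")
	__aligned(__alignof__(struct foo));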
Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/module.h | 26 +++++++++++++------------- kernel/params.c | 10 ++++------ 2 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 293250958512..5958075ea3f4 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -266,20 +266,20 @@ extern typeof(name) __mod_##type##__##name##_device_table \ #else #define MODULE_VERSION(_version) \ MODULE_INFO(version, _version); \ - static struct module_version_attribute ___modver_attr = { \ - .mattr = { \ - .attr = { \ - .name = "version", \ - .mode = S_IRUGO, \ + static struct module_version_attribute __modver_attr \ + __used __section("__modver") \ + __aligned(__alignof__(struct module_version_attribute)) \ + = { \ + .mattr = { \ + .attr = { \ + .name = "version", \ + .mode = S_IRUGO, \ + }, \ + .show = __modver_version_show, \ }, \ - .show = __modver_version_show, \ - }, \ - .module_name = KBUILD_MODNAME, \ - .version = _version, \ - }; \ - static const struct module_version_attribute \ - __used __section("__modver") \ - * __moduleparam_const __modver_attr = &___modver_attr + .module_name = KBUILD_MODNAME, \ + .version = _version, \ + }; #endif /* Optional firmware file (or files) needed by the module diff --git a/kernel/params.c b/kernel/params.c index 3835fb82c64b..aa7d6f2213f1 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -843,18 +843,16 @@ ssize_t __modver_version_show(struct module_attribute *mattr, return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); } -extern const struct module_version_attribute *__start___modver[]; -extern const struct module_version_attribute *__stop___modver[]; +extern const struct module_version_attribute __start___modver[]; +extern const struct module_version_attribute __stop___modver[]; static void __init version_sysfs_builtin(void) { - const struct module_version_attribute **p; + const struct module_version_attribute *vattr; struct module_kobject *mk; int err; - for (p = __start___modver; p < __stop___modver; p++) { - const struct module_version_attribute *vattr = *p; - + for (vattr = __start___modver; vattr < __stop___modver; vattr++) { mk = locate_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); -- cgit From 8d6615f1fccc4f39d7d3dcf286b33e8a1e833d2b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:17 +0100 Subject: params: drop redundant "unused" attributes Drop the redundant "unused" attributes from module-parameter structures already marked "used". 
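For reference, the overlap is as follows (illustrative): __used both forces the definition to be emitted and, in practice, suppresses the unused-variable warning, so dropping "unused" leaves behaviour unchanged:

	/* before */
	static const char a[] __used __section(".modinfo")
		__attribute__((unused, aligned(1))) = "tag=info";

	/* after -- same object code, one attribute fewer */
	static const char b[] __used __section(".modinfo")
		__attribute__((aligned(1))) = "tag=info";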
Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/moduleparam.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 6388eb9734a5..742074ad9f6e 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -22,7 +22,7 @@ #define __MODULE_INFO(tag, name, info) \ static const char __UNIQUE_ID(name)[] \ - __used __section(".modinfo") __attribute__((unused, aligned(1))) \ + __used __section(".modinfo") __attribute__((aligned(1))) \ = __MODULE_INFO_PREFIX __stringify(tag) "=" info #define __MODULE_PARM_TYPE(name, _type) \ @@ -289,7 +289,7 @@ struct kparam_array static const char __param_str_##name[] = prefix #name; \ static struct kernel_param __moduleparam_const __param_##name \ __used \ - __section("__param") __attribute__ ((unused, aligned(sizeof(void *)))) \ + __section("__param") __attribute__ ((aligned(sizeof(void *)))) \ = { __param_str_##name, THIS_MODULE, ops, \ VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } } -- cgit From fe2f4fe139b321a38daafc715aeb7d21d9e8e5ad Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:18 +0100 Subject: params: use type alignment for kernel parameters Specify type alignment for kernel parameters instead of sizeof(void *). The alignment attribute is used to prevent gcc from increasing the alignment of objects with static extent as an optimisation, something which would mess up the __param array stride. Using __alignof__(struct kernel_param) rather than sizeof(void *) is preferred since it better indicates why it is there and doesn't break should the type size or alignment change. Note that on m68k the alignment of struct kernel_param is actually two and that adding a 1- or 2-byte field to the 20-byte struct would cause a breakage with the current 4-byte alignment. Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/moduleparam.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 742074ad9f6e..15ecc6cc3a3b 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -288,8 +288,8 @@ struct kparam_array /* Default value instead of permissions? */ \ static const char __param_str_##name[] = prefix #name; \ static struct kernel_param __moduleparam_const __param_##name \ - __used \ - __section("__param") __attribute__ ((aligned(sizeof(void *)))) \ + __used __section("__param") \ + __aligned(__alignof__(struct kernel_param)) \ = { __param_str_##name, THIS_MODULE, ops, \ VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } } -- cgit From 2aec389e19150ed3bf67ab708f2435563f76050f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:19 +0100 Subject: params: clean up module-param macros Clean up the module-param macros by adding some indentation and using the __aligned() macro to improve readability. 
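For reference, with the cleanup in place a module's MODULE_INFO(author, "Jane Doe") expands to roughly the following (the KBUILD_MODNAME prefix is empty for built-in code):

	static const char __UNIQUE_ID(author)[]
		__used __section(".modinfo") __aligned(1)
		= KBUILD_MODNAME "." "author" "=" "Jane Doe";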
Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/moduleparam.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 15ecc6cc3a3b..eed280fae433 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -21,12 +21,12 @@ #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) #define __MODULE_INFO(tag, name, info) \ -static const char __UNIQUE_ID(name)[] \ - __used __section(".modinfo") __attribute__((aligned(1))) \ - = __MODULE_INFO_PREFIX __stringify(tag) "=" info + static const char __UNIQUE_ID(name)[] \ + __used __section(".modinfo") __aligned(1) \ + = __MODULE_INFO_PREFIX __stringify(tag) "=" info #define __MODULE_PARM_TYPE(name, _type) \ - __MODULE_INFO(parmtype, name##type, #name ":" _type) + __MODULE_INFO(parmtype, name##type, #name ":" _type) /* One for each parameter, describing how to use it. Some files do multiple of these per line, so can't just use MODULE_INFO. */ -- cgit From 367c820ef08082e68df8a3bc12e62393af21e4b5 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Wed, 7 Oct 2020 14:21:43 +0530 Subject: arm64: Enable perf events based hard lockup detector With the recent feature added to enable perf events to use pseudo NMIs as interrupts on platforms which support GICv3 or later, its now been possible to enable hard lockup detector (or NMI watchdog) on arm64 platforms. So enable corresponding support. One thing to note here is that normally lockup detector is initialized just after the early initcalls but PMU on arm64 comes up much later as device_initcall(). So we need to re-initialize lockup detection once PMU has been initialized. Signed-off-by: Sumit Garg Acked-by: Alexandru Elisei Link: https://lore.kernel.org/r/1602060704-10921-1-git-send-email-sumit.garg@linaro.org Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 2 ++ arch/arm64/kernel/perf_event.c | 41 +++++++++++++++++++++++++++++++++++++++-- drivers/perf/arm_pmu.c | 5 +++++ include/linux/perf/arm_pmu.h | 2 ++ 4 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1515f6f153a0..7d3440770b14 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -170,6 +170,8 @@ config ARM64 select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS + select HAVE_PERF_EVENTS_NMI if ARM64_PSEUDO_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 3605f77ad4df..38bb07eff872 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include /* ARMv8 Cortex-A53 specific event types. */ #define ARMV8_A53_PERFCTR_PREF_LINEFILL 0xC2 @@ -1248,10 +1250,21 @@ static struct platform_driver armv8_pmu_driver = { static int __init armv8_pmu_driver_init(void) { + int ret; + if (acpi_disabled) - return platform_driver_register(&armv8_pmu_driver); + ret = platform_driver_register(&armv8_pmu_driver); else - return arm_pmu_acpi_probe(armv8_pmuv3_init); + ret = arm_pmu_acpi_probe(armv8_pmuv3_init); + + /* + * Try to re-initialize lockup detector after PMU init in + * case PMU events are triggered via NMIs. 
+ */ + if (ret == 0 && arm_pmu_irq_is_nmi()) + lockup_detector_init(); + + return ret; } device_initcall(armv8_pmu_driver_init) @@ -1309,3 +1322,27 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time_zero = 1; userpg->cap_user_time_short = 1; } + +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF +/* + * Safe maximum CPU frequency in case a particular platform doesn't implement + * cpufreq driver. Although, architecture doesn't put any restrictions on + * maximum frequency but 5 GHz seems to be safe maximum given the available + * Arm CPUs in the market which are clocked much less than 5 GHz. On the other + * hand, we can't make it much higher as it would lead to a large hard-lockup + * detection timeout on parts which are running slower (eg. 1GHz on + * Developerbox) and doesn't possess a cpufreq driver. + */ +#define SAFE_MAX_CPU_FREQ 5000000000UL // 5 GHz +u64 hw_nmi_get_sample_period(int watchdog_thresh) +{ + unsigned int cpu = smp_processor_id(); + unsigned long max_cpu_freq; + + max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL; + if (!max_cpu_freq) + max_cpu_freq = SAFE_MAX_CPU_FREQ; + + return (u64)max_cpu_freq * watchdog_thresh; +} +#endif diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index cb2f55f450e4..794a37d50853 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -726,6 +726,11 @@ static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu) return per_cpu(hw_events->irq, cpu); } +bool arm_pmu_irq_is_nmi(void) +{ + return has_nmi; +} + /* * PMU hardware loses all context when a CPU goes offline. * When a CPU is hotplugged back in, since some hardware registers are diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 505480217cf1..bf7966776c55 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -163,6 +163,8 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn); static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; } #endif +bool arm_pmu_irq_is_nmi(void); + /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); struct arm_pmu *armpmu_alloc_atomic(void); -- cgit From 63653368c25ff0b1b1aaf045c97ea87bd8c16123 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Wed, 25 Nov 2020 14:58:17 +0800 Subject: block: remove unused BIO_SPLIT_ENTRIES Since commit 4b1faf931650 ("block: Kill bio_pair_split()"), there's no user of BIO_SPLIT_ENTRIES anymore. Signed-off-by: Jeffle Xu Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index c6d765382926..ecf67108f091 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -711,12 +711,6 @@ static inline bool bioset_initialized(struct bio_set *bs) return bs->bio_slab != NULL; } -/* - * a small number of entries is fine, not going to be performance critical. 
- * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 - #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_for_each_vec(bvl, bip, iter) \ -- cgit From 6527b938426f7fa66051273568d234b1fe01a15b Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Mon, 23 Nov 2020 17:38:17 +0200 Subject: net: phy: remove the .did_interrupt() and .ack_interrupt() callback Now that all the PHY drivers have been migrated to directly implement the generic .handle_interrupt() callback for a seamless support of shared IRQs and all the .config_inter() implementations clear any pending interrupts, we can safely remove the two callbacks. With this patch, phylib has a proper support for shared IRQs (and not just for multi-PHY devices. A PHY driver must implement both the .handle_interrupt() and .config_intr() callbacks for the IRQs to be actually used. Signed-off-by: Ioana Ciornei Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 48 ++------------------------------------------ drivers/net/phy/phy_device.c | 2 +- include/linux/phy.h | 19 ++++-------------- 3 files changed, 7 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index dce86bad8231..45f75533c47c 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -113,23 +113,6 @@ void phy_print_status(struct phy_device *phydev) } EXPORT_SYMBOL(phy_print_status); -/** - * phy_clear_interrupt - Ack the phy device's interrupt - * @phydev: the phy_device struct - * - * If the @phydev driver has an ack_interrupt function, call it to - * ack and clear the phy device's interrupt. - * - * Returns 0 on success or < 0 on error. - */ -static int phy_clear_interrupt(struct phy_device *phydev) -{ - if (phydev->drv->ack_interrupt) - return phydev->drv->ack_interrupt(phydev); - - return 0; -} - /** * phy_config_interrupt - configure the PHY device for the requested interrupts * @phydev: the phy_device struct @@ -943,15 +926,8 @@ EXPORT_SYMBOL(phy_error); */ int phy_disable_interrupts(struct phy_device *phydev) { - int err; - /* Disable PHY interrupts */ - err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED); - if (err) - return err; - - /* Clear the interrupt */ - return phy_clear_interrupt(phydev); + return phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED); } /** @@ -966,22 +942,7 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat) struct phy_device *phydev = phy_dat; struct phy_driver *drv = phydev->drv; - if (drv->handle_interrupt) - return drv->handle_interrupt(phydev); - - if (drv->did_interrupt && !drv->did_interrupt(phydev)) - return IRQ_NONE; - - /* reschedule state queue work to run as soon as possible */ - phy_trigger_machine(phydev); - - /* did_interrupt() may have cleared the interrupt already */ - if (!drv->did_interrupt && phy_clear_interrupt(phydev)) { - phy_error(phydev); - return IRQ_NONE; - } - - return IRQ_HANDLED; + return drv->handle_interrupt(phydev); } /** @@ -990,11 +951,6 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat) */ static int phy_enable_interrupts(struct phy_device *phydev) { - int err = phy_clear_interrupt(phydev); - - if (err < 0) - return err; - return phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED); } diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 81f672911305..80c2e646c093 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2826,7 +2826,7 @@ EXPORT_SYMBOL(phy_get_internal_delay); static bool phy_drv_supports_irq(struct phy_driver *phydrv) { - return 
phydrv->config_intr && (phydrv->ack_interrupt || phydrv->handle_interrupt); + return phydrv->config_intr && phydrv->handle_interrupt; } /** diff --git a/include/linux/phy.h b/include/linux/phy.h index 8849a00a093f..381a95732b6a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -743,18 +743,11 @@ struct phy_driver { /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); - /** @ack_interrupt: Clears any pending interrupts */ - int (*ack_interrupt)(struct phy_device *phydev); - - /** @config_intr: Enables or disables interrupts */ - int (*config_intr)(struct phy_device *phydev); - - /** - * @did_interrupt: Checks if the PHY generated an interrupt. - * For multi-PHY devices with shared PHY interrupt pin - * Set interrupt bits have to be cleared. + /** @config_intr: Enables or disables interrupts. + * It should also clear any pending interrupts prior to enabling the + * IRQs and after disabling them. */ - int (*did_interrupt)(struct phy_device *phydev); + int (*config_intr)(struct phy_device *phydev); /** @handle_interrupt: Override default interrupt handling */ irqreturn_t (*handle_interrupt)(struct phy_device *phydev); @@ -1487,10 +1480,6 @@ static inline int genphy_config_aneg(struct phy_device *phydev) return __genphy_config_aneg(phydev, false); } -static inline int genphy_no_ack_interrupt(struct phy_device *phydev) -{ - return 0; -} static inline int genphy_no_config_intr(struct phy_device *phydev) { return 0; -- cgit From 403319be5de51167cd70ddf594b76c95e6d26844 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 24 Nov 2020 15:12:08 +0000 Subject: ima: Implement ima_inode_hash This is in preparation to add a helper for BPF LSM programs to use IMA hashes when attached to LSM hooks. There are LSM hooks like inode_unlink which do not have a struct file * argument and cannot use the existing ima_file_hash API. An inode based API is, therefore, useful in LSM based detections like an executable trying to delete itself which rely on the inode_unlink LSM hook. Moreover, the ima_file_hash function does nothing with the struct file pointer apart from calling file_inode on it and converting it to an inode. 
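For illustration, an LSM inode_unlink hook could consult the stored measurement like this (a minimal sketch; the hook body and policy check are hypothetical, and the 64-byte buffer merely covers the largest supported digest):

static int my_inode_unlink(struct inode *dir, struct dentry *dentry)
{
	u8 digest[64];	/* large enough for SHA-512 */
	int algo;

	algo = ima_inode_hash(d_backing_inode(dentry), digest,
			      sizeof(digest));
	if (algo < 0)
		return 0;	/* IMA disabled or no measurement cached */

	/* ... compare digest against an allow/deny policy ... */
	return 0;
}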
Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Mimi Zohar Link: https://lore.kernel.org/bpf/20201124151210.1081188-2-kpsingh@chromium.org --- include/linux/ima.h | 6 +++ security/integrity/ima/ima_main.c | 78 +++++++++++++++++++++++++++------------ 2 files changed, 60 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 8fa7bcfb2da2..7233a2751754 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -29,6 +29,7 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); extern void ima_post_path_mknod(struct dentry *dentry); extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); +extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size); extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); #ifdef CONFIG_IMA_KEXEC @@ -115,6 +116,11 @@ static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size) return -EOPNOTSUPP; } +static inline int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) +{ + return -EOPNOTSUPP; +} + static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {} #endif /* CONFIG_IMA */ diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 2d1af8899cab..cb2deaa188e7 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -501,37 +501,14 @@ int ima_file_check(struct file *file, int mask) } EXPORT_SYMBOL_GPL(ima_file_check); -/** - * ima_file_hash - return the stored measurement if a file has been hashed and - * is in the iint cache. - * @file: pointer to the file - * @buf: buffer in which to store the hash - * @buf_size: length of the buffer - * - * On success, return the hash algorithm (as defined in the enum hash_algo). - * If buf is not NULL, this function also outputs the hash into buf. - * If the hash is larger than buf_size, then only buf_size bytes will be copied. - * It generally just makes sense to pass a buffer capable of holding the largest - * possible hash: IMA_MAX_DIGEST_SIZE. - * The file hash returned is based on the entire file, including the appended - * signature. - * - * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP. - * If the parameters are incorrect, return -EINVAL. - */ -int ima_file_hash(struct file *file, char *buf, size_t buf_size) +static int __ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) { - struct inode *inode; struct integrity_iint_cache *iint; int hash_algo; - if (!file) - return -EINVAL; - if (!ima_policy_flag) return -EOPNOTSUPP; - inode = file_inode(file); iint = integrity_iint_find(inode); if (!iint) return -EOPNOTSUPP; @@ -558,8 +535,61 @@ int ima_file_hash(struct file *file, char *buf, size_t buf_size) return hash_algo; } + +/** + * ima_file_hash - return the stored measurement if a file has been hashed and + * is in the iint cache. + * @file: pointer to the file + * @buf: buffer in which to store the hash + * @buf_size: length of the buffer + * + * On success, return the hash algorithm (as defined in the enum hash_algo). + * If buf is not NULL, this function also outputs the hash into buf. + * If the hash is larger than buf_size, then only buf_size bytes will be copied. + * It generally just makes sense to pass a buffer capable of holding the largest + * possible hash: IMA_MAX_DIGEST_SIZE. 
+ * The file hash returned is based on the entire file, including the appended + * signature. + * + * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP. + * If the parameters are incorrect, return -EINVAL. + */ +int ima_file_hash(struct file *file, char *buf, size_t buf_size) +{ + if (!file) + return -EINVAL; + + return __ima_inode_hash(file_inode(file), buf, buf_size); +} EXPORT_SYMBOL_GPL(ima_file_hash); +/** + * ima_inode_hash - return the stored measurement if the inode has been hashed + * and is in the iint cache. + * @inode: pointer to the inode + * @buf: buffer in which to store the hash + * @buf_size: length of the buffer + * + * On success, return the hash algorithm (as defined in the enum hash_algo). + * If buf is not NULL, this function also outputs the hash into buf. + * If the hash is larger than buf_size, then only buf_size bytes will be copied. + * It generally just makes sense to pass a buffer capable of holding the largest + * possible hash: IMA_MAX_DIGEST_SIZE. + * The hash returned is based on the entire contents, including the appended + * signature. + * + * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP. + * If the parameters are incorrect, return -EINVAL. + */ +int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) +{ + if (!inode) + return -EINVAL; + + return __ima_inode_hash(inode, buf, buf_size); +} +EXPORT_SYMBOL_GPL(ima_inode_hash); + /** * ima_post_create_tmpfile - mark newly created tmpfile as new * @file : newly created tmpfile -- cgit From 8b5536ad1216c47fb9b37ef2cd0cfa70d79d4645 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Tue, 24 Nov 2020 18:49:28 +0800 Subject: lockdep: Introduce in_softirq lockdep assert The current semantic for napi_consume_skb() is that caller need to provide non-zero budget when calling from NAPI context, and breaking this semantic will cause hard to debug problem, because _kfree_skb_defer() need to run in atomic context in order to push the skb to the particular cpu' napi_alloc_cache atomically. So add the lockdep_assert_in_softirq() to assert when the running context is not in_softirq, in_softirq means softirq is serving or BH is disabled, which has a ambiguous semantics due to the BH disabled confusion, so add a comment to emphasize that. And the softirq context can be interrupted by hard IRQ or NMI context, lockdep_assert_in_softirq() need to assert about hard IRQ or NMI context too. Suggested-by: Jakub Kicinski Signed-off-by: Yunsheng Lin Signed-off-by: Jakub Kicinski --- include/linux/lockdep.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f5594879175a..92771bc1791f 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -594,6 +594,16 @@ do { \ this_cpu_read(hardirqs_enabled))); \ } while (0) +/* + * Acceptable for protecting per-CPU resources accessed from BH. + * Much like in_softirq() - semantics are ambiguous, use carefully. 
+ */ +#define lockdep_assert_in_softirq() \ +do { \ + WARN_ON_ONCE(__lockdep_enabled && \ + (!in_softirq() || in_irq() || in_nmi())); \ +} while (0) + #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) @@ -605,6 +615,7 @@ do { \ # define lockdep_assert_preemption_enabled() do { } while (0) # define lockdep_assert_preemption_disabled() do { } while (0) +# define lockdep_assert_in_softirq() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING -- cgit From 4c1ad562d303526b5d9b49f5e0d72da13ef78dec Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Fri, 20 Nov 2020 21:20:42 -0600 Subject: remoteproc: Add a rproc_set_firmware() API A new API, rproc_set_firmware() is added to allow the remoteproc platform drivers and remoteproc client drivers to be able to configure a custom firmware name that is different from the default name used during remoteproc registration. This function is being introduced to provide a kernel-level equivalent of the current sysfs interface to remoteproc client drivers, and can only change firmwares when the remoteproc is offline. This allows some remoteproc drivers to choose different firmwares at runtime based on the functionality the remote processor is providing. The TI PRU Ethernet driver will be an example of such usage as it requires to use different firmwares for different supported protocols. Also, update the firmware_store() function used by the sysfs interface to reuse this function to avoid code duplication. Reviewed-by: Rishabh Bhatnagar Signed-off-by: Suman Anna Link: https://lore.kernel.org/r/20201121032042.6195-1-s-anna@ti.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 63 +++++++++++++++++++++++++++++++++++ drivers/remoteproc/remoteproc_sysfs.c | 33 ++---------------- include/linux/remoteproc.h | 1 + 3 files changed, 66 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index dab2c0f5caf0..46c2937ebea9 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1934,6 +1934,69 @@ struct rproc *rproc_get_by_phandle(phandle phandle) #endif EXPORT_SYMBOL(rproc_get_by_phandle); +/** + * rproc_set_firmware() - assign a new firmware + * @rproc: rproc handle to which the new firmware is being assigned + * @fw_name: new firmware name to be assigned + * + * This function allows remoteproc drivers or clients to configure a custom + * firmware name that is different from the default name used during remoteproc + * registration. The function does not trigger a remote processor boot, + * only sets the firmware name used for a subsequent boot. This function + * should also be called only when the remote processor is offline. + * + * This allows either the userspace to configure a different name through + * sysfs or a kernel-level remoteproc or a remoteproc client driver to set + * a specific firmware when it is controlling the boot and shutdown of the + * remote processor. 
+ * + * Return: 0 on success or a negative value upon failure + */ +int rproc_set_firmware(struct rproc *rproc, const char *fw_name) +{ + struct device *dev; + int ret, len; + char *p; + + if (!rproc || !fw_name) + return -EINVAL; + + dev = rproc->dev.parent; + + ret = mutex_lock_interruptible(&rproc->lock); + if (ret) { + dev_err(dev, "can't lock rproc %s: %d\n", rproc->name, ret); + return -EINVAL; + } + + if (rproc->state != RPROC_OFFLINE) { + dev_err(dev, "can't change firmware while running\n"); + ret = -EBUSY; + goto out; + } + + len = strcspn(fw_name, "\n"); + if (!len) { + dev_err(dev, "can't provide empty string for firmware name\n"); + ret = -EINVAL; + goto out; + } + + p = kstrndup(fw_name, len, GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto out; + } + + kfree(rproc->firmware); + rproc->firmware = p; + +out: + mutex_unlock(&rproc->lock); + return ret; +} +EXPORT_SYMBOL(rproc_set_firmware); + static int rproc_validate(struct rproc *rproc) { switch (rproc->state) { diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index d1cf7bf277c4..1dbef895e65e 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -154,38 +154,9 @@ static ssize_t firmware_store(struct device *dev, const char *buf, size_t count) { struct rproc *rproc = to_rproc(dev); - char *p; - int err, len = count; + int err; - err = mutex_lock_interruptible(&rproc->lock); - if (err) { - dev_err(dev, "can't lock rproc %s: %d\n", rproc->name, err); - return -EINVAL; - } - - if (rproc->state != RPROC_OFFLINE) { - dev_err(dev, "can't change firmware while running\n"); - err = -EBUSY; - goto out; - } - - len = strcspn(buf, "\n"); - if (!len) { - dev_err(dev, "can't provide a NULL firmware\n"); - err = -EINVAL; - goto out; - } - - p = kstrndup(buf, len, GFP_KERNEL); - if (!p) { - err = -ENOMEM; - goto out; - } - - kfree(rproc->firmware); - rproc->firmware = p; -out: - mutex_unlock(&rproc->lock); + err = rproc_set_firmware(rproc, buf); return err ? err : count; } diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 3fa3ba6498e8..e8ac041c64d9 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -653,6 +653,7 @@ rproc_of_resm_mem_entry_init(struct device *dev, u32 of_resm_idx, size_t len, int rproc_boot(struct rproc *rproc); void rproc_shutdown(struct rproc *rproc); +int rproc_set_firmware(struct rproc *rproc, const char *fw_name); void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type); int rproc_coredump_add_segment(struct rproc *rproc, dma_addr_t da, size_t size); int rproc_coredump_add_custom_segment(struct rproc *rproc, -- cgit From 7656ca71b0ba04ae7437afe3d046e9134442678e Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 25 Nov 2020 15:06:41 +0300 Subject: usb: pd: DFP product types USB Power Delivery Specification R3.0 introduced separate field for the DFP product type to the ID Header VDO. 
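For illustration, a port driver can decode the new field with the accessor added below (sketch):

/* bits 25:23 of the ID Header VDO carry the DFP product type */
static bool partner_is_dfp_host(u32 id_header_vdo)
{
	return PD_IDH_DFP_PTYPE(id_header_vdo) == IDH_PTYPE_DFP_HOST;
}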
Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20201125120642.37156-2-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd_vdo.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/pd_vdo.h b/include/linux/usb/pd_vdo.h index 8c5cb5830754..8c08eeb9a74b 100644 --- a/include/linux/usb/pd_vdo.h +++ b/include/linux/usb/pd_vdo.h @@ -103,17 +103,25 @@ * -------------------- * <31> :: data capable as a USB host * <30> :: data capable as a USB device - * <29:27> :: product type + * <29:27> :: product type (UFP / Cable) * <26> :: modal operation supported (1b == yes) - * <25:16> :: Reserved, Shall be set to zero + * <25:16> :: product type (DFP) * <15:0> :: USB-IF assigned VID for this cable vendor */ #define IDH_PTYPE_UNDEF 0 #define IDH_PTYPE_HUB 1 #define IDH_PTYPE_PERIPH 2 +#define IDH_PTYPE_PSD 3 +#define IDH_PTYPE_AMA 5 + #define IDH_PTYPE_PCABLE 3 #define IDH_PTYPE_ACABLE 4 -#define IDH_PTYPE_AMA 5 + +#define IDH_PTYPE_DFP_UNDEF 0 +#define IDH_PTYPE_DFP_HUB 1 +#define IDH_PTYPE_DFP_HOST 2 +#define IDH_PTYPE_DFP_PB 3 +#define IDH_PTYPE_DFP_AMC 4 #define VDO_IDH(usbh, usbd, ptype, is_modal, vid) \ ((usbh) << 31 | (usbd) << 30 | ((ptype) & 0x7) << 27 \ @@ -122,6 +130,7 @@ #define PD_IDH_PTYPE(vdo) (((vdo) >> 27) & 0x7) #define PD_IDH_VID(vdo) ((vdo) & 0xffff) #define PD_IDH_MODAL_SUPP(vdo) ((vdo) & (1 << 26)) +#define PD_IDH_DFP_PTYPE(vdo) (((vdo) >> 23) & 0x7) /* * Cert Stat VDO -- cgit From 640586f8af356096e084d69a9909d217852bde48 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 19 Nov 2020 17:02:21 +0100 Subject: powerpc/ptrace: Simplify gpr_get()/tm_cgpr_get() gpr_get() does membuf_write() twice to override pt_regs->msr in between. We can call membuf_write() once and change ->msr in the kernel buffer, this simplifies the code and the next fix. The patch adds a new simple helper, membuf_at(offs), it returns the new membuf which can be safely used after membuf_write(). 
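For illustration, the resulting pattern is (a sketch distilled from the gpr_get() change below, with regs/msr_value as placeholders):

	struct membuf to_msr = membuf_at(&to, offsetof(struct pt_regs, msr));

	/* one contiguous write of the whole register block ... */
	membuf_write(&to, regs, sizeof(struct user_pt_regs));

	/* ... then patch the msr slot in place via the saved sub-membuf */
	membuf_store(&to_msr, msr_value);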
Signed-off-by: Oleg Nesterov [mpe: Fixup some minor whitespace issues noticed by Christophe] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201119160221.GA5188@redhat.com --- arch/powerpc/kernel/ptrace/ptrace-tm.c | 12 ++++-------- arch/powerpc/kernel/ptrace/ptrace-view.c | 10 +++------- include/linux/regset.h | 12 ++++++++++++ 3 files changed, 19 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/ptrace/ptrace-tm.c b/arch/powerpc/kernel/ptrace/ptrace-tm.c index 54f2d076206f..f15cbbab45b7 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-tm.c +++ b/arch/powerpc/kernel/ptrace/ptrace-tm.c @@ -86,6 +86,8 @@ int tm_cgpr_active(struct task_struct *target, const struct user_regset *regset) int tm_cgpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { + struct membuf to_msr = membuf_at(&to, offsetof(struct pt_regs, msr)); + if (!cpu_has_feature(CPU_FTR_TM)) return -ENODEV; @@ -96,16 +98,10 @@ int tm_cgpr_get(struct task_struct *target, const struct user_regset *regset, flush_fp_to_thread(target); flush_altivec_to_thread(target); - membuf_write(&to, &target->thread.ckpt_regs, - offsetof(struct pt_regs, msr)); - membuf_store(&to, get_user_ckpt_msr(target)); + membuf_write(&to, &target->thread.ckpt_regs, sizeof(struct user_pt_regs)); - BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != - offsetof(struct pt_regs, msr) + sizeof(long)); + membuf_store(&to_msr, get_user_ckpt_msr(target)); - membuf_write(&to, &target->thread.ckpt_regs.orig_gpr3, - sizeof(struct user_pt_regs) - - offsetof(struct pt_regs, orig_gpr3)); return membuf_zero(&to, ELF_NGREG * sizeof(unsigned long) - sizeof(struct user_pt_regs)); } diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 7e6478e7ed07..299e0b6d709d 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -217,6 +217,7 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data) static int gpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { + struct membuf to_msr = membuf_at(&to, offsetof(struct pt_regs, msr)); int i; if (target->thread.regs == NULL) @@ -228,15 +229,10 @@ static int gpr_get(struct task_struct *target, const struct user_regset *regset, target->thread.regs->gpr[i] = NV_REG_POISON; } - membuf_write(&to, target->thread.regs, offsetof(struct pt_regs, msr)); - membuf_store(&to, get_user_msr(target)); + membuf_write(&to, target->thread.regs, sizeof(struct user_pt_regs)); - BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != - offsetof(struct pt_regs, msr) + sizeof(long)); + membuf_store(&to_msr, get_user_msr(target)); - membuf_write(&to, &target->thread.regs->orig_gpr3, - sizeof(struct user_pt_regs) - - offsetof(struct pt_regs, orig_gpr3)); return membuf_zero(&to, ELF_NGREG * sizeof(unsigned long) - sizeof(struct user_pt_regs)); } diff --git a/include/linux/regset.h b/include/linux/regset.h index c3403f328257..a00765f0e8cf 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -46,6 +46,18 @@ static inline int membuf_write(struct membuf *s, const void *v, size_t size) return s->left; } +static inline struct membuf membuf_at(const struct membuf *s, size_t offs) +{ + struct membuf n = *s; + + if (offs > n.left) + offs = n.left; + n.p += offs; + n.left -= offs; + + return n; +} + /* current s->p must be aligned for v; v must be a scalar */ #define membuf_store(s, v) \ ({ \ -- cgit From 
11e5e568ceed7c8c570313a14fa96c72f21dad31 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 24 Nov 2020 17:48:04 -0800 Subject: usb: typec: tcpm: Stay in SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS till Rp is seen TD.4.7.3. Try SNK DRP Connect Try.SRC DRP fails. The compliance tester mimics being a Try.SRC USB-C port. The failure is due to TCPM exiting SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS when VBUS is not present even though SNK.Rp is seen. Exit to SRC_TRYWAIT from SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS only when SNK.Rp is not seen for PD_T_TRY_CC_DEBOUNCE. From the spec: The port shall then transition to Attached.SNK when the SNK.Rp state is detected on exactly one of the CC1 or CC2 pins for at least tTryCCDebounce and VBUS is detected. Alternatively, the port shall transition to TryWait.SRC if SNK.Rp state is not detected for tTryCCDebounce. Reviewed-by: Guenter Roeck Signed-off-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20201125014804.1596719-1-badhri@google.com Acked-by: Heikki Krogerus Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 18 +++++++++++++----- include/linux/usb/pd.h | 1 + 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 277b9d4d9c84..3bbc1f10af49 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -3124,15 +3124,13 @@ static void run_state_machine(struct tcpm_port *port) break; case SNK_TRY_WAIT_DEBOUNCE: tcpm_set_state(port, SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS, - PD_T_PD_DEBOUNCE); + PD_T_TRY_CC_DEBOUNCE); break; case SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS: - if (port->vbus_present && tcpm_port_is_sink(port)) { + if (port->vbus_present && tcpm_port_is_sink(port)) tcpm_set_state(port, SNK_ATTACHED, 0); - } else { - tcpm_set_state(port, SRC_TRYWAIT, 0); + else port->max_wait = 0; - } break; case SRC_TRYWAIT: tcpm_set_cc(port, tcpm_rp_cc(port)); @@ -4053,6 +4051,12 @@ static void _tcpm_cc_change(struct tcpm_port *port, enum typec_cc_status cc1, if (!tcpm_port_is_sink(port)) tcpm_set_state(port, SNK_TRYWAIT_DEBOUNCE, 0); break; + case SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS: + if (!tcpm_port_is_sink(port)) + tcpm_set_state(port, SRC_TRYWAIT, PD_T_TRY_CC_DEBOUNCE); + else + tcpm_set_state(port, SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS, 0); + break; case SNK_TRYWAIT: /* Do nothing, waiting for tCCDebounce */ break; @@ -4139,6 +4143,10 @@ static void _tcpm_pd_vbus_on(struct tcpm_port *port) case SNK_TRYWAIT_DEBOUNCE: /* Do nothing, waiting for Rp */ break; + case SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS: + if (port->vbus_present && tcpm_port_is_sink(port)) + tcpm_set_state(port, SNK_ATTACHED, 0); + break; case SRC_TRY_WAIT: case SRC_TRY_DEBOUNCE: /* Do nothing, waiting for sink detection */ diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 3a805e2ecbc9..63a66dd5d832 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -484,6 +484,7 @@ static inline unsigned int rdo_max_power(u32 rdo) #define PD_T_CC_DEBOUNCE 200 /* 100 - 200 ms */ #define PD_T_PD_DEBOUNCE 20 /* 10 - 20 ms */ +#define PD_T_TRY_CC_DEBOUNCE 15 /* 10 - 20 ms */ #define PD_N_CAPS_COUNT (PD_T_NO_RESPONSE / PD_T_SEND_SOURCE_CAP) #define PD_N_HARD_RESET_COUNT 2 -- cgit From 07e21d4d96493fd0a8220ab134855253a34a9c84 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Wed, 4 Nov 2020 01:22:22 +0800 Subject: soundwire: SDCA: add helper macro to access controls The upcoming SDCA (SoundWire Device Class Audio) specification defines a hierarchical
encoding to interface with Class-defined capabilities. The specification is not yet accessible to the general public but this information is released with explicit permission from the MIPI Board to avoid delays with SDCA support on Linux platforms. A block of 64 MBytes of register addresses is allocated to SDCA controls, starting at address 0x40000000. The 26 LSBs which identify individual controls are set based on the following variables:

- Function Number. An SDCA device can be split into up to 8 independent Functions. Each of these Functions is described in the SDCA specification, e.g. Smart Amplifier, Smart Microphone, Simple Microphone, Jack codec, HID, etc.

- Entity Number. Within each Function, an Entity is an identifiable block. Up to 127 Entities are connected in a pre-defined graph (similar to USB), with Entity0 reserved for Function-level configurations. In contrast to USB, the SDCA spec pre-defines Function Types, topologies, and allowed options, i.e. the degrees of freedom are deliberately limited to reduce the possibility of errors in descriptors leading to software quirks.

- Control Selector. Within each Entity, the SDCA specification defines 48 controls such as Mute, Gain, AGC, etc., and 16 implementation-defined ones. Some Control Selectors might be used for low-level platform setup, and others exposed to applications and users. Note that the same Control Selector capability, e.g. Latency control, might be located at different offsets in different entities; the Control Selector mapping is Entity-specific.

- Control Number. Some Control Selectors allow channel-specific values to be set, with up to 64 channels allowed. This is mostly used for volume control.

- Current/Next values. Some Control Selectors are 'Dual-Ranked'. Software may update the Current value directly for immediate effect. Alternatively, software may write into the 'Next' values and update the SoundWire 1.2 'Commit Groups' register to copy 'Next' values into 'Current' ones in a synchronized manner. This is different from bank switching, which is typically used to change the bus configuration only.

- MBQ. The Multi-Byte Quantity bit is used to provide atomic updates when accessing more than one byte; for example, a 16-bit volume control would be updated consistently, and intermediate values mixing the old MSB with the new LSB are not applied.

These 6 parameters are used to build a 32-bit address to access the desired Controls. Because of the address range, paging is required, but the most often used parameter values are placed in the lower 16 bits of the address. This helps to keep the paging registers constant while updating Controls for a specific Device/Function.
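As a concrete illustration (hypothetical parameter values; the macros themselves are introduced in the diff below), the address of, say, Control Selector 0x01 in Entity 0x02 of Function 1, channel 0, and its MBQ/Next companion addresses could be built as:

	u32 reg = SDW_SDCA_CTL(1, 0x02, 0x01, 0);

	u32 reg_mbq = SDW_SDCA_MBQ_CTL(reg);	/* sets BIT(13): atomic multi-byte access */
	u32 reg_next = SDW_SDCA_NEXT_CTL(reg);	/* sets BIT(14): dual-ranked 'Next' value */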
Reviewed-by: Rander Wang Reviewed-by: Guennadi Liakhovetski Reviewed-by: Kai Vehmanen Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20201103172226.4278-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_registers.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_registers.h b/include/linux/soundwire/sdw_registers.h index f420e8059779..e14dff9a9c7f 100644 --- a/include/linux/soundwire/sdw_registers.h +++ b/include/linux/soundwire/sdw_registers.h @@ -298,4 +298,36 @@ #define SDW_CASC_PORT_MASK_INTSTAT3 1 #define SDW_CASC_PORT_REG_OFFSET_INTSTAT3 2 +/* + * v1.2 device - SDCA address mapping + * + * Spec definition + * Bits Contents + * 31 0 (required by addressing range) + * 30:26 0b10000 (Control Prefix) + * 25 0 (Reserved) + * 24:22 Function Number [2:0] + * 21 Entity[6] + * 20:19 Control Selector[5:4] + * 18 0 (Reserved) + * 17:15 Control Number[5:3] + * 14 Next + * 13 MBQ + * 12:7 Entity[5:0] + * 6:3 Control Selector[3:0] + * 2:0 Control Number[2:0] + */ + +#define SDW_SDCA_CTL(fun, ent, ctl, ch) (BIT(30) | \ + (((fun) & 0x7) << 22) | \ + (((ent) & 0x40) << 15) | \ + (((ent) & 0x3f) << 7) | \ + (((ctl) & 0x30) << 15) | \ + (((ctl) & 0x0f) << 3) | \ + (((ch) & 0x38) << 12) | \ + ((ch) & 0x07)) + +#define SDW_SDCA_MBQ_CTL(reg) ((reg) | BIT(13)) +#define SDW_SDCA_NEXT_CTL(reg) ((reg) | BIT(14)) + #endif /* __SDW_REGISTERS_H */ -- cgit From fb5103f9d6ce197b4d0b67b4e60e68470f5293d1 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Wed, 4 Nov 2020 01:22:23 +0800 Subject: regmap/SoundWire: sdw: add support for SoundWire 1.2 MBQ The SoundWire 1.1 specification only allowed for reads and writes of bytes. The SoundWire 1.2 specification adds a new capability to transfer "Multi-Byte Quantities" (MBQ) across the bus. The transfers still happen one byte at a time, but the update is atomic. For example, when writing a 16-bit volume, the first byte transferred is only taken into account when the second byte is successfully transferred. The mechanism is symmetrical for reads and writes: - On a read, the address of the last byte to be read is modified by setting the MBQ bit. - On a write, the addresses of all but the last byte to be written are modified by setting the MBQ bit. The address for the last byte relies on the MBQ bit being cleared. The current definitions for MBQ-based controls in the SDCA draft standard are limited to 16 bits for volumes, so for now this is the only supported format. An update will be provided if and when support for 24-bit and 32-bit values is specified by the SDCA standard. One possible objection is that this code could have been handled with regmap-sdw.c. However this is a new spec addition not handled by SoundWire 1.1 and non-SDCA devices, so there's no reason to load code that will never be used. Also in practice it's extremely unlikely that CONFIG_REGMAP would not be selected with CONFIG_REGMAP_SOUNDWIRE_MBQ selected. However there's no functional dependency between the two modules, so they can be selected separately.
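For orientation, a codec driver would create such a regmap roughly as follows (a hypothetical probe sketch using the devm_ helper added below; the example_* names are made up, and the regmap_config values follow the constraints enforced by regmap_sdw_mbq_config_check() in the diff):

	static const struct regmap_config example_mbq_regmap_config = {
		.reg_bits = 32,	/* full SDCA register addresses */
		.val_bits = 16,	/* only 16-bit MBQ values are supported for now */
	};

	static int example_sdw_probe(struct sdw_slave *slave,
				     const struct sdw_device_id *id)
	{
		struct regmap *regmap;

		regmap = devm_regmap_init_sdw_mbq(slave, &example_mbq_regmap_config);
		if (IS_ERR(regmap))
			return PTR_ERR(regmap);

		/* regmap_read()/regmap_write() now perform paired MBQ accesses */
		return 0;
	}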
Reviewed-by: Rander Wang Reviewed-by: Guennadi Liakhovetski Reviewed-by: Kai Vehmanen Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Vinod Koul Link: https://lore.kernel.org/r/20201103172226.4278-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- drivers/base/regmap/Kconfig | 6 ++- drivers/base/regmap/Makefile | 1 + drivers/base/regmap/regmap-sdw-mbq.c | 101 +++++++++++++++++++++++++++++++++++ include/linux/regmap.h | 35 ++++++++++++ 4 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 drivers/base/regmap/regmap-sdw-mbq.c (limited to 'include/linux') diff --git a/drivers/base/regmap/Kconfig b/drivers/base/regmap/Kconfig index bcb90d8c3960..50b1e2d06a25 100644 --- a/drivers/base/regmap/Kconfig +++ b/drivers/base/regmap/Kconfig @@ -4,7 +4,7 @@ # subsystems should select the appropriate symbols. config REGMAP - default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ || REGMAP_SOUNDWIRE || REGMAP_SCCB || REGMAP_I3C || REGMAP_SPI_AVMM) + default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ || REGMAP_SOUNDWIRE || REGMAP_SOUNDWIRE_MBQ || REGMAP_SCCB || REGMAP_I3C || REGMAP_SPI_AVMM) select IRQ_DOMAIN if REGMAP_IRQ bool @@ -46,6 +46,10 @@ config REGMAP_SOUNDWIRE tristate depends on SOUNDWIRE +config REGMAP_SOUNDWIRE_MBQ + tristate + depends on SOUNDWIRE + config REGMAP_SCCB tristate depends on I2C diff --git a/drivers/base/regmap/Makefile b/drivers/base/regmap/Makefile index ac1b69ee4051..33f63adb5b3d 100644 --- a/drivers/base/regmap/Makefile +++ b/drivers/base/regmap/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_REGMAP_MMIO) += regmap-mmio.o obj-$(CONFIG_REGMAP_IRQ) += regmap-irq.o obj-$(CONFIG_REGMAP_W1) += regmap-w1.o obj-$(CONFIG_REGMAP_SOUNDWIRE) += regmap-sdw.o +obj-$(CONFIG_REGMAP_SOUNDWIRE_MBQ) += regmap-sdw-mbq.o obj-$(CONFIG_REGMAP_SCCB) += regmap-sccb.o obj-$(CONFIG_REGMAP_I3C) += regmap-i3c.o obj-$(CONFIG_REGMAP_SPI_AVMM) += regmap-spi-avmm.o diff --git a/drivers/base/regmap/regmap-sdw-mbq.c b/drivers/base/regmap/regmap-sdw-mbq.c new file mode 100644 index 000000000000..8ce30650b97c --- /dev/null +++ b/drivers/base/regmap/regmap-sdw-mbq.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2020 Intel Corporation. 
+ +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int regmap_sdw_mbq_write(void *context, unsigned int reg, unsigned int val) +{ + struct device *dev = context; + struct sdw_slave *slave = dev_to_sdw_dev(dev); + int ret; + + ret = sdw_write(slave, SDW_SDCA_MBQ_CTL(reg), (val >> 8) & 0xff); + if (ret < 0) + return ret; + + return sdw_write(slave, reg, val & 0xff); +} + +static int regmap_sdw_mbq_read(void *context, unsigned int reg, unsigned int *val) +{ + struct device *dev = context; + struct sdw_slave *slave = dev_to_sdw_dev(dev); + int read0; + int read1; + + read0 = sdw_read(slave, reg); + if (read0 < 0) + return read0; + + read1 = sdw_read(slave, SDW_SDCA_MBQ_CTL(reg)); + if (read1 < 0) + return read1; + + *val = (read1 << 8) | read0; + + return 0; +} + +static struct regmap_bus regmap_sdw_mbq = { + .reg_read = regmap_sdw_mbq_read, + .reg_write = regmap_sdw_mbq_write, + .reg_format_endian_default = REGMAP_ENDIAN_LITTLE, + .val_format_endian_default = REGMAP_ENDIAN_LITTLE, +}; + +static int regmap_sdw_mbq_config_check(const struct regmap_config *config) +{ + /* MBQ-based controls are only 16-bits for now */ + if (config->val_bits != 16) + return -ENOTSUPP; + + /* Registers are 32 bits wide */ + if (config->reg_bits != 32) + return -ENOTSUPP; + + if (config->pad_bits != 0) + return -ENOTSUPP; + + return 0; +} + +struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw, + const struct regmap_config *config, + struct lock_class_key *lock_key, + const char *lock_name) +{ + int ret; + + ret = regmap_sdw_mbq_config_check(config); + if (ret) + return ERR_PTR(ret); + + return __regmap_init(&sdw->dev, ®map_sdw_mbq, + &sdw->dev, config, lock_key, lock_name); +} +EXPORT_SYMBOL_GPL(__regmap_init_sdw_mbq); + +struct regmap *__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw, + const struct regmap_config *config, + struct lock_class_key *lock_key, + const char *lock_name) +{ + int ret; + + ret = regmap_sdw_mbq_config_check(config); + if (ret) + return ERR_PTR(ret); + + return __devm_regmap_init(&sdw->dev, ®map_sdw_mbq, + &sdw->dev, config, lock_key, lock_name); +} +EXPORT_SYMBOL_GPL(__devm_regmap_init_sdw_mbq); + +MODULE_DESCRIPTION("Regmap SoundWire MBQ Module"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/regmap.h b/include/linux/regmap.h index e7834d98207f..a652d1474d6a 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -570,6 +570,10 @@ struct regmap *__regmap_init_sdw(struct sdw_slave *sdw, const struct regmap_config *config, struct lock_class_key *lock_key, const char *lock_name); +struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw, + const struct regmap_config *config, + struct lock_class_key *lock_key, + const char *lock_name); struct regmap *__regmap_init_spi_avmm(struct spi_device *spi, const struct regmap_config *config, struct lock_class_key *lock_key, @@ -619,6 +623,10 @@ struct regmap *__devm_regmap_init_sdw(struct sdw_slave *sdw, const struct regmap_config *config, struct lock_class_key *lock_key, const char *lock_name); +struct regmap *__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw, + const struct regmap_config *config, + struct lock_class_key *lock_key, + const char *lock_name); struct regmap *__devm_regmap_init_slimbus(struct slim_device *slimbus, const struct regmap_config *config, struct lock_class_key *lock_key, @@ -817,6 +825,19 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); __regmap_lockdep_wrapper(__regmap_init_sdw, #config, \ sdw, config) +/** + * 
regmap_init_sdw_mbq() - Initialise register map + * + * @sdw: Device that will be interacted with + * @config: Configuration for register map + * + * The return value will be an ERR_PTR() on error or a valid pointer to + * a struct regmap. + */ +#define regmap_init_sdw_mbq(sdw, config) \ + __regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config, \ + sdw, config) + /** * regmap_init_spi_avmm() - Initialize register map for Intel SPI Slave * to AVMM Bus Bridge @@ -989,6 +1010,20 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); __regmap_lockdep_wrapper(__devm_regmap_init_sdw, #config, \ sdw, config) +/** + * devm_regmap_init_sdw_mbq() - Initialise managed register map + * + * @sdw: Device that will be interacted with + * @config: Configuration for register map + * + * The return value will be an ERR_PTR() on error or a valid pointer + * to a struct regmap. The regmap will be automatically freed by the + * device management code. + */ +#define devm_regmap_init_sdw_mbq(sdw, config) \ + __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, #config, \ + sdw, config) + /** * devm_regmap_init_slimbus() - Initialise managed register map * -- cgit From 4df910620bebb5cfe234af16ac8f6474b60215fd Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 25 Nov 2020 13:22:21 +0800 Subject: mm: memcg: relayout structure mem_cgroup to avoid cache interference 0day reported a -22.7% regression for the will-it-scale page_fault2 case [1] on a 4-socket, 144-CPU platform, and bisected it to Waiman's optimization (commit bd0b230fe1) of saving one 'struct page_counter' space for 'struct mem_cgroup'. Initially we thought it was due to the cache alignment change introduced by the patch, but further debugging shows that it is due to some hot data members ('vmstats_local', 'vmstats_percpu', 'vmstats') sitting in 2 adjacent cache lines (cache lines 2N and 2N+1), and when adjacent cache line prefetch is enabled, this triggers an "extended level" of cache false sharing across the 2 adjacent cache lines. So exchange the 2 member blocks, while keeping mostly the original cache alignment, which can restore and even enhance the performance, and save 64 bytes of space for 'struct mem_cgroup' (from 2880 to 2816, with 0day's default RHEL-8.3 kernel config) [1]. https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/ Fixes: bd0b230fe145 ("mm/memcg: unify swap and memsw page counters") Reported-by: kernel test robot Signed-off-by: Feng Tang Acked-by: Waiman Long Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a80c59af2c60..922a7f600465 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -282,20 +282,6 @@ struct mem_cgroup { MEMCG_PADDING(_pad1_); - /* - * set > 0 if pages under this cgroup are moving to other cgroup.
- */ - atomic_t moving_account; - struct task_struct *move_lock_task; - - /* Legacy local VM stats and events */ - struct memcg_vmstats_percpu __percpu *vmstats_local; - - /* Subtree VM stats and events (batched updates) */ - struct memcg_vmstats_percpu __percpu *vmstats_percpu; - - MEMCG_PADDING(_pad2_); - atomic_long_t vmstats[MEMCG_NR_STAT]; atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; @@ -317,6 +303,20 @@ struct mem_cgroup { struct list_head objcg_list; /* list of inherited objcgs */ #endif + MEMCG_PADDING(_pad2_); + + /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + struct task_struct *move_lock_task; + + /* Legacy local VM stats and events */ + struct memcg_vmstats_percpu __percpu *vmstats_local; + + /* Subtree VM stats and events (batched updates) */ + struct memcg_vmstats_percpu __percpu *vmstats_percpu; + #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; -- cgit From 2a2970891647fee7e7ea767425f895140faffaa8 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Fri, 20 Nov 2020 15:03:24 -0800 Subject: net/mlx5: Add sample offload hardware bits and structures Hardware introduces flow sampler object for packet sampling. Add the offload hardware bits and structures. Signed-off-by: Chris Mi Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 651591a2965d..65ea35af0527 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10657,11 +10657,13 @@ struct mlx5_ifc_affiliated_event_header_bits { enum { MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT(0xc), MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT(0x13), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT(0x20), }; enum { MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc, MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13, + MLX5_GENERAL_OBJECT_TYPES_SAMPLER = 0x20, }; enum { @@ -10736,6 +10738,33 @@ struct mlx5_ifc_create_encryption_key_in_bits { struct mlx5_ifc_encryption_key_obj_bits encryption_key_object; }; +struct mlx5_ifc_sampler_obj_bits { + u8 modify_field_select[0x40]; + + u8 table_type[0x8]; + u8 level[0x8]; + u8 reserved_at_50[0xf]; + u8 ignore_flow_level[0x1]; + + u8 sample_ratio[0x20]; + + u8 reserved_at_80[0x8]; + u8 sample_table_id[0x18]; + + u8 reserved_at_a0[0x8]; + u8 default_table_id[0x18]; + + u8 sw_steering_icm_address_rx[0x40]; + u8 sw_steering_icm_address_tx[0x40]; + + u8 reserved_at_140[0xa0]; +}; + +struct mlx5_ifc_create_sampler_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_sampler_obj_bits sampler_object; +}; + enum { MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_128 = 0x0, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_256 = 0x1, -- cgit From 38730630880c6f47ad73dd90524ff52443b8bc48 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Fri, 20 Nov 2020 15:03:25 -0800 Subject: net/mlx5: Add sampler destination type The flow sampler object is a new destination type. Add a new member for the flow destination. 
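For illustration, steering matched traffic to a sampler would look roughly like this (hypothetical snippet; sampler_obj_id stands in for an object id obtained via the general-object commands from the previous patch):

	struct mlx5_flow_destination dest = {};

	dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER;
	dest.sampler_id = sampler_obj_id;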
Signed-off-by: Chris Mi Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 3 +++ include/linux/mlx5/fs.h | 1 + include/linux/mlx5/mlx5_ifc.h | 1 + 4 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c index a700f3c86899..87d65f6b5310 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c @@ -247,6 +247,9 @@ const char *parse_fs_dst(struct trace_seq *p, case MLX5_FLOW_DESTINATION_TYPE_TIR: trace_seq_printf(p, "tir=%u\n", dst->tir_num); break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER: + trace_seq_printf(p, "sampler_id=%u\n", dst->sampler_id); + break; case MLX5_FLOW_DESTINATION_TYPE_COUNTER: trace_seq_printf(p, "counter_id=%u\n", counter_id); break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index babe3405132a..c2fed9c3d75c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -515,6 +515,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, dst->dest_attr.vport.pkt_reformat->id); } break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER: + id = dst->dest_attr.sampler_id; + break; default: id = dst->dest_attr.tir_num; } diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 846d94ad04bc..35d2cc1646d3 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -132,6 +132,7 @@ struct mlx5_flow_destination { struct mlx5_pkt_reformat *pkt_reformat; u8 flags; } vport; + u32 sampler_id; }; }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 65ea35af0527..2f2add4bd5e1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1616,6 +1616,7 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_VPORT = 0x0, MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, MLX5_FLOW_DESTINATION_TYPE_TIR = 0x2, + MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6, MLX5_FLOW_DESTINATION_TYPE_PORT = 0x99, MLX5_FLOW_DESTINATION_TYPE_COUNTER = 0x100, -- cgit From 7da3ad6c26f41f403fe6823c3de242551db09c37 Mon Sep 17 00:00:00 2001 From: Muhammad Sammar Date: Fri, 20 Nov 2020 15:03:27 -0800 Subject: net/mlx5: Add misc4 to mlx5_ifc_fte_match_param_bits Add misc4 match params to enable matching on prog_sample_fields. Signed-off-by: Muhammad Sammar Reviewed-by: Alex Vesker Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 +- include/linux/mlx5/device.h | 1 + include/linux/mlx5/mlx5_ifc.h | 25 ++++++++++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 2 +- 4 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index afe7f0bffb93..b24a9849c45e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -194,7 +194,7 @@ struct mlx5_ft_underlay_qp { u32 qpn; }; -#define MLX5_FTE_MATCH_PARAM_RESERVED reserved_at_a00 +#define MLX5_FTE_MATCH_PARAM_RESERVED reserved_at_c00 /* Calculate the fte_match_param length and without the reserved length. * Make sure the reserved field is the last. 
*/ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index cf824366a7d1..e9639c4cf2ed 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1076,6 +1076,7 @@ enum { MLX5_MATCH_INNER_HEADERS = 1 << 2, MLX5_MATCH_MISC_PARAMETERS_2 = 1 << 3, MLX5_MATCH_MISC_PARAMETERS_3 = 1 << 4, + MLX5_MATCH_MISC_PARAMETERS_4 = 1 << 5, }; enum { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2f2add4bd5e1..11c24fafd7f2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -623,6 +623,26 @@ struct mlx5_ifc_fte_match_set_misc3_bits { u8 reserved_at_140[0xc0]; }; +struct mlx5_ifc_fte_match_set_misc4_bits { + u8 prog_sample_field_value_0[0x20]; + + u8 prog_sample_field_id_0[0x20]; + + u8 prog_sample_field_value_1[0x20]; + + u8 prog_sample_field_id_1[0x20]; + + u8 prog_sample_field_value_2[0x20]; + + u8 prog_sample_field_id_2[0x20]; + + u8 prog_sample_field_value_3[0x20]; + + u8 prog_sample_field_id_3[0x20]; + + u8 reserved_at_100[0x100]; +}; + struct mlx5_ifc_cmd_pas_bits { u8 pa_h[0x20]; @@ -1669,7 +1689,9 @@ struct mlx5_ifc_fte_match_param_bits { struct mlx5_ifc_fte_match_set_misc3_bits misc_parameters_3; - u8 reserved_at_a00[0x600]; + struct mlx5_ifc_fte_match_set_misc4_bits misc_parameters_4; + + u8 reserved_at_c00[0x400]; }; enum { @@ -5462,6 +5484,7 @@ enum { MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2 = 0x3, MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_3 = 0x4, + MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_4 = 0x5, }; struct mlx5_ifc_query_flow_group_out_bits { diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index e24d66d278cf..3fd9b380a091 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -232,7 +232,7 @@ enum mlx5_ib_device_query_context_attrs { MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX = (1U << UVERBS_ID_NS_SHIFT), }; -#define MLX5_IB_DW_MATCH_PARAM 0x80 +#define MLX5_IB_DW_MATCH_PARAM 0x90 struct mlx5_ib_match_params { __u32 match_params[MLX5_IB_DW_MATCH_PARAM]; -- cgit From 59d2ae1db89f5a491d48f798ffccef36cc69ca65 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Fri, 20 Nov 2020 15:03:28 -0800 Subject: net/mlx5: Add ts_cqe_to_dest_cqn related bits Add a bit in HCA capabilities layout to indicate if ts_cqe_to_dest_cqn is supported. In addition, add ts_cqe_to_dest_cqn field to SQ context, for driver to set the actual CQN. 
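A driver is expected to consume these bits roughly as follows (hypothetical sketch; ts_cqn stands in for the CQ chosen for timestamp completions, and sqc is assumed to point at the SQ context inside a create_sq command buffer):

	if (MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn))
		MLX5_SET(sqc, sqc, ts_cqe_to_dest_cqn, ts_cqn);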
Signed-off-by: Eran Ben Elisha Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 11c24fafd7f2..632b9a61fda5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1261,7 +1261,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 ece_support[0x1]; u8 reserved_at_a4[0x7]; u8 log_max_srq[0x5]; - u8 reserved_at_b0[0x10]; + u8 reserved_at_b0[0x2]; + u8 ts_cqe_to_dest_cqn[0x1]; + u8 reserved_at_b3[0xd]; u8 max_sgl_for_optimized_performance[0x8]; u8 log_max_cq_sz[0x8]; @@ -3312,8 +3314,12 @@ struct mlx5_ifc_sqc_bits { u8 reserved_at_80[0x10]; u8 hairpin_peer_vhca[0x10]; - u8 reserved_at_a0[0x50]; + u8 reserved_at_a0[0x20]; + u8 reserved_at_c0[0x8]; + u8 ts_cqe_to_dest_cqn[0x18]; + + u8 reserved_at_e0[0x10]; u8 packet_pacing_rate_limit_index[0x10]; u8 tis_lst_sz[0x10]; u8 reserved_at_110[0x10]; -- cgit From e5dfe6b57e8eada1c238dd2c7020345b0d8f6454 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 20 Nov 2020 15:03:29 -0800 Subject: net/mlx5: Avoid exposing driver internal command helpers mlx5 command init and cleanup routines are internal to mlx5_core driver. Hence, avoid exporting them and move their definition to mlx5_core driver's internal file mlx5_core.h Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 3 --- drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 4 ++++ include/linux/mlx5/driver.h | 4 ---- 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index e49387dbef98..50c7b9ee80c3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -2142,7 +2142,6 @@ dma_pool_err: kvfree(cmd->stats); return err; } -EXPORT_SYMBOL(mlx5_cmd_init); void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) { @@ -2155,11 +2154,9 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) dma_pool_destroy(cmd->pool); kvfree(cmd->stats); } -EXPORT_SYMBOL(mlx5_cmd_cleanup); void mlx5_cmd_set_state(struct mlx5_core_dev *dev, enum mlx5_cmdif_state cmdif_state) { dev->cmd.state = cmdif_state; } -EXPORT_SYMBOL(mlx5_cmd_set_state); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 8cec85ab419d..9d00efa9e6bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -122,6 +122,10 @@ enum mlx5_semaphore_space_address { int mlx5_query_hca_caps(struct mlx5_core_dev *dev); int mlx5_query_board_id(struct mlx5_core_dev *dev); +int mlx5_cmd_init(struct mlx5_core_dev *dev); +void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); +void mlx5_cmd_set_state(struct mlx5_core_dev *dev, + enum mlx5_cmdif_state cmdif_state); int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id); int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev); int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index add85094f9a5..5e84b1d53650 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -888,10 +888,6 @@ enum { CMD_ALLOWED_OPCODE_ALL, }; -int mlx5_cmd_init(struct mlx5_core_dev *dev); -void mlx5_cmd_cleanup(struct mlx5_core_dev 
*dev); -void mlx5_cmd_set_state(struct mlx5_core_dev *dev, - enum mlx5_cmdif_state cmdif_state); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode); -- cgit From 349125ba232ea53d71a57c65c81f109c323cc369 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 20 Nov 2020 15:03:31 -0800 Subject: net/mlx5: Update the hardware interface definition for vhca state Update the hardware interface definitions to query and modify vhca state, related EQE and event code. Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 7 +++++++ include/linux/mlx5/mlx5_ifc.h | 17 ++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index e9639c4cf2ed..f1de49d64a98 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -346,6 +346,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_NIC_VPORT_CHANGE = 0xd, MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED = 0xe, + MLX5_EVENT_TYPE_VHCA_STATE_CHANGE = 0xf, MLX5_EVENT_TYPE_DCT_DRAINED = 0x1c, MLX5_EVENT_TYPE_DCT_KEY_VIOLATION = 0x1d, @@ -717,6 +718,11 @@ struct mlx5_eqe_sync_fw_update { u8 sync_rst_state; }; +struct mlx5_eqe_vhca_state { + __be16 ec_function; + __be16 function_id; +} __packed; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -736,6 +742,7 @@ union ev_data { struct mlx5_eqe_temp_warning temp_warning; struct mlx5_eqe_xrq_err xrq_err; struct mlx5_eqe_sync_fw_update sync_fw_update; + struct mlx5_eqe_vhca_state vhca_state; } __packed; struct mlx5_eqe { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 632b9a61fda5..3ace1976514c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -299,6 +299,8 @@ enum { MLX5_CMD_OP_CREATE_UMEM = 0xa08, MLX5_CMD_OP_DESTROY_UMEM = 0xa0a, MLX5_CMD_OP_SYNC_STEERING = 0xb00, + MLX5_CMD_OP_QUERY_VHCA_STATE = 0xb0d, + MLX5_CMD_OP_MODIFY_VHCA_STATE = 0xb0e, MLX5_CMD_OP_MAX }; @@ -1244,7 +1246,15 @@ enum mlx5_fc_bulk_alloc_bitmask { #define MLX5_FC_BULK_NUM_FCS(fc_enum) (MLX5_FC_BULK_SIZE_FACTOR * (fc_enum)) struct mlx5_ifc_cmd_hca_cap_bits { - u8 reserved_at_0[0x30]; + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x3]; + u8 event_on_vhca_state_teardown_request[0x1]; + u8 event_on_vhca_state_in_use[0x1]; + u8 event_on_vhca_state_active[0x1]; + u8 event_on_vhca_state_allocated[0x1]; + u8 event_on_vhca_state_invalid[0x1]; + u8 reserved_at_28[0x8]; u8 vhca_id[0x10]; u8 reserved_at_40[0x40]; @@ -1534,7 +1544,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 disable_local_lb_uc[0x1]; u8 disable_local_lb_mc[0x1]; u8 log_min_hairpin_wq_data_sz[0x5]; - u8 reserved_at_3e8[0x3]; + u8 reserved_at_3e8[0x2]; + u8 vhca_state[0x1]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; @@ -1602,7 +1613,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 max_num_of_monitor_counters[0x10]; u8 num_ppcnt_monitor_counters[0x10]; - u8 reserved_at_640[0x10]; + u8 max_num_sf[0x10]; u8 num_q_monitor_counters[0x10]; u8 reserved_at_660[0x20]; -- cgit From 21adf05d4584c99a07a604224b9cfeddcc6bc47c Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Fri, 20 Nov 2020 15:03:32 -0800 Subject: net/mlx5: Expose IP-in-IP TX and RX capability bits Expose FW indication that it supports stateless offloads for IP over IP tunneled packets per direction. 
In some HW, like ConnectX-4, IP-in-IP support is not symmetric: it supports steering on the inner header but not TX checksum and TSO offloads. Add an IP-in-IP capability per direction to cover this case as well. Note: the global tunnel_stateless_ip_over_ip is on only if both indications are turned on. Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3ace1976514c..96888f9f822d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -913,7 +913,10 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 tunnel_stateless_ipv4_over_vxlan[0x1]; u8 tunnel_stateless_ip_over_ip[0x1]; u8 insert_trailer[0x1]; - u8 reserved_at_2b[0x5]; + u8 reserved_at_2b[0x1]; + u8 tunnel_stateless_ip_over_ip_rx[0x1]; + u8 tunnel_stateless_ip_over_ip_tx[0x1]; + u8 reserved_at_2e[0x2]; u8 max_vxlan_udp_ports[0x8]; u8 reserved_at_38[0x6]; u8 max_geneve_opt_len[0x1]; -- cgit From 959af5569f57d189b5c4fccae45f16d5ff01aa39 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Fri, 20 Nov 2020 15:03:33 -0800 Subject: net/mlx5: Expose other function ifc bits Expose other-function ifc bits to enable setting HCA caps on behalf of another function. In addition, expose the vhca_resource_manager bit to control whether the other-function functionality is supported by firmware.
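Setting a capability on behalf of another function would then look roughly like this (hypothetical helper body; target_function_id and the capability payload are assumed, and error handling is condensed):

	void *in;
	int err;

	if (!MLX5_CAP_GEN(dev, vhca_resource_manager))
		return -EOPNOTSUPP;

	in = kvzalloc(MLX5_ST_SZ_BYTES(set_hca_cap_in), GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	/* ... fill the capability union with the desired caps ... */
	MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP);
	MLX5_SET(set_hca_cap_in, in, other_function, 1);
	MLX5_SET(set_hca_cap_in, in, function_id, target_function_id);
	err = mlx5_cmd_exec_in(dev, set_hca_cap, in);
	kvfree(in);
	return err;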
Signed-off-by: Parav Pandit Reviewed-by: Bodong Wang Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5e84b1d53650..d6ef3068d7d3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1133,7 +1133,7 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_VF; } -static inline bool mlx5_core_is_ecpf(struct mlx5_core_dev *dev) +static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; } -- cgit From 8a90f2fc67826a3876df1cb72e42d4a9a135f4fa Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 20 Nov 2020 15:03:37 -0800 Subject: net/mlx5: Rename peer_pf to host_pf To match the hardware spec, rename peer_pf to host_pf. Signed-off-by: Parav Pandit Reviewed-by: Bodong Wang Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/ecpf.c | 51 ++++++++++++++-------- .../net/ethernet/mellanox/mlx5/core/pagealloc.c | 12 ++--- include/linux/mlx5/driver.h | 2 +- 3 files changed, 40 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c index 3dc9dd3f24dc..68ca0e2b26cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c @@ -8,37 +8,52 @@ bool mlx5_read_embedded_cpu(struct mlx5_core_dev *dev) return (ioread32be(&dev->iseg->initializing) >> MLX5_ECPU_BIT_NUM) & 1; } -static int mlx5_peer_pf_init(struct mlx5_core_dev *dev) +static int mlx5_cmd_host_pf_enable_hca(struct mlx5_core_dev *dev) { - u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {}; - int err; + u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {}; + u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {}; MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA); - err = mlx5_cmd_exec_in(dev, enable_hca, in); + MLX5_SET(enable_hca_in, in, function_id, 0); + MLX5_SET(enable_hca_in, in, embedded_cpu_function, 0); + return mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); +} + +static int mlx5_cmd_host_pf_disable_hca(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(disable_hca_out)] = {}; + u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {}; + + MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA); + MLX5_SET(disable_hca_in, in, function_id, 0); + MLX5_SET(disable_hca_in, in, embedded_cpu_function, 0); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_host_pf_init(struct mlx5_core_dev *dev) +{ + int err; + + err = mlx5_cmd_host_pf_enable_hca(dev); if (err) - mlx5_core_err(dev, "Failed to enable peer PF HCA err(%d)\n", - err); + mlx5_core_err(dev, "Failed to enable external host PF HCA err(%d)\n", err); return err; } -static void mlx5_peer_pf_cleanup(struct mlx5_core_dev *dev) +static void mlx5_host_pf_cleanup(struct mlx5_core_dev *dev) { - u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {}; int err; - MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA); - err = mlx5_cmd_exec_in(dev, disable_hca, in); + err = mlx5_cmd_host_pf_disable_hca(dev); if (err) { - mlx5_core_err(dev, "Failed to disable peer PF HCA err(%d)\n", - err); + mlx5_core_err(dev, "Failed to disable external host PF HCA err(%d)\n", err); return; } - err = mlx5_wait_for_pages(dev, &dev->priv.peer_pf_pages); + err = mlx5_wait_for_pages(dev, &dev->priv.host_pf_pages); if (err) - 
mlx5_core_warn(dev, "Timeout reclaiming peer PF pages err(%d)\n", - err); + mlx5_core_warn(dev, "Timeout reclaiming external host PF pages err(%d)\n", err); } int mlx5_ec_init(struct mlx5_core_dev *dev) @@ -46,10 +61,10 @@ int mlx5_ec_init(struct mlx5_core_dev *dev) if (!mlx5_core_is_ecpf(dev)) return 0; - /* ECPF shall enable HCA for peer PF in the same way a PF + /* ECPF shall enable HCA for host PF in the same way a PF * does this for its VFs. */ - return mlx5_peer_pf_init(dev); + return mlx5_host_pf_init(dev); } void mlx5_ec_cleanup(struct mlx5_core_dev *dev) @@ -57,5 +72,5 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev) if (!mlx5_core_is_ecpf(dev)) return; - mlx5_peer_pf_cleanup(dev); + mlx5_host_pf_cleanup(dev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 150638814517..539baea358bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -374,7 +374,7 @@ retry: if (func_id) dev->priv.vfs_pages += npages; else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.peer_pf_pages += npages; + dev->priv.host_pf_pages += npages; mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x, err %d\n", npages, ec_function, func_id, err); @@ -416,7 +416,7 @@ static void release_all_pages(struct mlx5_core_dev *dev, u32 func_id, if (func_id) dev->priv.vfs_pages -= npages; else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.peer_pf_pages -= npages; + dev->priv.host_pf_pages -= npages; mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x\n", npages, ec_function, func_id); @@ -506,7 +506,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, if (func_id) dev->priv.vfs_pages -= num_claimed; else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.peer_pf_pages -= num_claimed; + dev->priv.host_pf_pages -= num_claimed; out_free: kvfree(out); @@ -661,9 +661,9 @@ int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev) WARN(dev->priv.vfs_pages, "VFs FW pages counter is %d after reclaiming all pages\n", dev->priv.vfs_pages); - WARN(dev->priv.peer_pf_pages, - "Peer PF FW pages counter is %d after reclaiming all pages\n", - dev->priv.peer_pf_pages); + WARN(dev->priv.host_pf_pages, + "External host PF FW pages counter is %d after reclaiming all pages\n", + dev->priv.host_pf_pages); return 0; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d6ef3068d7d3..8e9bcb3bfd77 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -547,7 +547,7 @@ struct mlx5_priv { atomic_t reg_pages; struct list_head free_list; int vfs_pages; - int peer_pf_pages; + int host_pf_pages; struct mlx5_core_health health; -- cgit From 617b860c1875842d9cc3338d7dabd2b3538038f1 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 20 Nov 2020 15:03:39 -0800 Subject: net/mlx5: Treat host PF vport as other (non eswitch manager) vport When eswitch manager is running on ECPF, host PF should be treated as non eswitch manager port, similar to other VF vports. Fail to do so, results in firmware treating PF's vport as ECPF vport for eswitch ACL tables. Non zero check to figure out if a given vport is other vport or not is not sufficient becase PF vport number = 0 on ECPF. Hence, create esw acl tables with an attribute of other vport. 
Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/esw/acl/helper.c | 5 +- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 54 ++++++++++------------ drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 14 ++---- include/linux/mlx5/fs.h | 5 +- 4 files changed, 34 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c index 22f4c1c28006..4a369669e51e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c @@ -8,6 +8,7 @@ struct mlx5_flow_table * esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size) { + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *dev = esw->dev; struct mlx5_flow_namespace *root_ns; struct mlx5_flow_table *acl; @@ -33,7 +34,9 @@ esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size) return ERR_PTR(-EOPNOTSUPP); } - acl = mlx5_create_vport_flow_table(root_ns, 0, size, 0, vport_num); + ft_attr.max_fte = size; + ft_attr.flags = MLX5_FLOW_TABLE_OTHER_VPORT; + acl = mlx5_create_vport_flow_table(root_ns, &ft_attr, vport_num); if (IS_ERR(acl)) { err = PTR_ERR(acl); esw_warn(dev, "vport[%d] create %s ACL table, err(%d)\n", vport_num, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index c2fed9c3d75c..8e06731d3cb3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -172,10 +172,9 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns, MLX5_SET(set_flow_table_root_in, in, table_id, ft->id); MLX5_SET(set_flow_table_root_in, in, underlay_qpn, underlay_qpn); - if (ft->vport) { - MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport); - MLX5_SET(set_flow_table_root_in, in, other_vport, 1); - } + MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport); + MLX5_SET(set_flow_table_root_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); return mlx5_cmd_exec_in(dev, set_flow_table_root, in); } @@ -199,10 +198,9 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, MLX5_SET(create_flow_table_in, in, table_type, ft->type); MLX5_SET(create_flow_table_in, in, flow_table_context.level, ft->level); MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, log_size); - if (ft->vport) { - MLX5_SET(create_flow_table_in, in, vport_number, ft->vport); - MLX5_SET(create_flow_table_in, in, other_vport, 1); - } + MLX5_SET(create_flow_table_in, in, vport_number, ft->vport); + MLX5_SET(create_flow_table_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en, en_decap); @@ -252,10 +250,9 @@ static int mlx5_cmd_destroy_flow_table(struct mlx5_flow_root_namespace *ns, MLX5_CMD_OP_DESTROY_FLOW_TABLE); MLX5_SET(destroy_flow_table_in, in, table_type, ft->type); MLX5_SET(destroy_flow_table_in, in, table_id, ft->id); - if (ft->vport) { - MLX5_SET(destroy_flow_table_in, in, vport_number, ft->vport); - MLX5_SET(destroy_flow_table_in, in, other_vport, 1); - } + MLX5_SET(destroy_flow_table_in, in, vport_number, ft->vport); + MLX5_SET(destroy_flow_table_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); return mlx5_cmd_exec_in(dev, destroy_flow_table, in); } @@ -283,11 +280,9 @@ static int 
mlx5_cmd_modify_flow_table(struct mlx5_flow_root_namespace *ns, flow_table_context.lag_master_next_table_id, 0); } } else { - if (ft->vport) { - MLX5_SET(modify_flow_table_in, in, vport_number, - ft->vport); - MLX5_SET(modify_flow_table_in, in, other_vport, 1); - } + MLX5_SET(modify_flow_table_in, in, vport_number, ft->vport); + MLX5_SET(modify_flow_table_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); MLX5_SET(modify_flow_table_in, in, modify_field_select, MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID); if (next_ft) { @@ -325,6 +320,9 @@ static int mlx5_cmd_create_flow_group(struct mlx5_flow_root_namespace *ns, MLX5_SET(create_flow_group_in, in, other_vport, 1); } + MLX5_SET(create_flow_group_in, in, vport_number, ft->vport); + MLX5_SET(create_flow_group_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); err = mlx5_cmd_exec_inout(dev, create_flow_group, in, out); if (!err) fg->id = MLX5_GET(create_flow_group_out, out, @@ -344,11 +342,9 @@ static int mlx5_cmd_destroy_flow_group(struct mlx5_flow_root_namespace *ns, MLX5_SET(destroy_flow_group_in, in, table_type, ft->type); MLX5_SET(destroy_flow_group_in, in, table_id, ft->id); MLX5_SET(destroy_flow_group_in, in, group_id, fg->id); - if (ft->vport) { - MLX5_SET(destroy_flow_group_in, in, vport_number, ft->vport); - MLX5_SET(destroy_flow_group_in, in, other_vport, 1); - } - + MLX5_SET(destroy_flow_group_in, in, vport_number, ft->vport); + MLX5_SET(destroy_flow_group_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); return mlx5_cmd_exec_in(dev, destroy_flow_group, in); } @@ -427,10 +423,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, MLX5_SET(set_fte_in, in, ignore_flow_level, !!(fte->action.flags & FLOW_ACT_IGNORE_FLOW_LEVEL)); - if (ft->vport) { - MLX5_SET(set_fte_in, in, vport_number, ft->vport); - MLX5_SET(set_fte_in, in, other_vport, 1); - } + MLX5_SET(set_fte_in, in, vport_number, ft->vport); + MLX5_SET(set_fte_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context); MLX5_SET(flow_context, in_flow_context, group_id, group_id); @@ -604,10 +599,9 @@ static int mlx5_cmd_delete_fte(struct mlx5_flow_root_namespace *ns, MLX5_SET(delete_fte_in, in, table_type, ft->type); MLX5_SET(delete_fte_in, in, table_id, ft->id); MLX5_SET(delete_fte_in, in, flow_index, fte->index); - if (ft->vport) { - MLX5_SET(delete_fte_in, in, vport_number, ft->vport); - MLX5_SET(delete_fte_in, in, other_vport, 1); - } + MLX5_SET(delete_fte_in, in, vport_number, ft->vport); + MLX5_SET(delete_fte_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); return mlx5_cmd_exec_in(dev, delete_fte, in); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 951f8041fb62..2df8b298d381 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1147,17 +1147,11 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns, } EXPORT_SYMBOL(mlx5_create_flow_table); -struct mlx5_flow_table *mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, - int prio, int max_fte, - u32 level, u16 vport) +struct mlx5_flow_table * +mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr, u16 vport) { - struct mlx5_flow_table_attr ft_attr = {}; - - ft_attr.max_fte = max_fte; - ft_attr.level = level; - ft_attr.prio = prio; - - return 
__mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_NORMAL, vport); + return __mlx5_create_flow_table(ns, ft_attr, FS_FT_OP_MOD_NORMAL, vport); } struct mlx5_flow_table* diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 35d2cc1646d3..1f51f4c3b1af 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -50,6 +50,7 @@ enum { MLX5_FLOW_TABLE_TUNNEL_EN_DECAP = BIT(1), MLX5_FLOW_TABLE_TERMINATION = BIT(2), MLX5_FLOW_TABLE_UNMANAGED = BIT(3), + MLX5_FLOW_TABLE_OTHER_VPORT = BIT(4), }; #define LEFTOVERS_RULE_NUM 2 @@ -174,9 +175,7 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table * mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, - int prio, - int num_flow_table_entries, - u32 level, u16 vport); + struct mlx5_flow_table_attr *ft_attr, u16 vport); struct mlx5_flow_table *mlx5_create_lag_demux_flow_table( struct mlx5_flow_namespace *ns, int prio, u32 level); -- cgit From 8d8d53cf8fd028310b1189165b939cde124895d7 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 29 Oct 2020 12:52:40 +1100 Subject: dma-mapping: Allow mixing bypass and mapped DMA operation At the moment we allow bypassing DMA ops only when we can do this for the entire RAM. However there are configs with mixed type memory where we could still allow bypassing IOMMU in most cases; POWERPC with persistent memory is one example. This adds an arch hook to determine where bypass can still work and we invoke direct DMA API. The following patch checks the bus limit on POWERPC to allow or disallow direct mapping. This adds a ARCH_HAS_DMA_MAP_DIRECT config option to make the arch_xxxx hooks no-op by default. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Christoph Hellwig --- include/linux/dma-map-ops.h | 14 ++++++++++++++ kernel/dma/Kconfig | 4 ++++ kernel/dma/mapping.c | 12 ++++++++---- 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index a5f89fc4d6df..38c8a4558e08 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -314,6 +314,20 @@ static inline void arch_dma_mark_clean(phys_addr_t paddr, size_t size) void *arch_dma_set_uncached(void *addr, size_t size); void arch_dma_clear_uncached(void *addr, size_t size); +#ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT +bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr); +bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle); +bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg, + int nents); +bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, + int nents); +#else +#define arch_dma_map_page_direct(d, a) (false) +#define arch_dma_unmap_page_direct(d, a) (false) +#define arch_dma_map_sg_direct(d, s, n) (false) +#define arch_dma_unmap_sg_direct(d, s, n) (false) +#endif + #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c99de4a21458..43d106598e82 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -20,6 +20,10 @@ config DMA_OPS config DMA_OPS_BYPASS bool +# Lets platform IOMMU driver choose between bypass and IOMMU +config ARCH_HAS_DMA_MAP_DIRECT + bool + config NEED_SG_DMA_LENGTH bool diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 51bb8fa8eb89..f87a89d08654 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ 
-149,7 +149,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, if (WARN_ON_ONCE(!dev->dma_mask)) return DMA_MAPPING_ERROR; - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); @@ -165,7 +166,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_unmap_page_direct(dev, addr + size)) dma_direct_unmap_page(dev, addr, size, dir, attrs); else if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); @@ -188,7 +190,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, if (WARN_ON_ONCE(!dev->dma_mask)) return 0; - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_map_sg_direct(dev, sg, nents)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); @@ -207,7 +210,8 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); debug_dma_unmap_sg(dev, sg, nents, dir); - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_unmap_sg_direct(dev, sg, nents)) dma_direct_unmap_sg(dev, sg, nents, dir, attrs); else if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); -- cgit From c7a5899eb26e2a4d516d53f65b6dd67be2228041 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Tue, 17 Nov 2020 17:47:23 +0100 Subject: xfrm: redact SA secret with lockdown confidentiality Redact the XFRM SA secret in the netlink response to xfrm_get_sa() or an SA dumpall. Lockdown confidentiality mode can be enabled at boot or at run time, e.g. when enabled: cat /sys/kernel/security/lockdown none integrity [confidentiality] ip xfrm state src 172.16.1.200 dst 172.16.1.100 proto esp spi 0x00000002 reqid 2 mode tunnel replay-window 0 aead rfc4106(gcm(aes)) 0x0000000000000000000000000000000000000000 96 Note: the aead secret is redacted. Redacting secrets is also a FIPS 140-2 requirement.
v1->v2 - add size checks before memset calls v2->v3 - replace spaces with tabs for consistency v3->v4 - use kernel lockdown instead of a /proc setting v4->v5 - remove kconfig option Reviewed-by: Stephan Mueller Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- include/linux/security.h | 1 + net/xfrm/xfrm_user.c | 74 +++++++++++++++++++++++++++++++++++++++++++----- security/security.c | 1 + 3 files changed, 69 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index bc2725491560..1112a79a7dba 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -127,6 +127,7 @@ enum lockdown_reason { LOCKDOWN_PERF, LOCKDOWN_TRACEFS, LOCKDOWN_XMON_RW, + LOCKDOWN_XFRM_SECRET, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d0c32a8fcc4a..0727ac853b55 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -848,21 +848,84 @@ static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb return 0; } +static bool xfrm_redact(void) +{ + return IS_ENABLED(CONFIG_SECURITY) && + security_locked_down(LOCKDOWN_XFRM_SECRET); +} + static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) { struct xfrm_algo *algo; + struct xfrm_algo_auth *ap; struct nlattr *nla; + bool redact_secret = xfrm_redact(); nla = nla_reserve(skb, XFRMA_ALG_AUTH, sizeof(*algo) + (auth->alg_key_len + 7) / 8); if (!nla) return -EMSGSIZE; - algo = nla_data(nla); strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); - memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8); + + if (redact_secret && auth->alg_key_len) + memset(algo->alg_key, 0, (auth->alg_key_len + 7) / 8); + else + memcpy(algo->alg_key, auth->alg_key, + (auth->alg_key_len + 7) / 8); algo->alg_key_len = auth->alg_key_len; + nla = nla_reserve(skb, XFRMA_ALG_AUTH_TRUNC, xfrm_alg_auth_len(auth)); + if (!nla) + return -EMSGSIZE; + ap = nla_data(nla); + memcpy(ap, auth, sizeof(struct xfrm_algo_auth)); + if (redact_secret && auth->alg_key_len) + memset(ap->alg_key, 0, (auth->alg_key_len + 7) / 8); + else + memcpy(ap->alg_key, auth->alg_key, + (auth->alg_key_len + 7) / 8); + return 0; +} + +static int copy_to_user_aead(struct xfrm_algo_aead *aead, struct sk_buff *skb) +{ + struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_AEAD, aead_len(aead)); + struct xfrm_algo_aead *ap; + bool redact_secret = xfrm_redact(); + + if (!nla) + return -EMSGSIZE; + + ap = nla_data(nla); + memcpy(ap, aead, sizeof(*aead)); + + if (redact_secret && aead->alg_key_len) + memset(ap->alg_key, 0, (aead->alg_key_len + 7) / 8); + else + memcpy(ap->alg_key, aead->alg_key, + (aead->alg_key_len + 7) / 8); + return 0; +} + +static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb) +{ + struct xfrm_algo *ap; + bool redact_secret = xfrm_redact(); + struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_CRYPT, + xfrm_alg_len(ealg)); + if (!nla) + return -EMSGSIZE; + + ap = nla_data(nla); + memcpy(ap, ealg, sizeof(*ealg)); + + if (redact_secret && ealg->alg_key_len) + memset(ap->alg_key, 0, (ealg->alg_key_len + 7) / 8); + else + memcpy(ap->alg_key, ealg->alg_key, + (ealg->alg_key_len + 7) / 8); + return 0; } @@ -906,20 +969,17 @@ static int copy_to_user_state_extra(struct xfrm_state *x, goto out; } if (x->aead) { - ret = nla_put(skb, XFRMA_ALG_AEAD, aead_len(x->aead), x->aead); + ret = copy_to_user_aead(x->aead, skb); if (ret) goto out; } if (x->aalg) { ret = copy_to_user_auth(x->aalg, skb); - if (!ret) 
- ret = nla_put(skb, XFRMA_ALG_AUTH_TRUNC, - xfrm_alg_auth_len(x->aalg), x->aalg); if (ret) goto out; } if (x->ealg) { - ret = nla_put(skb, XFRMA_ALG_CRYPT, xfrm_alg_len(x->ealg), x->ealg); + ret = copy_to_user_ealg(x->ealg, skb); if (ret) goto out; } diff --git a/security/security.c b/security/security.c index a28045dc9e7f..abff77c1c8a7 100644 --- a/security/security.c +++ b/security/security.c @@ -65,6 +65,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access", + [LOCKDOWN_XFRM_SECRET] = "xfrm SA secret", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; -- cgit From 928296ea5da37838d7127de4b10f47cd97401b13 Mon Sep 17 00:00:00 2001 From: Matthias Brugger Date: Fri, 30 Oct 2020 12:36:11 +0100 Subject: soc: mediatek: pm_domains: Make bus protection generic Bus protection is not exclusively done by calling the infracfg misc driver. Make the calls for setting and clearing the bus protection generic so that we can use other blocks for it as well. Signed-off-by: Matthias Brugger Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201030113622.201188-6-enric.balletbo@collabora.com Signed-off-by: Matthias Brugger --- drivers/soc/mediatek/Kconfig | 1 - drivers/soc/mediatek/mtk-infracfg.c | 5 --- drivers/soc/mediatek/mtk-pm-domains.c | 57 +++++++++++++++++++++++++++-------- include/linux/soc/mediatek/infracfg.h | 5 +++ 4 files changed, 49 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/mediatek/Kconfig b/drivers/soc/mediatek/Kconfig index 68d800f9e4a5..67cef12dc585 100644 --- a/drivers/soc/mediatek/Kconfig +++ b/drivers/soc/mediatek/Kconfig @@ -48,7 +48,6 @@ config MTK_SCPSYS_PM_DOMAINS bool "MediaTek SCPSYS generic power domain" default ARCH_MEDIATEK depends on PM - depends on MTK_INFRACFG select PM_GENERIC_DOMAINS select REGMAP help diff --git a/drivers/soc/mediatek/mtk-infracfg.c b/drivers/soc/mediatek/mtk-infracfg.c index 4a123796aad3..0590b68e0d78 100644 --- a/drivers/soc/mediatek/mtk-infracfg.c +++ b/drivers/soc/mediatek/mtk-infracfg.c @@ -12,11 +12,6 @@ #define MTK_POLL_DELAY_US 10 #define MTK_POLL_TIMEOUT (jiffies_to_usecs(HZ)) -#define INFRA_TOPAXI_PROTECTEN 0x0220 -#define INFRA_TOPAXI_PROTECTSTA1 0x0228 -#define INFRA_TOPAXI_PROTECTEN_SET 0x0260 -#define INFRA_TOPAXI_PROTECTEN_CLR 0x0264 - /** * mtk_infracfg_set_bus_protection - enable bus protection * @infracfg: The infracfg regmap diff --git a/drivers/soc/mediatek/mtk-pm-domains.c b/drivers/soc/mediatek/mtk-pm-domains.c index 06a16e45356a..6122701d018f 100644 --- a/drivers/soc/mediatek/mtk-pm-domains.c +++ b/drivers/soc/mediatek/mtk-pm-domains.c @@ -86,18 +86,24 @@ static int scpsys_sram_disable(struct scpsys_domain *pd) MTK_POLL_TIMEOUT); } -static int scpsys_bus_protect_enable(struct scpsys_domain *pd) +static int _scpsys_bus_protect_enable(const struct scpsys_bus_prot_data *bpd, struct regmap *regmap) { - const struct scpsys_bus_prot_data *bpd = pd->data->bp_infracfg; int i, ret; for (i = 0; i < SPM_MAX_BUS_PROT_DATA; i++) { - if (!bpd[i].bus_prot_mask) + u32 val, mask = bpd[i].bus_prot_mask; + + if (!mask) break; - ret = mtk_infracfg_set_bus_protection(pd->infracfg, - bpd[i].bus_prot_mask, - bpd[i].bus_prot_reg_update); + if (bpd[i].bus_prot_reg_update) + regmap_set_bits(regmap, bpd[i].bus_prot_set, mask); + else + regmap_write(regmap, INFRA_TOPAXI_PROTECTEN_SET, mask); + + ret = regmap_read_poll_timeout(regmap, 
INFRA_TOPAXI_PROTECTSTA1, + val, (val & mask) == mask, + MTK_POLL_DELAY_US, MTK_POLL_TIMEOUT); if (ret) return ret; } @@ -105,18 +111,34 @@ static int scpsys_bus_protect_enable(struct scpsys_domain *pd) return 0; } -static int scpsys_bus_protect_disable(struct scpsys_domain *pd) +static int scpsys_bus_protect_enable(struct scpsys_domain *pd) +{ + int ret; + + ret = _scpsys_bus_protect_enable(pd->data->bp_infracfg, pd->infracfg); + + return ret; +} + +static int _scpsys_bus_protect_disable(const struct scpsys_bus_prot_data *bpd, + struct regmap *regmap) { - const struct scpsys_bus_prot_data *bpd = pd->data->bp_infracfg; int i, ret; - for (i = SPM_MAX_BUS_PROT_DATA; i > 0; i--) { - if (!bpd[i].bus_prot_mask) + for (i = SPM_MAX_BUS_PROT_DATA - 1; i >= 0; i--) { + u32 val, mask = bpd[i].bus_prot_mask; + + if (!mask) continue; - ret = mtk_infracfg_clear_bus_protection(pd->infracfg, - bpd[i].bus_prot_mask, - bpd[i].bus_prot_reg_update); + if (bpd[i].bus_prot_reg_update) + regmap_clear_bits(regmap, bpd[i].bus_prot_clr, mask); + else + regmap_write(regmap, INFRA_TOPAXI_PROTECTEN_CLR, mask); + + ret = regmap_read_poll_timeout(regmap, INFRA_TOPAXI_PROTECTSTA1, + val, !(val & mask), + MTK_POLL_DELAY_US, MTK_POLL_TIMEOUT); if (ret) return ret; } @@ -124,6 +146,15 @@ static int scpsys_bus_protect_disable(struct scpsys_domain *pd) return 0; } +static int scpsys_bus_protect_disable(struct scpsys_domain *pd) +{ + int ret; + + ret = _scpsys_bus_protect_disable(pd->data->bp_infracfg, pd->infracfg); + + return ret; +} + static int scpsys_power_on(struct generic_pm_domain *genpd) { struct scpsys_domain *pd = container_of(genpd, struct scpsys_domain, genpd); diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h index 233463d789c6..5bcaab767f6a 100644 --- a/include/linux/soc/mediatek/infracfg.h +++ b/include/linux/soc/mediatek/infracfg.h @@ -32,6 +32,11 @@ #define MT7622_TOP_AXI_PROT_EN_WB (BIT(2) | BIT(6) | \ BIT(7) | BIT(8)) +#define INFRA_TOPAXI_PROTECTEN 0x0220 +#define INFRA_TOPAXI_PROTECTSTA1 0x0228 +#define INFRA_TOPAXI_PROTECTEN_SET 0x0260 +#define INFRA_TOPAXI_PROTECTEN_CLR 0x0264 + #define REG_INFRA_MISC 0xf00 #define F_DDR_4GB_SUPPORT_EN BIT(13) -- cgit From eb9fa767fbe19d3db7d303e9fde7f3056221ffe1 Mon Sep 17 00:00:00 2001 From: Matthias Brugger Date: Fri, 30 Oct 2020 12:36:17 +0100 Subject: soc: mediatek: pm-domains: Add support for mt8183 Add the needed board data to support mt8183 SoC. 
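The per-domain tables below describe each protection step with BUS_PROT_WR()/BUS_PROT_WR_IGN() entries; the macro definitions live in mtk-pm-domains.h and are not part of this excerpt. A hedged sketch of the shape such an entry plausibly has, with field names taken from the bpd[i].bus_prot_* accesses in the generic helpers above (the actual struct and macros may differ):

#include <linux/types.h>

/* Sketch only: one bus-protection step as consumed by
 * _scpsys_bus_protect_enable()/_disable() above. */
struct scpsys_bus_prot_data_sketch {
	u32 bus_prot_mask;	/* bits to assert or clear */
	u32 bus_prot_set;	/* SET register offset */
	u32 bus_prot_clr;	/* CLR register offset */
	u32 bus_prot_sta;	/* status register polled for the ack */
	bool bus_prot_reg_update;	/* set/clear bits instead of SET/CLR writes */
};

#define BUS_PROT_WR_SKETCH(_mask, _set, _clr, _sta)	\
	{						\
		.bus_prot_mask = (_mask),		\
		.bus_prot_set = (_set),			\
		.bus_prot_clr = (_clr),			\
		.bus_prot_sta = (_sta),			\
	}

Enabling protection then amounts to writing bus_prot_mask to the SET register and polling the status register until all masked bits read back as set; the _IGN variant presumably relaxes part of that handshake.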
Signed-off-by: Matthias Brugger Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201030113622.201188-12-enric.balletbo@collabora.com Signed-off-by: Matthias Brugger --- drivers/soc/mediatek/mt8183-pm-domains.h | 221 +++++++++++++++++++++++++++++++ drivers/soc/mediatek/mtk-pm-domains.c | 5 + drivers/soc/mediatek/mtk-pm-domains.h | 1 + include/linux/soc/mediatek/infracfg.h | 46 +++++++ 4 files changed, 273 insertions(+) create mode 100644 drivers/soc/mediatek/mt8183-pm-domains.h (limited to 'include/linux') diff --git a/drivers/soc/mediatek/mt8183-pm-domains.h b/drivers/soc/mediatek/mt8183-pm-domains.h new file mode 100644 index 000000000000..8d996c5d2682 --- /dev/null +++ b/drivers/soc/mediatek/mt8183-pm-domains.h @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __SOC_MEDIATEK_MT8183_PM_DOMAINS_H +#define __SOC_MEDIATEK_MT8183_PM_DOMAINS_H + +#include "mtk-pm-domains.h" +#include + +/* + * MT8183 power domain support + */ + +static const struct scpsys_domain_data scpsys_domain_data_mt8183[] = { + [MT8183_POWER_DOMAIN_AUDIO] = { + .sta_mask = PWR_STATUS_AUDIO, + .ctl_offs = 0x0314, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(15, 12), + }, + [MT8183_POWER_DOMAIN_CONN] = { + .sta_mask = PWR_STATUS_CONN, + .ctl_offs = 0x032c, + .sram_pdn_bits = 0, + .sram_pdn_ack_bits = 0, + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_CONN, MT8183_TOP_AXI_PROT_EN_SET, + MT8183_TOP_AXI_PROT_EN_CLR, MT8183_TOP_AXI_PROT_EN_STA1), + }, + }, + [MT8183_POWER_DOMAIN_MFG_ASYNC] = { + .sta_mask = PWR_STATUS_MFG_ASYNC, + .ctl_offs = 0x0334, + .sram_pdn_bits = 0, + .sram_pdn_ack_bits = 0, + }, + [MT8183_POWER_DOMAIN_MFG] = { + .sta_mask = PWR_STATUS_MFG, + .ctl_offs = 0x0338, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8183_POWER_DOMAIN_MFG_CORE0] = { + .sta_mask = BIT(7), + .ctl_offs = 0x034c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8183_POWER_DOMAIN_MFG_CORE1] = { + .sta_mask = BIT(20), + .ctl_offs = 0x0310, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8183_POWER_DOMAIN_MFG_2D] = { + .sta_mask = PWR_STATUS_MFG_2D, + .ctl_offs = 0x0348, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_1_MFG, MT8183_TOP_AXI_PROT_EN_1_SET, + MT8183_TOP_AXI_PROT_EN_1_CLR, MT8183_TOP_AXI_PROT_EN_STA1_1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MFG, MT8183_TOP_AXI_PROT_EN_SET, + MT8183_TOP_AXI_PROT_EN_CLR, MT8183_TOP_AXI_PROT_EN_STA1), + }, + }, + [MT8183_POWER_DOMAIN_DISP] = { + .sta_mask = PWR_STATUS_DISP, + .ctl_offs = 0x030c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_1_DISP, MT8183_TOP_AXI_PROT_EN_1_SET, + MT8183_TOP_AXI_PROT_EN_1_CLR, MT8183_TOP_AXI_PROT_EN_STA1_1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_DISP, MT8183_TOP_AXI_PROT_EN_SET, + MT8183_TOP_AXI_PROT_EN_CLR, MT8183_TOP_AXI_PROT_EN_STA1), + }, + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_DISP, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_CAM] = { + .sta_mask = BIT(25), + .ctl_offs = 0x0344, + .sram_pdn_bits = GENMASK(9, 8), + .sram_pdn_ack_bits = GENMASK(13, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MM_CAM, MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, MT8183_TOP_AXI_PROT_EN_MM_STA1), + 
BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_CAM, MT8183_TOP_AXI_PROT_EN_SET, + MT8183_TOP_AXI_PROT_EN_CLR, MT8183_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR_IGN(MT8183_TOP_AXI_PROT_EN_MM_CAM_2ND, + MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, + MT8183_TOP_AXI_PROT_EN_MM_STA1), + }, + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_CAM, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_ISP] = { + .sta_mask = PWR_STATUS_ISP, + .ctl_offs = 0x0308, + .sram_pdn_bits = GENMASK(9, 8), + .sram_pdn_ack_bits = GENMASK(13, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MM_ISP, + MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, + MT8183_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR_IGN(MT8183_TOP_AXI_PROT_EN_MM_ISP_2ND, + MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, + MT8183_TOP_AXI_PROT_EN_MM_STA1), + }, + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_ISP, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_VDEC] = { + .sta_mask = BIT(31), + .ctl_offs = 0x0300, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_VDEC, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_VENC] = { + .sta_mask = PWR_STATUS_VENC, + .ctl_offs = 0x0304, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(15, 12), + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_VENC, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_VPU_TOP] = { + .sta_mask = BIT(26), + .ctl_offs = 0x0324, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MM_VPU_TOP, + MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, + MT8183_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_VPU_TOP, + MT8183_TOP_AXI_PROT_EN_SET, + MT8183_TOP_AXI_PROT_EN_CLR, + MT8183_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MM_VPU_TOP_2ND, + MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, + MT8183_TOP_AXI_PROT_EN_MM_STA1), + }, + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_VPU_TOP, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_VPU_CORE0] = { + .sta_mask = BIT(27), + .ctl_offs = 0x33c, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(13, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE0, + MT8183_TOP_AXI_PROT_EN_MCU_SET, + MT8183_TOP_AXI_PROT_EN_MCU_CLR, + MT8183_TOP_AXI_PROT_EN_MCU_STA1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE0_2ND, + MT8183_TOP_AXI_PROT_EN_MCU_SET, + MT8183_TOP_AXI_PROT_EN_MCU_CLR, + MT8183_TOP_AXI_PROT_EN_MCU_STA1), + }, + .caps = MTK_SCPD_SRAM_ISO, + }, + [MT8183_POWER_DOMAIN_VPU_CORE1] = { + .sta_mask = BIT(28), + .ctl_offs = 0x0340, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(13, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE1, + MT8183_TOP_AXI_PROT_EN_MCU_SET, + MT8183_TOP_AXI_PROT_EN_MCU_CLR, + MT8183_TOP_AXI_PROT_EN_MCU_STA1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE1_2ND, + MT8183_TOP_AXI_PROT_EN_MCU_SET, + MT8183_TOP_AXI_PROT_EN_MCU_CLR, + 
MT8183_TOP_AXI_PROT_EN_MCU_STA1), + }, + .caps = MTK_SCPD_SRAM_ISO, + }, +}; + +static const struct scpsys_soc_data mt8183_scpsys_data = { + .domains_data = scpsys_domain_data_mt8183, + .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8183), + .pwr_sta_offs = 0x0180, + .pwr_sta2nd_offs = 0x0184 +}; + +#endif /* __SOC_MEDIATEK_MT8183_PM_DOMAINS_H */ diff --git a/drivers/soc/mediatek/mtk-pm-domains.c b/drivers/soc/mediatek/mtk-pm-domains.c index 03279a999dfc..8703d50cd2b7 100644 --- a/drivers/soc/mediatek/mtk-pm-domains.c +++ b/drivers/soc/mediatek/mtk-pm-domains.c @@ -16,6 +16,7 @@ #include #include "mt8173-pm-domains.h" +#include "mt8183-pm-domains.h" #define MTK_POLL_DELAY_US 10 #define MTK_POLL_TIMEOUT USEC_PER_SEC @@ -505,6 +506,10 @@ static const struct of_device_id scpsys_of_match[] = { .compatible = "mediatek,mt8173-power-controller", .data = &mt8173_scpsys_data, }, + { + .compatible = "mediatek,mt8183-power-controller", + .data = &mt8183_scpsys_data, + }, { } }; diff --git a/drivers/soc/mediatek/mtk-pm-domains.h b/drivers/soc/mediatek/mtk-pm-domains.h index 809d2d43f01d..2c745f11b422 100644 --- a/drivers/soc/mediatek/mtk-pm-domains.h +++ b/drivers/soc/mediatek/mtk-pm-domains.h @@ -22,6 +22,7 @@ #define SPM_PWR_STATUS 0x060c #define SPM_PWR_STATUS_2ND 0x0610 +#define PWR_STATUS_CONN BIT(1) #define PWR_STATUS_DISP BIT(3) #define PWR_STATUS_MFG BIT(4) #define PWR_STATUS_ISP BIT(5) diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h index 5bcaab767f6a..9d01e32e19bc 100644 --- a/include/linux/soc/mediatek/infracfg.h +++ b/include/linux/soc/mediatek/infracfg.h @@ -2,6 +2,52 @@ #ifndef __SOC_MEDIATEK_INFRACFG_H #define __SOC_MEDIATEK_INFRACFG_H +#define MT8183_TOP_AXI_PROT_EN_STA1 0x228 +#define MT8183_TOP_AXI_PROT_EN_STA1_1 0x258 +#define MT8183_TOP_AXI_PROT_EN_SET 0x2a0 +#define MT8183_TOP_AXI_PROT_EN_CLR 0x2a4 +#define MT8183_TOP_AXI_PROT_EN_1_SET 0x2a8 +#define MT8183_TOP_AXI_PROT_EN_1_CLR 0x2ac +#define MT8183_TOP_AXI_PROT_EN_MCU_SET 0x2c4 +#define MT8183_TOP_AXI_PROT_EN_MCU_CLR 0x2c8 +#define MT8183_TOP_AXI_PROT_EN_MCU_STA1 0x2e4 +#define MT8183_TOP_AXI_PROT_EN_MM_SET 0x2d4 +#define MT8183_TOP_AXI_PROT_EN_MM_CLR 0x2d8 +#define MT8183_TOP_AXI_PROT_EN_MM_STA1 0x2ec + +#define MT8183_TOP_AXI_PROT_EN_DISP (BIT(10) | BIT(11)) +#define MT8183_TOP_AXI_PROT_EN_CONN (BIT(13) | BIT(14)) +#define MT8183_TOP_AXI_PROT_EN_MFG (BIT(21) | BIT(22)) +#define MT8183_TOP_AXI_PROT_EN_CAM BIT(28) +#define MT8183_TOP_AXI_PROT_EN_VPU_TOP BIT(27) +#define MT8183_TOP_AXI_PROT_EN_1_DISP (BIT(16) | BIT(17)) +#define MT8183_TOP_AXI_PROT_EN_1_MFG GENMASK(21, 19) +#define MT8183_TOP_AXI_PROT_EN_MM_ISP (BIT(3) | BIT(8)) +#define MT8183_TOP_AXI_PROT_EN_MM_ISP_2ND BIT(10) +#define MT8183_TOP_AXI_PROT_EN_MM_CAM (BIT(4) | BIT(5) | \ + BIT(9) | BIT(13)) +#define MT8183_TOP_AXI_PROT_EN_MM_VPU_TOP (GENMASK(9, 6) | \ + BIT(12)) +#define MT8183_TOP_AXI_PROT_EN_MM_VPU_TOP_2ND (BIT(10) | BIT(11)) +#define MT8183_TOP_AXI_PROT_EN_MM_CAM_2ND BIT(11) +#define MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE0_2ND (BIT(0) | BIT(2) | \ + BIT(4)) +#define MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE1_2ND (BIT(1) | BIT(3) | \ + BIT(5)) +#define MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE0 BIT(6) +#define MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE1 BIT(7) + +#define MT8183_SMI_COMMON_CLAMP_EN 0x3c0 +#define MT8183_SMI_COMMON_CLAMP_EN_SET 0x3c4 +#define MT8183_SMI_COMMON_CLAMP_EN_CLR 0x3c8 + +#define MT8183_SMI_COMMON_SMI_CLAMP_DISP GENMASK(7, 0) +#define MT8183_SMI_COMMON_SMI_CLAMP_VENC BIT(1) +#define MT8183_SMI_COMMON_SMI_CLAMP_ISP 
BIT(2) +#define MT8183_SMI_COMMON_SMI_CLAMP_CAM (BIT(3) | BIT(4)) +#define MT8183_SMI_COMMON_SMI_CLAMP_VPU_TOP (BIT(5) | BIT(6)) +#define MT8183_SMI_COMMON_SMI_CLAMP_VDEC BIT(7) + #define MT8173_TOP_AXI_PROT_EN_MCI_M2 BIT(0) #define MT8173_TOP_AXI_PROT_EN_MM_M0 BIT(1) #define MT8173_TOP_AXI_PROT_EN_MM_M1 BIT(2) -- cgit From a49d5e7a89d644a5c0ddc851be4bbf08614e6015 Mon Sep 17 00:00:00 2001 From: Weiyi Lu Date: Fri, 30 Oct 2020 12:36:22 +0100 Subject: soc: mediatek: pm-domains: Add support for mt8192 Add the needed board data to support mt8192 SoC. Signed-off-by: Weiyi Lu Signed-off-by: Enric Balletbo i Serra Tested-by: Weiyi Lu Link: https://lore.kernel.org/r/20201030113622.201188-17-enric.balletbo@collabora.com Signed-off-by: Matthias Brugger --- drivers/soc/mediatek/mt8192-pm-domains.h | 292 +++++++++++++++++++++++++++++++ drivers/soc/mediatek/mtk-pm-domains.c | 5 + include/linux/soc/mediatek/infracfg.h | 56 ++++++ 3 files changed, 353 insertions(+) create mode 100644 drivers/soc/mediatek/mt8192-pm-domains.h (limited to 'include/linux') diff --git a/drivers/soc/mediatek/mt8192-pm-domains.h b/drivers/soc/mediatek/mt8192-pm-domains.h new file mode 100644 index 000000000000..0fdf6dc6231f --- /dev/null +++ b/drivers/soc/mediatek/mt8192-pm-domains.h @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __SOC_MEDIATEK_MT8192_PM_DOMAINS_H +#define __SOC_MEDIATEK_MT8192_PM_DOMAINS_H + +#include "mtk-pm-domains.h" +#include + +/* + * MT8192 power domain support + */ + +static const struct scpsys_domain_data scpsys_domain_data_mt8192[] = { + [MT8192_POWER_DOMAIN_AUDIO] = { + .sta_mask = BIT(21), + .ctl_offs = 0x0354, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_2_AUDIO, + MT8192_TOP_AXI_PROT_EN_2_SET, + MT8192_TOP_AXI_PROT_EN_2_CLR, + MT8192_TOP_AXI_PROT_EN_2_STA1), + }, + }, + [MT8192_POWER_DOMAIN_CONN] = { + .sta_mask = PWR_STATUS_CONN, + .ctl_offs = 0x0304, + .sram_pdn_bits = 0, + .sram_pdn_ack_bits = 0, + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_CONN, + MT8192_TOP_AXI_PROT_EN_SET, + MT8192_TOP_AXI_PROT_EN_CLR, + MT8192_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_CONN_2ND, + MT8192_TOP_AXI_PROT_EN_SET, + MT8192_TOP_AXI_PROT_EN_CLR, + MT8192_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_1_CONN, + MT8192_TOP_AXI_PROT_EN_1_SET, + MT8192_TOP_AXI_PROT_EN_1_CLR, + MT8192_TOP_AXI_PROT_EN_1_STA1), + }, + .caps = MTK_SCPD_KEEP_DEFAULT_OFF, + }, + [MT8192_POWER_DOMAIN_MFG0] = { + .sta_mask = BIT(2), + .ctl_offs = 0x0308, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_MFG1] = { + .sta_mask = BIT(3), + .ctl_offs = 0x030c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_1_MFG1, + MT8192_TOP_AXI_PROT_EN_1_SET, + MT8192_TOP_AXI_PROT_EN_1_CLR, + MT8192_TOP_AXI_PROT_EN_1_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_2_MFG1, + MT8192_TOP_AXI_PROT_EN_2_SET, + MT8192_TOP_AXI_PROT_EN_2_CLR, + MT8192_TOP_AXI_PROT_EN_2_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MFG1, + MT8192_TOP_AXI_PROT_EN_SET, + MT8192_TOP_AXI_PROT_EN_CLR, + MT8192_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_2_MFG1_2ND, + MT8192_TOP_AXI_PROT_EN_2_SET, + MT8192_TOP_AXI_PROT_EN_2_CLR, + MT8192_TOP_AXI_PROT_EN_2_STA1), + }, + }, + [MT8192_POWER_DOMAIN_MFG2] = { + .sta_mask = BIT(4), + .ctl_offs = 0x0310, + .sram_pdn_bits = GENMASK(8, 8), + 
.sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_MFG3] = { + .sta_mask = BIT(5), + .ctl_offs = 0x0314, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_MFG4] = { + .sta_mask = BIT(6), + .ctl_offs = 0x0318, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_MFG5] = { + .sta_mask = BIT(7), + .ctl_offs = 0x031c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_MFG6] = { + .sta_mask = BIT(8), + .ctl_offs = 0x0320, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_DISP] = { + .sta_mask = BIT(20), + .ctl_offs = 0x0350, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR_IGN(MT8192_TOP_AXI_PROT_EN_MM_DISP, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR_IGN(MT8192_TOP_AXI_PROT_EN_MM_2_DISP, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_DISP, + MT8192_TOP_AXI_PROT_EN_SET, + MT8192_TOP_AXI_PROT_EN_CLR, + MT8192_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_DISP_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_2_DISP_2ND, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + }, + }, + [MT8192_POWER_DOMAIN_IPE] = { + .sta_mask = BIT(14), + .ctl_offs = 0x0338, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_IPE, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_IPE_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + }, + }, + [MT8192_POWER_DOMAIN_ISP] = { + .sta_mask = BIT(12), + .ctl_offs = 0x0330, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_2_ISP, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_2_ISP_2ND, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + }, + }, + [MT8192_POWER_DOMAIN_ISP2] = { + .sta_mask = BIT(13), + .ctl_offs = 0x0334, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_ISP2, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_ISP2_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + }, + }, + [MT8192_POWER_DOMAIN_MDP] = { + .sta_mask = BIT(19), + .ctl_offs = 0x034c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_2_MDP, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_2_MDP_2ND, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + }, + }, + 
[MT8192_POWER_DOMAIN_VENC] = { + .sta_mask = BIT(17), + .ctl_offs = 0x0344, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_VENC, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_VENC_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + }, + }, + [MT8192_POWER_DOMAIN_VDEC] = { + .sta_mask = BIT(15), + .ctl_offs = 0x033c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_VDEC, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_VDEC_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + }, + }, + [MT8192_POWER_DOMAIN_VDEC2] = { + .sta_mask = BIT(16), + .ctl_offs = 0x0340, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_CAM] = { + .sta_mask = BIT(23), + .ctl_offs = 0x035c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_2_CAM, + MT8192_TOP_AXI_PROT_EN_2_SET, + MT8192_TOP_AXI_PROT_EN_2_CLR, + MT8192_TOP_AXI_PROT_EN_2_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_CAM, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_1_CAM, + MT8192_TOP_AXI_PROT_EN_1_SET, + MT8192_TOP_AXI_PROT_EN_1_CLR, + MT8192_TOP_AXI_PROT_EN_1_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_CAM_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_VDNR_CAM, + MT8192_TOP_AXI_PROT_EN_VDNR_SET, + MT8192_TOP_AXI_PROT_EN_VDNR_CLR, + MT8192_TOP_AXI_PROT_EN_VDNR_STA1), + }, + }, + [MT8192_POWER_DOMAIN_CAM_RAWA] = { + .sta_mask = BIT(24), + .ctl_offs = 0x0360, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_CAM_RAWB] = { + .sta_mask = BIT(25), + .ctl_offs = 0x0364, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_CAM_RAWC] = { + .sta_mask = BIT(26), + .ctl_offs = 0x0368, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, +}; + +static const struct scpsys_soc_data mt8192_scpsys_data = { + .domains_data = scpsys_domain_data_mt8192, + .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8192), + .pwr_sta_offs = 0x016c, + .pwr_sta2nd_offs = 0x0170, +}; + +#endif /* __SOC_MEDIATEK_MT8192_PM_DOMAINS_H */ diff --git a/drivers/soc/mediatek/mtk-pm-domains.c b/drivers/soc/mediatek/mtk-pm-domains.c index c3b85b69f2f7..fb70cb3b07b3 100644 --- a/drivers/soc/mediatek/mtk-pm-domains.c +++ b/drivers/soc/mediatek/mtk-pm-domains.c @@ -17,6 +17,7 @@ #include "mt8173-pm-domains.h" #include "mt8183-pm-domains.h" +#include "mt8192-pm-domains.h" #define MTK_POLL_DELAY_US 10 #define MTK_POLL_TIMEOUT USEC_PER_SEC @@ -521,6 +522,10 @@ static const struct of_device_id scpsys_of_match[] = { .compatible = "mediatek,mt8183-power-controller", .data = &mt8183_scpsys_data, }, + { + .compatible = "mediatek,mt8192-power-controller", + .data = &mt8192_scpsys_data, + }, { } }; diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h index 
9d01e32e19bc..e7842debc05d 100644 --- a/include/linux/soc/mediatek/infracfg.h +++ b/include/linux/soc/mediatek/infracfg.h @@ -2,6 +2,62 @@ #ifndef __SOC_MEDIATEK_INFRACFG_H #define __SOC_MEDIATEK_INFRACFG_H +#define MT8192_TOP_AXI_PROT_EN_STA1 0x228 +#define MT8192_TOP_AXI_PROT_EN_1_STA1 0x258 +#define MT8192_TOP_AXI_PROT_EN_SET 0x2a0 +#define MT8192_TOP_AXI_PROT_EN_CLR 0x2a4 +#define MT8192_TOP_AXI_PROT_EN_1_SET 0x2a8 +#define MT8192_TOP_AXI_PROT_EN_1_CLR 0x2ac +#define MT8192_TOP_AXI_PROT_EN_MM_SET 0x2d4 +#define MT8192_TOP_AXI_PROT_EN_MM_CLR 0x2d8 +#define MT8192_TOP_AXI_PROT_EN_MM_STA1 0x2ec +#define MT8192_TOP_AXI_PROT_EN_2_SET 0x714 +#define MT8192_TOP_AXI_PROT_EN_2_CLR 0x718 +#define MT8192_TOP_AXI_PROT_EN_2_STA1 0x724 +#define MT8192_TOP_AXI_PROT_EN_VDNR_SET 0xb84 +#define MT8192_TOP_AXI_PROT_EN_VDNR_CLR 0xb88 +#define MT8192_TOP_AXI_PROT_EN_VDNR_STA1 0xb90 +#define MT8192_TOP_AXI_PROT_EN_MM_2_SET 0xdcc +#define MT8192_TOP_AXI_PROT_EN_MM_2_CLR 0xdd0 +#define MT8192_TOP_AXI_PROT_EN_MM_2_STA1 0xdd8 + +#define MT8192_TOP_AXI_PROT_EN_DISP (BIT(6) | BIT(23)) +#define MT8192_TOP_AXI_PROT_EN_CONN (BIT(13) | BIT(18)) +#define MT8192_TOP_AXI_PROT_EN_CONN_2ND BIT(14) +#define MT8192_TOP_AXI_PROT_EN_MFG1 GENMASK(22, 21) +#define MT8192_TOP_AXI_PROT_EN_1_CONN BIT(10) +#define MT8192_TOP_AXI_PROT_EN_1_MFG1 BIT(21) +#define MT8192_TOP_AXI_PROT_EN_1_CAM BIT(22) +#define MT8192_TOP_AXI_PROT_EN_2_CAM BIT(0) +#define MT8192_TOP_AXI_PROT_EN_2_ADSP BIT(3) +#define MT8192_TOP_AXI_PROT_EN_2_AUDIO BIT(4) +#define MT8192_TOP_AXI_PROT_EN_2_MFG1 GENMASK(6, 5) +#define MT8192_TOP_AXI_PROT_EN_2_MFG1_2ND BIT(7) +#define MT8192_TOP_AXI_PROT_EN_MM_CAM (BIT(0) | BIT(2)) +#define MT8192_TOP_AXI_PROT_EN_MM_DISP (BIT(0) | BIT(2) | \ + BIT(10) | BIT(12) | \ + BIT(14) | BIT(16) | \ + BIT(24) | BIT(26)) +#define MT8192_TOP_AXI_PROT_EN_MM_CAM_2ND (BIT(1) | BIT(3)) +#define MT8192_TOP_AXI_PROT_EN_MM_DISP_2ND (BIT(1) | BIT(3) | \ + BIT(15) | BIT(17) | \ + BIT(25) | BIT(27)) +#define MT8192_TOP_AXI_PROT_EN_MM_ISP2 BIT(14) +#define MT8192_TOP_AXI_PROT_EN_MM_ISP2_2ND BIT(15) +#define MT8192_TOP_AXI_PROT_EN_MM_IPE BIT(16) +#define MT8192_TOP_AXI_PROT_EN_MM_IPE_2ND BIT(17) +#define MT8192_TOP_AXI_PROT_EN_MM_VDEC BIT(24) +#define MT8192_TOP_AXI_PROT_EN_MM_VDEC_2ND BIT(25) +#define MT8192_TOP_AXI_PROT_EN_MM_VENC BIT(26) +#define MT8192_TOP_AXI_PROT_EN_MM_VENC_2ND BIT(27) +#define MT8192_TOP_AXI_PROT_EN_MM_2_ISP BIT(8) +#define MT8192_TOP_AXI_PROT_EN_MM_2_DISP (BIT(8) | BIT(12)) +#define MT8192_TOP_AXI_PROT_EN_MM_2_ISP_2ND BIT(9) +#define MT8192_TOP_AXI_PROT_EN_MM_2_DISP_2ND (BIT(9) | BIT(13)) +#define MT8192_TOP_AXI_PROT_EN_MM_2_MDP BIT(12) +#define MT8192_TOP_AXI_PROT_EN_MM_2_MDP_2ND BIT(13) +#define MT8192_TOP_AXI_PROT_EN_VDNR_CAM BIT(21) + #define MT8183_TOP_AXI_PROT_EN_STA1 0x228 #define MT8183_TOP_AXI_PROT_EN_STA1_1 0x258 #define MT8183_TOP_AXI_PROT_EN_SET 0x2a0 -- cgit From 431ec7bd4d52caa2fc20fbe87744f522e3d1efad Mon Sep 17 00:00:00 2001 From: Michael Klein Date: Fri, 27 Nov 2020 13:52:01 +0100 Subject: mfd: si476x-core.h: Fix "regulator" spelling in comment "regualtor" -> "regulator" Signed-off-by: Michael Klein Signed-off-by: Lee Jones --- include/linux/mfd/si476x-core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/si476x-core.h b/include/linux/mfd/si476x-core.h index 4708c2b8512a..dd95c37ca134 100644 --- a/include/linux/mfd/si476x-core.h +++ b/include/linux/mfd/si476x-core.h @@ -57,7 +57,7 @@ enum si476x_mfd_cells { * @SI476X_POWER_DOWN: In this state 
all regulators are turned off * and the reset line is pulled low. The device is completely * inactive. - * @SI476X_POWER_UP_FULL: In this state all the power regualtors are + * @SI476X_POWER_UP_FULL: In this state all the power regulators are * turned on, reset line pulled high, IRQ line is enabled(polling is * active for polling use scenario) and device is turned on with * POWER_UP command. The device is ready to be used. -- cgit From 6a0eaf5123e0e1223252b88cd5775f74105e27bd Mon Sep 17 00:00:00 2001 From: Dean Camera Date: Thu, 26 Nov 2020 09:39:34 +1100 Subject: HID: Increase HID maximum report size to 16KB Currently the maximum HID report size which can be buffered by the kernel is 8KB. This is sufficient for the vast majority of HID devices on the market, as most HID reports are fairly small. However, some unusual devices such as the Elgato Stream Deck exist which use a report size slightly over 8KB for the image data that is sent to the device. Reports this large cannot currently be buffered by the regular HID subsystem, thus the only way to use such a device is to bypass the HID subsystem entirely. This increases the maximum HID report size to 16KB, which should cover all sanely designed HID devices. Signed-off-by: Dean Camera Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index b079146850e6..c39d71eb1fd0 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -494,7 +494,7 @@ struct hid_report_enum { }; #define HID_MIN_BUFFER_SIZE 64 /* make sure there is at least a packet size of space */ -#define HID_MAX_BUFFER_SIZE 8192 /* 8kb */ +#define HID_MAX_BUFFER_SIZE 16384 /* 16kb */ #define HID_CONTROL_FIFO_SIZE 256 /* to init devices with >100 reports */ #define HID_OUTPUT_FIFO_SIZE 64 -- cgit From fd3bb8f54a88107570334c156efb0c724a261003 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 27 Nov 2020 10:28:34 +0000 Subject: nvmem: core: Add support for keepout regions Introduce support into the nvmem core for arrays of register ranges that should not result in actual device access. For these regions a constant byte (repeated) is returned instead on read, and writes are quietly ignored and returned as successful. This is useful for instance if certain efuse regions are protected from access by Linux because they contain secret info for another part of the system (like an integrated modem). 
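A hedged sketch of how a provider might describe such a region; the device name, the read callback, and the offsets are made up for illustration, and only struct nvmem_keepout plus the keepout/nkeepout config fields come from the patch below:

#include <linux/kernel.h>
#include <linux/nvmem-provider.h>

static int example_reg_read(void *priv, unsigned int offset,
			    void *val, size_t bytes);	/* provider's real accessor */

/* Hypothetical efuse: bytes 0x20..0x3f hold a modem secret Linux must not read. */
static const struct nvmem_keepout example_keepout[] = {
	{ .start = 0x20, .end = 0x40, .value = 0xff },	/* reads return 0xff */
};

static struct nvmem_config example_config = {
	/* .dev, .size etc. omitted; set as usual before nvmem_register() */
	.name		= "example-efuse",
	.word_size	= 1,
	.stride		= 4,	/* keepout bounds must be stride-aligned */
	.reg_read	= example_reg_read,
	.keepout	= example_keepout,
	.nkeepout	= ARRAY_SIZE(example_keepout),
};

nvmem_register() validates the table (sorted, non-overlapping, word_size and stride respected), so ordering the array by start offset is part of the contract.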
Signed-off-by: Evan Green Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20201127102837.19366-3-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 153 +++++++++++++++++++++++++++++++++++++++-- include/linux/nvmem-provider.h | 17 +++++ 2 files changed, 166 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index a09ff8409f60..177f5bf27c6d 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -34,6 +34,8 @@ struct nvmem_device { struct bin_attribute eeprom; struct device *base_dev; struct list_head cells; + const struct nvmem_keepout *keepout; + unsigned int nkeepout; nvmem_reg_read_t reg_read; nvmem_reg_write_t reg_write; struct gpio_desc *wp_gpio; @@ -66,8 +68,8 @@ static LIST_HEAD(nvmem_lookup_list); static BLOCKING_NOTIFIER_HEAD(nvmem_notifier); -static int nvmem_reg_read(struct nvmem_device *nvmem, unsigned int offset, - void *val, size_t bytes) +static int __nvmem_reg_read(struct nvmem_device *nvmem, unsigned int offset, + void *val, size_t bytes) { if (nvmem->reg_read) return nvmem->reg_read(nvmem->priv, offset, val, bytes); @@ -75,8 +77,8 @@ static int nvmem_reg_read(struct nvmem_device *nvmem, unsigned int offset, return -EINVAL; } -static int nvmem_reg_write(struct nvmem_device *nvmem, unsigned int offset, - void *val, size_t bytes) +static int __nvmem_reg_write(struct nvmem_device *nvmem, unsigned int offset, + void *val, size_t bytes) { int ret; @@ -90,6 +92,88 @@ static int nvmem_reg_write(struct nvmem_device *nvmem, unsigned int offset, return -EINVAL; } +static int nvmem_access_with_keepouts(struct nvmem_device *nvmem, + unsigned int offset, void *val, + size_t bytes, int write) +{ + + unsigned int end = offset + bytes; + unsigned int kend, ksize; + const struct nvmem_keepout *keepout = nvmem->keepout; + const struct nvmem_keepout *keepoutend = keepout + nvmem->nkeepout; + int rc; + + /* + * Skip all keepouts before the range being accessed. + * Keepouts are sorted. + */ + while ((keepout < keepoutend) && (keepout->end <= offset)) + keepout++; + + while ((offset < end) && (keepout < keepoutend)) { + /* Access the valid portion before the keepout. */ + if (offset < keepout->start) { + kend = min(end, keepout->start); + ksize = kend - offset; + if (write) + rc = __nvmem_reg_write(nvmem, offset, val, ksize); + else + rc = __nvmem_reg_read(nvmem, offset, val, ksize); + + if (rc) + return rc; + + offset += ksize; + val += ksize; + } + + /* + * Now we're aligned to the start of this keepout zone. Go + * through it. 
+ */ + kend = min(end, keepout->end); + ksize = kend - offset; + if (!write) + memset(val, keepout->value, ksize); + + val += ksize; + offset += ksize; + keepout++; + } + + /* + * If we ran out of keepouts but there's still stuff to do, send it + * down directly + */ + if (offset < end) { + ksize = end - offset; + if (write) + return __nvmem_reg_write(nvmem, offset, val, ksize); + else + return __nvmem_reg_read(nvmem, offset, val, ksize); + } + + return 0; +} + +static int nvmem_reg_read(struct nvmem_device *nvmem, unsigned int offset, + void *val, size_t bytes) +{ + if (!nvmem->nkeepout) + return __nvmem_reg_read(nvmem, offset, val, bytes); + + return nvmem_access_with_keepouts(nvmem, offset, val, bytes, false); +} + +static int nvmem_reg_write(struct nvmem_device *nvmem, unsigned int offset, + void *val, size_t bytes) +{ + if (!nvmem->nkeepout) + return __nvmem_reg_write(nvmem, offset, val, bytes); + + return nvmem_access_with_keepouts(nvmem, offset, val, bytes, true); +} + #ifdef CONFIG_NVMEM_SYSFS static const char * const nvmem_type_str[] = { [NVMEM_TYPE_UNKNOWN] = "Unknown", @@ -533,6 +617,59 @@ nvmem_find_cell_by_name(struct nvmem_device *nvmem, const char *cell_id) return cell; } +static int nvmem_validate_keepouts(struct nvmem_device *nvmem) +{ + unsigned int cur = 0; + const struct nvmem_keepout *keepout = nvmem->keepout; + const struct nvmem_keepout *keepoutend = keepout + nvmem->nkeepout; + + while (keepout < keepoutend) { + /* Ensure keepouts are sorted and don't overlap. */ + if (keepout->start < cur) { + dev_err(&nvmem->dev, + "Keepout regions aren't sorted or overlap.\n"); + + return -ERANGE; + } + + if (keepout->end < keepout->start) { + dev_err(&nvmem->dev, + "Invalid keepout region.\n"); + + return -EINVAL; + } + + /* + * Validate keepouts (and holes between) don't violate + * word_size constraints. + */ + if ((keepout->end - keepout->start < nvmem->word_size) || + ((keepout->start != cur) && + (keepout->start - cur < nvmem->word_size))) { + + dev_err(&nvmem->dev, + "Keepout regions violate word_size constraints.\n"); + + return -ERANGE; + } + + /* Validate keepouts don't violate stride (alignment). */ + if (!IS_ALIGNED(keepout->start, nvmem->stride) || + !IS_ALIGNED(keepout->end, nvmem->stride)) { + + dev_err(&nvmem->dev, + "Keepout regions violate stride.\n"); + + return -EINVAL; + } + + cur = keepout->end; + keepout++; + } + + return 0; +} + static int nvmem_add_cells_from_of(struct nvmem_device *nvmem) { struct device_node *parent, *child; @@ -647,6 +784,8 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) nvmem->type = config->type; nvmem->reg_read = config->reg_read; nvmem->reg_write = config->reg_write; + nvmem->keepout = config->keepout; + nvmem->nkeepout = config->nkeepout; if (!config->no_of_node) nvmem->dev.of_node = config->dev->of_node; @@ -671,6 +810,12 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) nvmem->dev.groups = nvmem_dev_groups; #endif + if (nvmem->nkeepout) { + rval = nvmem_validate_keepouts(nvmem); + if (rval) + goto err_put_device; + } + dev_dbg(&nvmem->dev, "Registering nvmem device %s\n", config->name); rval = device_register(&nvmem->dev); diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 06409a6c40bc..e162b757b6d5 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -30,6 +30,19 @@ enum nvmem_type { #define NVMEM_DEVID_NONE (-1) #define NVMEM_DEVID_AUTO (-2) +/** + * struct nvmem_keepout - NVMEM register keepout range. 
+ * + * @start: The first byte offset to avoid. + * @end: One beyond the last byte offset to avoid. + * @value: The byte to fill reads with for this region. + */ +struct nvmem_keepout { + unsigned int start; + unsigned int end; + unsigned char value; +}; + /** * struct nvmem_config - NVMEM device configuration * @@ -39,6 +52,8 @@ enum nvmem_type { * @owner: Pointer to exporter module. Used for refcounting. * @cells: Optional array of pre-defined NVMEM cells. * @ncells: Number of elements in cells. + * @keepout: Optional array of keepout ranges (sorted ascending by start). + * @nkeepout: Number of elements in the keepout array. * @type: Type of the nvmem storage * @read_only: Device is read-only. * @root_only: Device is accessibly to root only. * @@ -66,6 +81,8 @@ struct nvmem_config { struct gpio_desc *wp_gpio; const struct nvmem_cell_info *cells; int ncells; + const struct nvmem_keepout *keepout; + unsigned int nkeepout; enum nvmem_type type; bool read_only; bool root_only; -- cgit From 24c8a743336a1fdf42c0c768b4435633069c6a39 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 30 Sep 2020 20:48:02 +0200 Subject: pcmcia: at91_cf: move definitions locally struct at91_cf_data is only used in the driver since all the platforms moved to device tree, so move its definition locally. Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20200930184804.3127757-1-alexandre.belloni@bootlin.com --- drivers/pcmcia/at91_cf.c | 12 +++++++++++- include/linux/platform_data/atmel.h | 12 ------------ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/pcmcia/at91_cf.c b/drivers/pcmcia/at91_cf.c index 7db0e9c74dfc..ed60c4dffecb 100644 --- a/drivers/pcmcia/at91_cf.c +++ b/drivers/pcmcia/at91_cf.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,17 @@ #define CF_IO_PHYS (1 << 23) #define CF_MEM_PHYS (0x017ff800) +struct at91_cf_data { + int irq_pin; /* I/O IRQ */ + int det_pin; /* Card detect */ + int vcc_pin; /* power switching */ + int rst_pin; /* card reset */ + u8 chipselect; /* EBI Chip Select number */ + u8 flags; +#define AT91_CF_TRUE_IDE 0x01 +#define AT91_IDE_SWAP_A0_A2 0x02 +}; + struct regmap *mc; /*--------------------------------------------------------------------------*/ diff --git a/include/linux/platform_data/atmel.h b/include/linux/platform_data/atmel.h index 99e6069c5fd8..73f63be509c4 100644 --- a/include/linux/platform_data/atmel.h +++ b/include/linux/platform_data/atmel.h @@ -6,18 +6,6 @@ #ifndef __ATMEL_H__ #define __ATMEL_H__ - /* Compact Flash */ -struct at91_cf_data { - int irq_pin; /* I/O IRQ */ - int det_pin; /* Card detect */ - int vcc_pin; /* power switching */ - int rst_pin; /* card reset */ - u8 chipselect; /* EBI Chip Select number */ - u8 flags; -#define AT91_CF_TRUE_IDE 0x01 -#define AT91_IDE_SWAP_A0_A2 0x02 -}; - /* FIXME: this needs a better location, but gets stuff building again */ #ifdef CONFIG_ATMEL_PM extern int at91_suspend_entering_slow_clock(void); -- cgit From a69dcdfc2dd21f86cb1f79f98fc94c52f96cff64 Mon Sep 17 00:00:00 2001 From: Chun-Kuang Hu Date: Mon, 2 Nov 2020 08:04:38 +0800 Subject: soc / drm: mediatek: cmdq: Remove timeout handler in helper function For each client driver, its timeout handler needs to dump hardware registers or its state machine information, and the way timeouts are detected also differs between drivers, so remove the timeout handler from the helper function and let each client driver implement its own timeout handler. 
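With the timer gone from the helper, a client that still wants a watchdog has to carry its own. A hedged sketch of what that could look like on top of the new API; the context struct, the 2000 ms budget, and the timer plumbing are illustrative, not code from this series:

#include <linux/device.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/soc/mediatek/mtk-cmdq.h>

struct my_cmdq_ctx {
	struct cmdq_client *client;
	struct timer_list watchdog;	/* timer_setup(&watchdog, my_cmdq_timeout, 0) at init */
};

static void my_cmdq_timeout(struct timer_list *t)
{
	struct my_cmdq_ctx *ctx = from_timer(ctx, t, watchdog);

	/* dump whatever hardware registers / state this client cares about */
	dev_err(ctx->client->client.dev, "cmdq pkt timed out\n");
}

static int my_flush(struct my_cmdq_ctx *ctx, struct cmdq_pkt *pkt,
		    cmdq_async_flush_cb cb, void *data)
{
	/* arm a per-client deadline; cb is expected to del_timer() on completion */
	mod_timer(&ctx->watchdog, jiffies + msecs_to_jiffies(2000));
	return cmdq_pkt_flush_async(pkt, cb, data);
}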
Signed-off-by: Chun-Kuang Hu Acked-by: Matthias Brugger Link: https://lore.kernel.org/r/20201102000438.29225-1-chunkuang.hu@kernel.org Signed-off-by: Matthias Brugger --- drivers/gpu/drm/mediatek/mtk_drm_crtc.c | 3 +-- drivers/soc/mediatek/mtk-cmdq-helper.c | 41 +-------------------------------- include/linux/soc/mediatek/mtk-cmdq.h | 10 +------- 3 files changed, 3 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c index ac038572164d..4be5d1fccf2e 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c @@ -824,8 +824,7 @@ int mtk_drm_crtc_create(struct drm_device *drm_dev, #if IS_REACHABLE(CONFIG_MTK_CMDQ) mtk_crtc->cmdq_client = cmdq_mbox_create(mtk_crtc->mmsys_dev, - drm_crtc_index(&mtk_crtc->base), - 2000); + drm_crtc_index(&mtk_crtc->base)); if (IS_ERR(mtk_crtc->cmdq_client)) { dev_dbg(dev, "mtk_crtc %d failed to create mailbox client, writing register by CPU now\n", drm_crtc_index(&mtk_crtc->base)); diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c index 505651b0d715..280d3bd9f675 100644 --- a/drivers/soc/mediatek/mtk-cmdq-helper.c +++ b/drivers/soc/mediatek/mtk-cmdq-helper.c @@ -70,14 +70,7 @@ int cmdq_dev_get_client_reg(struct device *dev, } EXPORT_SYMBOL(cmdq_dev_get_client_reg); -static void cmdq_client_timeout(struct timer_list *t) -{ - struct cmdq_client *client = from_timer(client, t, timer); - - dev_err(client->client.dev, "cmdq timeout!\n"); -} - -struct cmdq_client *cmdq_mbox_create(struct device *dev, int index, u32 timeout) +struct cmdq_client *cmdq_mbox_create(struct device *dev, int index) { struct cmdq_client *client; @@ -85,12 +78,6 @@ struct cmdq_client *cmdq_mbox_create(struct device *dev, int index, u32 timeout) if (!client) return (struct cmdq_client *)-ENOMEM; - client->timeout_ms = timeout; - if (timeout != CMDQ_NO_TIMEOUT) { - spin_lock_init(&client->lock); - timer_setup(&client->timer, cmdq_client_timeout, 0); - } - client->pkt_cnt = 0; client->client.dev = dev; client->client.tx_block = false; client->client.knows_txdone = true; @@ -112,11 +99,6 @@ EXPORT_SYMBOL(cmdq_mbox_create); void cmdq_mbox_destroy(struct cmdq_client *client) { - if (client->timeout_ms != CMDQ_NO_TIMEOUT) { - spin_lock(&client->lock); - del_timer_sync(&client->timer); - spin_unlock(&client->lock); - } mbox_free_channel(client->chan); kfree(client); } @@ -449,18 +431,6 @@ static void cmdq_pkt_flush_async_cb(struct cmdq_cb_data data) struct cmdq_task_cb *cb = &pkt->cb; struct cmdq_client *client = (struct cmdq_client *)pkt->cl; - if (client->timeout_ms != CMDQ_NO_TIMEOUT) { - unsigned long flags = 0; - - spin_lock_irqsave(&client->lock, flags); - if (--client->pkt_cnt == 0) - del_timer(&client->timer); - else - mod_timer(&client->timer, jiffies + - msecs_to_jiffies(client->timeout_ms)); - spin_unlock_irqrestore(&client->lock, flags); - } - dma_sync_single_for_cpu(client->chan->mbox->dev, pkt->pa_base, pkt->cmd_buf_size, DMA_TO_DEVICE); if (cb->cb) { @@ -473,7 +443,6 @@ int cmdq_pkt_flush_async(struct cmdq_pkt *pkt, cmdq_async_flush_cb cb, void *data) { int err; - unsigned long flags = 0; struct cmdq_client *client = (struct cmdq_client *)pkt->cl; pkt->cb.cb = cb; @@ -484,14 +453,6 @@ int cmdq_pkt_flush_async(struct cmdq_pkt *pkt, cmdq_async_flush_cb cb, dma_sync_single_for_device(client->chan->mbox->dev, pkt->pa_base, pkt->cmd_buf_size, DMA_TO_DEVICE); - if (client->timeout_ms != CMDQ_NO_TIMEOUT) { - 
spin_lock_irqsave(&client->lock, flags); - if (client->pkt_cnt++ == 0) - mod_timer(&client->timer, jiffies + - msecs_to_jiffies(client->timeout_ms)); - spin_unlock_irqrestore(&client->lock, flags); - } - err = mbox_send_message(client->chan, pkt); if (err < 0) return err; diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index 960704d75994..8e9996610978 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -11,7 +11,6 @@ #include #include -#define CMDQ_NO_TIMEOUT 0xffffffffu #define CMDQ_ADDR_HIGH(addr) ((u32)(((addr) >> 16) & GENMASK(31, 0))) #define CMDQ_ADDR_LOW(addr) ((u16)(addr) | BIT(1)) @@ -24,12 +23,8 @@ struct cmdq_client_reg { }; struct cmdq_client { - spinlock_t lock; - u32 pkt_cnt; struct mbox_client client; struct mbox_chan *chan; - struct timer_list timer; - u32 timeout_ms; /* in unit of microsecond */ }; /** @@ -51,13 +46,10 @@ int cmdq_dev_get_client_reg(struct device *dev, * cmdq_mbox_create() - create CMDQ mailbox client and channel * @dev: device of CMDQ mailbox client * @index: index of CMDQ mailbox channel - * @timeout: timeout of a pkt execution by GCE, in unit of microsecond, set - * CMDQ_NO_TIMEOUT if a timer is not used. * * Return: CMDQ mailbox client pointer */ -struct cmdq_client *cmdq_mbox_create(struct device *dev, int index, - u32 timeout); +struct cmdq_client *cmdq_mbox_create(struct device *dev, int index); /** * cmdq_mbox_destroy() - destroy CMDQ mailbox client and channel -- cgit From 51c0e618b219c025ddaaf14baea8942cb7e2105b Mon Sep 17 00:00:00 2001 From: Yongqiang Niu Date: Tue, 6 Oct 2020 21:33:17 +0200 Subject: soc / drm: mediatek: Move DDP component defines into mtk-mmsys.h MMSYS is the driver which controls the routing of these DDP components, so the definition of the mtk_ddp_comp_id enum should be placed in mtk-mmsys.h Signed-off-by: Yongqiang Niu Signed-off-by: Enric Balletbo i Serra Reviewed-by: Chun-Kuang Hu Link: https://lore.kernel.org/r/20201006193320.405529-2-enric.balletbo@collabora.com Signed-off-by: Matthias Brugger --- drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.h | 34 +---------------------------- drivers/soc/mediatek/mtk-mmsys.c | 4 +--- include/linux/soc/mediatek/mtk-mmsys.h | 33 ++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.h b/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.h index 1d9e00b69462..5aa52b7afeec 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.h +++ b/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.h @@ -7,6 +7,7 @@ #define MTK_DRM_DDP_COMP_H #include +#include struct device; struct device_node; @@ -35,39 +36,6 @@ enum mtk_ddp_comp_type { MTK_DDP_COMP_TYPE_MAX, }; -enum mtk_ddp_comp_id { - DDP_COMPONENT_AAL0, - DDP_COMPONENT_AAL1, - DDP_COMPONENT_BLS, - DDP_COMPONENT_CCORR, - DDP_COMPONENT_COLOR0, - DDP_COMPONENT_COLOR1, - DDP_COMPONENT_DITHER, - DDP_COMPONENT_DPI0, - DDP_COMPONENT_DPI1, - DDP_COMPONENT_DSI0, - DDP_COMPONENT_DSI1, - DDP_COMPONENT_DSI2, - DDP_COMPONENT_DSI3, - DDP_COMPONENT_GAMMA, - DDP_COMPONENT_OD0, - DDP_COMPONENT_OD1, - DDP_COMPONENT_OVL0, - DDP_COMPONENT_OVL_2L0, - DDP_COMPONENT_OVL_2L1, - DDP_COMPONENT_OVL1, - DDP_COMPONENT_PWM0, - DDP_COMPONENT_PWM1, - DDP_COMPONENT_PWM2, - DDP_COMPONENT_RDMA0, - DDP_COMPONENT_RDMA1, - DDP_COMPONENT_RDMA2, - DDP_COMPONENT_UFOE, - DDP_COMPONENT_WDMA0, - DDP_COMPONENT_WDMA1, - DDP_COMPONENT_ID_MAX, -}; - struct mtk_ddp_comp; struct cmdq_pkt; struct mtk_ddp_comp_funcs { diff 
--git a/drivers/soc/mediatek/mtk-mmsys.c b/drivers/soc/mediatek/mtk-mmsys.c index a55f25511173..36ad66bb221b 100644 --- a/drivers/soc/mediatek/mtk-mmsys.c +++ b/drivers/soc/mediatek/mtk-mmsys.c @@ -5,13 +5,11 @@ */ #include +#include #include #include #include -#include "../../gpu/drm/mediatek/mtk_drm_ddp.h" -#include "../../gpu/drm/mediatek/mtk_drm_ddp_comp.h" - #define DISP_REG_CONFIG_DISP_OVL0_MOUT_EN 0x040 #define DISP_REG_CONFIG_DISP_OVL1_MOUT_EN 0x044 #define DISP_REG_CONFIG_DISP_OD_MOUT_EN 0x048 diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h index 7bab5d9a3d31..2228bf6133da 100644 --- a/include/linux/soc/mediatek/mtk-mmsys.h +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -9,6 +9,39 @@ enum mtk_ddp_comp_id; struct device; +enum mtk_ddp_comp_id { + DDP_COMPONENT_AAL0, + DDP_COMPONENT_AAL1, + DDP_COMPONENT_BLS, + DDP_COMPONENT_CCORR, + DDP_COMPONENT_COLOR0, + DDP_COMPONENT_COLOR1, + DDP_COMPONENT_DITHER, + DDP_COMPONENT_DPI0, + DDP_COMPONENT_DPI1, + DDP_COMPONENT_DSI0, + DDP_COMPONENT_DSI1, + DDP_COMPONENT_DSI2, + DDP_COMPONENT_DSI3, + DDP_COMPONENT_GAMMA, + DDP_COMPONENT_OD0, + DDP_COMPONENT_OD1, + DDP_COMPONENT_OVL0, + DDP_COMPONENT_OVL_2L0, + DDP_COMPONENT_OVL_2L1, + DDP_COMPONENT_OVL1, + DDP_COMPONENT_PWM0, + DDP_COMPONENT_PWM1, + DDP_COMPONENT_PWM2, + DDP_COMPONENT_RDMA0, + DDP_COMPONENT_RDMA1, + DDP_COMPONENT_RDMA2, + DDP_COMPONENT_UFOE, + DDP_COMPONENT_WDMA0, + DDP_COMPONENT_WDMA1, + DDP_COMPONENT_ID_MAX, +}; + void mtk_mmsys_ddp_connect(struct device *dev, enum mtk_ddp_comp_id cur, enum mtk_ddp_comp_id next); -- cgit From 206e7383b34316cf56cdde96eab9d97e9a1dbd70 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Thu, 26 Nov 2020 11:20:35 +0100 Subject: bus: mhi: core: Indexed MHI controller name Today the MHI controller name is simply cloned from the underlying bus device (its parent), which gives the following device structure for e.g. an MHI/PCI controller: devices/pci0000:00/0000:00:01.2/0000:02:00.0/0000:02:00.0 devices/pci0000:00/0000:00:01.2/0000:02:00.0/0000:02:00.0/0000:02:00.0_IPCR ... That's quite misleading/confusing and can cause device registration issues because of duplicate dev names (e.g. if a PCI device registers two different MHI instances). This patch changes MHI core to create indexed mhi controller names (mhi0, mhi1...) in the same way as other busses (i2c0, usb0...). The previous example becomes: devices/pci0000:00/0000:00:01.2/0000:02:00.0/mhi0 devices/pci0000:00/0000:00:01.2/0000:02:00.0/mhi0/mhi0_IPCR ... 
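The rename below follows the standard IDA pattern other buses use for their indexed names; a minimal generic sketch of that pattern (names are illustrative, not MHI code):

#include <linux/device.h>
#include <linux/idr.h>

static DEFINE_IDA(example_bus_ida);

/* Allocate the smallest free index and derive the device name from it. */
static int example_bus_name_device(struct device *dev)
{
	int id = ida_alloc(&example_bus_ida, GFP_KERNEL);

	if (id < 0)
		return id;
	return dev_set_name(dev, "example%d", id);	/* example0, example1, ... */
}

/* On unregister, return the index so it can be reused. */
static void example_bus_free_index(int id)
{
	ida_free(&example_bus_ida, id);
}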
Signed-off-by: Loic Poulain Reviewed-by: Jeffrey Hugo Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam --- drivers/bus/mhi/core/init.c | 18 ++++++++++++++++-- drivers/bus/mhi/core/main.c | 2 +- include/linux/mhi.h | 2 ++ 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c index 655d539c6808..1d6f7b6c1fcd 100644 --- a/drivers/bus/mhi/core/init.c +++ b/drivers/bus/mhi/core/init.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include "internal.h" +static DEFINE_IDA(mhi_controller_ida); + const char * const mhi_ee_str[MHI_EE_MAX] = { [MHI_EE_PBL] = "PBL", [MHI_EE_SBL] = "SBL", @@ -940,6 +943,12 @@ int mhi_register_controller(struct mhi_controller *mhi_cntrl, mhi_cntrl->minor_version = (soc_info & SOC_HW_VERSION_MINOR_VER_BMSK) >> SOC_HW_VERSION_MINOR_VER_SHFT; + mhi_cntrl->index = ida_alloc(&mhi_controller_ida, GFP_KERNEL); + if (mhi_cntrl->index < 0) { + ret = mhi_cntrl->index; + goto error_ida_alloc; + } + /* Register controller with MHI bus */ mhi_dev = mhi_alloc_device(mhi_cntrl); if (IS_ERR(mhi_dev)) { @@ -950,8 +959,8 @@ int mhi_register_controller(struct mhi_controller *mhi_cntrl, mhi_dev->dev_type = MHI_DEVICE_CONTROLLER; mhi_dev->mhi_cntrl = mhi_cntrl; - dev_set_name(&mhi_dev->dev, "%s", dev_name(mhi_cntrl->cntrl_dev)); - mhi_dev->name = dev_name(mhi_cntrl->cntrl_dev); + dev_set_name(&mhi_dev->dev, "mhi%d", mhi_cntrl->index); + mhi_dev->name = dev_name(&mhi_dev->dev); /* Init wakeup source */ device_init_wakeup(&mhi_dev->dev, true); @@ -970,6 +979,9 @@ error_add_dev: put_device(&mhi_dev->dev); error_alloc_dev: + ida_free(&mhi_controller_ida, mhi_cntrl->index); + +error_ida_alloc: kfree(mhi_cntrl->mhi_cmd); error_alloc_cmd: @@ -1004,6 +1016,8 @@ void mhi_unregister_controller(struct mhi_controller *mhi_cntrl) device_del(&mhi_dev->dev); put_device(&mhi_dev->dev); + + ida_free(&mhi_controller_ida, mhi_cntrl->index); } EXPORT_SYMBOL_GPL(mhi_unregister_controller); diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c index 4eb93d8bea1d..702c31b6aefa 100644 --- a/drivers/bus/mhi/core/main.c +++ b/drivers/bus/mhi/core/main.c @@ -331,7 +331,7 @@ void mhi_create_devices(struct mhi_controller *mhi_cntrl) /* Channel name is same for both UL and DL */ mhi_dev->name = mhi_chan->name; dev_set_name(&mhi_dev->dev, "%s_%s", - dev_name(mhi_cntrl->cntrl_dev), + dev_name(&mhi_cntrl->mhi_dev->dev), mhi_dev->name); /* Init wakeup source if available */ diff --git a/include/linux/mhi.h b/include/linux/mhi.h index d31efcf02ae7..aa9757e71f1f 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -348,6 +348,7 @@ struct mhi_controller_config { * @read_reg: Read a MHI register via the physical link (required) * @write_reg: Write a MHI register via the physical link (required) * @buffer_len: Bounce buffer length + * @index: Index of the MHI controller instance * @bounce_buf: Use of bounce buffer * @fbc_download: MHI host needs to do complete image transfer (optional) * @pre_init: MHI host needs to do pre-initialization before power up @@ -438,6 +439,7 @@ struct mhi_controller { u32 val); size_t buffer_len; + int index; bool bounce_buf; bool fbc_download; bool pre_init; -- cgit From 7776bcd241e08e13ef009926c6dea84dc3b2f8ff Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 3 Nov 2020 00:48:44 +0100 Subject: power: supply: s3c-adc-battery: Convert to GPIO descriptors This converts the S3C ADC battery to 
use GPIO descriptors instead of a global GPIO number for the charging completed GPIO. Using the pattern from the GPIO charger we name this GPIO line "charge-status" in the board file. Cc: linux-samsung-soc@vger.kernel.org Cc: Sergiy Kibrik Signed-off-by: Linus Walleij Signed-off-by: Sebastian Reichel --- arch/arm/mach-s3c/mach-h1940.c | 12 +++++-- arch/arm/mach-s3c/mach-rx1950.c | 11 ++++++- drivers/power/supply/s3c_adc_battery.c | 57 +++++++++++++++++----------------- include/linux/s3c_adc_battery.h | 3 -- 4 files changed, 49 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-s3c/mach-h1940.c b/arch/arm/mach-s3c/mach-h1940.c index 53d51aa83200..8a43ed1c4c4d 100644 --- a/arch/arm/mach-s3c/mach-h1940.c +++ b/arch/arm/mach-s3c/mach-h1940.c @@ -297,6 +297,15 @@ static const struct s3c_adc_bat_thresh bat_lut_acin[] = { { .volt = 3841, .cur = 0, .level = 0}, }; +static struct gpiod_lookup_table h1940_bat_gpio_table = { + .dev_id = "s3c-adc-battery", + .table = { + /* Charge status S3C2410_GPF(3) */ + GPIO_LOOKUP("GPIOF", 3, "charge-status", GPIO_ACTIVE_LOW), + { }, + }, +}; + static int h1940_bat_init(void) { int ret; @@ -330,8 +339,6 @@ static struct s3c_adc_bat_pdata h1940_bat_cfg = { .exit = h1940_bat_exit, .enable_charger = h1940_enable_charger, .disable_charger = h1940_disable_charger, - .gpio_charge_finished = S3C2410_GPF(3), - .gpio_inverted = 1, .lut_noac = bat_lut_noac, .lut_noac_cnt = ARRAY_SIZE(bat_lut_noac), .lut_acin = bat_lut_acin, @@ -720,6 +727,7 @@ static void __init h1940_init(void) s3c24xx_fb_set_platdata(&h1940_fb_info); gpiod_add_lookup_table(&h1940_mmc_gpio_table); gpiod_add_lookup_table(&h1940_audio_gpio_table); + gpiod_add_lookup_table(&h1940_bat_gpio_table); /* Configure the I2S pins (GPE0...GPE4) in correct mode */ s3c_gpio_cfgall_range(S3C2410_GPE(0), 5, S3C_GPIO_SFN(2), S3C_GPIO_PULL_NONE); diff --git a/arch/arm/mach-s3c/mach-rx1950.c b/arch/arm/mach-s3c/mach-rx1950.c index b9758f0a9a14..6e19add158a9 100644 --- a/arch/arm/mach-s3c/mach-rx1950.c +++ b/arch/arm/mach-s3c/mach-rx1950.c @@ -206,6 +206,15 @@ static const struct s3c_adc_bat_thresh bat_lut_acin[] = { { .volt = 3820, .cur = 0, .level = 0}, }; +static struct gpiod_lookup_table rx1950_bat_gpio_table = { + .dev_id = "s3c-adc-battery", + .table = { + /* Charge status S3C2410_GPF(3) */ + GPIO_LOOKUP("GPIOF", 3, "charge-status", GPIO_ACTIVE_HIGH), + { }, + }, +}; + static int rx1950_bat_init(void) { int ret; @@ -331,7 +340,6 @@ static struct s3c_adc_bat_pdata rx1950_bat_cfg = { .exit = rx1950_bat_exit, .enable_charger = rx1950_enable_charger, .disable_charger = rx1950_disable_charger, - .gpio_charge_finished = S3C2410_GPF(3), .lut_noac = bat_lut_noac, .lut_noac_cnt = ARRAY_SIZE(bat_lut_noac), .lut_acin = bat_lut_acin, @@ -840,6 +848,7 @@ static void __init rx1950_init_machine(void) pwm_add_table(rx1950_pwm_lookup, ARRAY_SIZE(rx1950_pwm_lookup)); gpiod_add_lookup_table(&rx1950_audio_gpio_table); + gpiod_add_lookup_table(&rx1950_bat_gpio_table); /* Configure the I2S pins (GPE0...GPE4) in correct mode */ s3c_gpio_cfgall_range(S3C2410_GPE(0), 5, S3C_GPIO_SFN(2), S3C_GPIO_PULL_NONE); diff --git a/drivers/power/supply/s3c_adc_battery.c b/drivers/power/supply/s3c_adc_battery.c index 60b7f41ab063..a2addc24ee8b 100644 --- a/drivers/power/supply/s3c_adc_battery.c +++ b/drivers/power/supply/s3c_adc_battery.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -31,6 +31,7 @@ struct s3c_adc_bat { struct power_supply *psy; struct s3c_adc_client 
*client; struct s3c_adc_bat_pdata *pdata; + struct gpio_desc *charge_finished; int volt_value; int cur_value; unsigned int timestamp; @@ -132,9 +133,7 @@ static int calc_full_volt(int volt_val, int cur_val, int impedance) static int charge_finished(struct s3c_adc_bat *bat) { - return bat->pdata->gpio_inverted ? - !gpio_get_value(bat->pdata->gpio_charge_finished) : - gpio_get_value(bat->pdata->gpio_charge_finished); + return gpiod_get_value(bat->charge_finished); } static int s3c_adc_bat_get_property(struct power_supply *psy, @@ -169,7 +168,7 @@ static int s3c_adc_bat_get_property(struct power_supply *psy, } if (bat->cable_plugged && - ((bat->pdata->gpio_charge_finished < 0) || + (!bat->charge_finished || !charge_finished(bat))) { lut = bat->pdata->lut_acin; lut_size = bat->pdata->lut_acin_cnt; @@ -206,7 +205,7 @@ static int s3c_adc_bat_get_property(struct power_supply *psy, switch (psp) { case POWER_SUPPLY_PROP_STATUS: - if (bat->pdata->gpio_charge_finished < 0) + if (!bat->charge_finished) val->intval = bat->level == 100000 ? POWER_SUPPLY_STATUS_FULL : bat->status; else @@ -265,7 +264,7 @@ static void s3c_adc_bat_work(struct work_struct *work) bat->status = POWER_SUPPLY_STATUS_DISCHARGING; } } else { - if ((bat->pdata->gpio_charge_finished >= 0) && is_plugged) { + if (bat->charge_finished && is_plugged) { is_charged = charge_finished(&main_bat); if (is_charged) { if (bat->pdata->disable_charger) @@ -294,6 +293,7 @@ static int s3c_adc_bat_probe(struct platform_device *pdev) struct s3c_adc_client *client; struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data; struct power_supply_config psy_cfg = {}; + struct gpio_desc *gpiod; int ret; client = s3c_adc_register(pdev, NULL, NULL, 0); @@ -304,8 +304,17 @@ static int s3c_adc_bat_probe(struct platform_device *pdev) platform_set_drvdata(pdev, client); + gpiod = devm_gpiod_get_optional(&pdev->dev, "charge-status", GPIOD_IN); + if (IS_ERR(gpiod)) { + /* Could be probe deferral etc */ + ret = PTR_ERR(gpiod); + dev_err(&pdev->dev, "no GPIO %d\n", ret); + return ret; + } + main_bat.client = client; main_bat.pdata = pdata; + main_bat.charge_finished = gpiod; main_bat.volt_value = -1; main_bat.cur_value = -1; main_bat.cable_plugged = 0; @@ -323,6 +332,7 @@ static int s3c_adc_bat_probe(struct platform_device *pdev) backup_bat.client = client; backup_bat.pdata = pdev->dev.platform_data; + backup_bat.charge_finished = gpiod; backup_bat.volt_value = -1; backup_bat.psy = power_supply_register(&pdev->dev, &backup_bat_desc, @@ -335,12 +345,8 @@ static int s3c_adc_bat_probe(struct platform_device *pdev) INIT_DELAYED_WORK(&bat_work, s3c_adc_bat_work); - if (pdata->gpio_charge_finished >= 0) { - ret = gpio_request(pdata->gpio_charge_finished, "charged"); - if (ret) - goto err_gpio; - - ret = request_irq(gpio_to_irq(pdata->gpio_charge_finished), + if (gpiod) { + ret = request_irq(gpiod_to_irq(gpiod), s3c_adc_bat_charged, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, "battery charged", NULL); @@ -364,12 +370,9 @@ static int s3c_adc_bat_probe(struct platform_device *pdev) return 0; err_platform: - if (pdata->gpio_charge_finished >= 0) - free_irq(gpio_to_irq(pdata->gpio_charge_finished), NULL); + if (gpiod) + free_irq(gpiod_to_irq(gpiod), NULL); err_irq: - if (pdata->gpio_charge_finished >= 0) - gpio_free(pdata->gpio_charge_finished); -err_gpio: if (pdata->backup_volt_mult) power_supply_unregister(backup_bat.psy); err_reg_backup: @@ -389,10 +392,8 @@ static int s3c_adc_bat_remove(struct platform_device *pdev) s3c_adc_release(client); - if 
(pdata->gpio_charge_finished >= 0) { - free_irq(gpio_to_irq(pdata->gpio_charge_finished), NULL); - gpio_free(pdata->gpio_charge_finished); - } + if (main_bat.charge_finished) + free_irq(gpiod_to_irq(main_bat.charge_finished), NULL); cancel_delayed_work(&bat_work); @@ -408,12 +409,12 @@ static int s3c_adc_bat_suspend(struct platform_device *pdev, { struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data; - if (pdata->gpio_charge_finished >= 0) { + if (main_bat.charge_finished) { if (device_may_wakeup(&pdev->dev)) enable_irq_wake( - gpio_to_irq(pdata->gpio_charge_finished)); + gpiod_to_irq(main_bat.charge_finished)); else { - disable_irq(gpio_to_irq(pdata->gpio_charge_finished)); + disable_irq(gpiod_to_irq(main_bat.charge_finished)); main_bat.pdata->disable_charger(); } } @@ -425,12 +426,12 @@ static int s3c_adc_bat_resume(struct platform_device *pdev) { struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data; - if (pdata->gpio_charge_finished >= 0) { + if (main_bat.charge_finished) { if (device_may_wakeup(&pdev->dev)) disable_irq_wake( - gpio_to_irq(pdata->gpio_charge_finished)); + gpiod_to_irq(main_bat.charge_finished)); else - enable_irq(gpio_to_irq(pdata->gpio_charge_finished)); + enable_irq(gpiod_to_irq(main_bat.charge_finished)); } /* Schedule timer to check current status */ diff --git a/include/linux/s3c_adc_battery.h b/include/linux/s3c_adc_battery.h index 833871dcf6fd..57f982c375f8 100644 --- a/include/linux/s3c_adc_battery.h +++ b/include/linux/s3c_adc_battery.h @@ -14,9 +14,6 @@ struct s3c_adc_bat_pdata { void (*enable_charger)(void); void (*disable_charger)(void); - int gpio_charge_finished; - int gpio_inverted; - const struct s3c_adc_bat_thresh *lut_noac; unsigned int lut_noac_cnt; const struct s3c_adc_bat_thresh *lut_acin; -- cgit From b0327ffb133fb2148fc3bc2afb39af2871ab21cb Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 30 Oct 2020 13:24:24 +0100 Subject: power: supply: generic-adc-battery: Use GPIO descriptors This driver uses platform data to pass GPIO lines using the deprecated global GPIO numbers. There are no in-tree users of this platform data. Any out-of-tree or coming users of this driver can easily be migrated to use machine descriptor tables as described in Documentation/driver-api/gpio/board.rst section "platform data". Cc: Anish Kumar Cc: H. 
Nikolaus Schaller Signed-off-by: Linus Walleij Signed-off-by: Sebastian Reichel --- drivers/power/supply/generic-adc-battery.c | 31 +++++++++++------------------- include/linux/power/generic-adc-battery.h | 4 ---- 2 files changed, 11 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/supply/generic-adc-battery.c b/drivers/power/supply/generic-adc-battery.c index caa829738ef7..0032069fbc2b 100644 --- a/drivers/power/supply/generic-adc-battery.c +++ b/drivers/power/supply/generic-adc-battery.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -52,6 +52,7 @@ struct gab { int level; int status; bool cable_plugged; + struct gpio_desc *charge_finished; }; static struct gab *to_generic_bat(struct power_supply *psy) @@ -91,13 +92,9 @@ static const enum power_supply_property gab_dyn_props[] = { static bool gab_charge_finished(struct gab *adc_bat) { - struct gab_platform_data *pdata = adc_bat->pdata; - bool ret = gpio_get_value(pdata->gpio_charge_finished); - bool inv = pdata->gpio_inverted; - - if (!gpio_is_valid(pdata->gpio_charge_finished)) + if (!adc_bat->charge_finished) return false; - return ret ^ inv; + return gpiod_get_value(adc_bat->charge_finished); } static int gab_get_status(struct gab *adc_bat) @@ -327,18 +324,17 @@ static int gab_probe(struct platform_device *pdev) INIT_DELAYED_WORK(&adc_bat->bat_work, gab_work); - if (gpio_is_valid(pdata->gpio_charge_finished)) { + adc_bat->charge_finished = devm_gpiod_get_optional(&pdev->dev, + "charged", GPIOD_IN); + if (adc_bat->charge_finished) { int irq; - ret = gpio_request(pdata->gpio_charge_finished, "charged"); - if (ret) - goto gpio_req_fail; - irq = gpio_to_irq(pdata->gpio_charge_finished); + irq = gpiod_to_irq(adc_bat->charge_finished); ret = request_any_context_irq(irq, gab_charged, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, "battery charged", adc_bat); if (ret < 0) - goto err_gpio; + goto gpio_req_fail; } platform_set_drvdata(pdev, adc_bat); @@ -348,8 +344,6 @@ static int gab_probe(struct platform_device *pdev) msecs_to_jiffies(0)); return 0; -err_gpio: - gpio_free(pdata->gpio_charge_finished); gpio_req_fail: power_supply_unregister(adc_bat->psy); err_reg_fail: @@ -367,14 +361,11 @@ static int gab_remove(struct platform_device *pdev) { int chan; struct gab *adc_bat = platform_get_drvdata(pdev); - struct gab_platform_data *pdata = adc_bat->pdata; power_supply_unregister(adc_bat->psy); - if (gpio_is_valid(pdata->gpio_charge_finished)) { - free_irq(gpio_to_irq(pdata->gpio_charge_finished), adc_bat); - gpio_free(pdata->gpio_charge_finished); - } + if (adc_bat->charge_finished) + free_irq(gpiod_to_irq(adc_bat->charge_finished), adc_bat); for (chan = 0; chan < ARRAY_SIZE(gab_chan_name); chan++) { if (adc_bat->channel[chan]) diff --git a/include/linux/power/generic-adc-battery.h b/include/linux/power/generic-adc-battery.h index 40f9c7628f7b..c68cbf34cd34 100644 --- a/include/linux/power/generic-adc-battery.h +++ b/include/linux/power/generic-adc-battery.h @@ -11,16 +11,12 @@ * @battery_info: recommended structure to specify static power supply * parameters * @cal_charge: calculate charge level. - * @gpio_charge_finished: gpio for the charger. - * @gpio_inverted: Should be 1 if the GPIO is active low otherwise 0 * @jitter_delay: delay required after the interrupt to check battery * status.Default set is 10ms. 
*/ struct gab_platform_data { struct power_supply_info battery_info; int (*cal_charge)(long value); - int gpio_charge_finished; - bool gpio_inverted; int jitter_delay; }; -- cgit From 51e7bf4534da678da27c0f51e7ff21804fae88ca Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:05 +0200 Subject: mtd: nand: ecc: Add an I/O request tweaking mechanism Currently, BCH and Hamming engine are sharing the same tweaking/restoring I/O mechanism: they need the I/O request to fully cover the main/OOB area. Let's make this code generic as sharing the code between two drivers is already a win. Maybe other ECC engine drivers will need it too. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-2-miquel.raynal@bootlin.com --- drivers/mtd/nand/ecc.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 32 ++++++++++++++ 2 files changed, 138 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/ecc.c b/drivers/mtd/nand/ecc.c index 4a56e6c0da67..541137736fd3 100644 --- a/drivers/mtd/nand/ecc.c +++ b/drivers/mtd/nand/ecc.c @@ -95,6 +95,7 @@ #include #include +#include /** * nand_ecc_init_ctx - Init the ECC engine context @@ -479,6 +480,111 @@ bool nand_ecc_is_strong_enough(struct nand_device *nand) } EXPORT_SYMBOL(nand_ecc_is_strong_enough); +/* ECC engine driver internal helpers */ +int nand_ecc_init_req_tweaking(struct nand_ecc_req_tweak_ctx *ctx, + struct nand_device *nand) +{ + unsigned int total_buffer_size; + + ctx->nand = nand; + + /* Let the user decide the exact length of each buffer */ + if (!ctx->page_buffer_size) + ctx->page_buffer_size = nanddev_page_size(nand); + if (!ctx->oob_buffer_size) + ctx->oob_buffer_size = nanddev_per_page_oobsize(nand); + + total_buffer_size = ctx->page_buffer_size + ctx->oob_buffer_size; + + ctx->spare_databuf = kzalloc(total_buffer_size, GFP_KERNEL); + if (!ctx->spare_databuf) + return -ENOMEM; + + ctx->spare_oobbuf = ctx->spare_databuf + ctx->page_buffer_size; + + return 0; +} +EXPORT_SYMBOL_GPL(nand_ecc_init_req_tweaking); + +void nand_ecc_cleanup_req_tweaking(struct nand_ecc_req_tweak_ctx *ctx) +{ + kfree(ctx->spare_databuf); +} +EXPORT_SYMBOL_GPL(nand_ecc_cleanup_req_tweaking); + +/* + * Ensure data and OOB area is fully read/written otherwise the correction might + * not work as expected. 
+ */ +void nand_ecc_tweak_req(struct nand_ecc_req_tweak_ctx *ctx, + struct nand_page_io_req *req) +{ + struct nand_device *nand = ctx->nand; + struct nand_page_io_req *orig, *tweak; + + /* Save the original request */ + ctx->orig_req = *req; + ctx->bounce_data = false; + ctx->bounce_oob = false; + orig = &ctx->orig_req; + tweak = req; + + /* Ensure the request covers the entire page */ + if (orig->datalen < nanddev_page_size(nand)) { + ctx->bounce_data = true; + tweak->dataoffs = 0; + tweak->datalen = nanddev_page_size(nand); + tweak->databuf.in = ctx->spare_databuf; + memset(tweak->databuf.in, 0xFF, ctx->page_buffer_size); + } + + if (orig->ooblen < nanddev_per_page_oobsize(nand)) { + ctx->bounce_oob = true; + tweak->ooboffs = 0; + tweak->ooblen = nanddev_per_page_oobsize(nand); + tweak->oobbuf.in = ctx->spare_oobbuf; + memset(tweak->oobbuf.in, 0xFF, ctx->oob_buffer_size); + } + + /* Copy the data that must be writen in the bounce buffers, if needed */ + if (orig->type == NAND_PAGE_WRITE) { + if (ctx->bounce_data) + memcpy((void *)tweak->databuf.out + orig->dataoffs, + orig->databuf.out, orig->datalen); + + if (ctx->bounce_oob) + memcpy((void *)tweak->oobbuf.out + orig->ooboffs, + orig->oobbuf.out, orig->ooblen); + } +} +EXPORT_SYMBOL_GPL(nand_ecc_tweak_req); + +void nand_ecc_restore_req(struct nand_ecc_req_tweak_ctx *ctx, + struct nand_page_io_req *req) +{ + struct nand_page_io_req *orig, *tweak; + + orig = &ctx->orig_req; + tweak = req; + + /* Restore the data read from the bounce buffers, if needed */ + if (orig->type == NAND_PAGE_READ) { + if (ctx->bounce_data) + memcpy(orig->databuf.in, + tweak->databuf.in + orig->dataoffs, + orig->datalen); + + if (ctx->bounce_oob) + memcpy(orig->oobbuf.in, + tweak->oobbuf.in + orig->ooboffs, + orig->ooblen); + } + + /* Ensure the original request is restored */ + *req = *orig; +} +EXPORT_SYMBOL_GPL(nand_ecc_restore_req); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Miquel Raynal "); MODULE_DESCRIPTION("Generic ECC engine"); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 697ea2474a7c..36e4fe08d0ea 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -278,6 +278,38 @@ int nand_ecc_finish_io_req(struct nand_device *nand, struct nand_page_io_req *req); bool nand_ecc_is_strong_enough(struct nand_device *nand); +/** + * struct nand_ecc_req_tweak_ctx - Help for automatically tweaking requests + * @orig_req: Pointer to the original IO request + * @nand: Related NAND device, to have access to its memory organization + * @page_buffer_size: Real size of the page buffer to use (can be set by the + * user before the tweaking mechanism initialization) + * @oob_buffer_size: Real size of the OOB buffer to use (can be set by the + * user before the tweaking mechanism initialization) + * @spare_databuf: Data bounce buffer + * @spare_oobbuf: OOB bounce buffer + * @bounce_data: Flag indicating a data bounce buffer is used + * @bounce_oob: Flag indicating an OOB bounce buffer is used + */ +struct nand_ecc_req_tweak_ctx { + struct nand_page_io_req orig_req; + struct nand_device *nand; + unsigned int page_buffer_size; + unsigned int oob_buffer_size; + void *spare_databuf; + void *spare_oobbuf; + bool bounce_data; + bool bounce_oob; +}; + +int nand_ecc_init_req_tweaking(struct nand_ecc_req_tweak_ctx *ctx, + struct nand_device *nand); +void nand_ecc_cleanup_req_tweaking(struct nand_ecc_req_tweak_ctx *ctx); +void nand_ecc_tweak_req(struct nand_ecc_req_tweak_ctx *ctx, + struct nand_page_io_req *req); +void nand_ecc_restore_req(struct 
nand_ecc_req_tweak_ctx *ctx, + struct nand_page_io_req *req); + /** * struct nand_ecc - Information relative to the ECC * @defaults: Default values, depend on the underlying subsystem -- cgit From cdbe8df5e28e452c232c0c16b205edfd390d28e5 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:06 +0200 Subject: mtd: nand: ecc-bch: Move BCH code to the generic NAND layer BCH ECC code might be later re-used by the SPI NAND layer. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-3-miquel.raynal@bootlin.com --- drivers/mtd/nand/Kconfig | 11 ++ drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/ecc-sw-bch.c | 219 ++++++++++++++++++++++++++++++++++++ drivers/mtd/nand/raw/Kconfig | 10 -- drivers/mtd/nand/raw/Makefile | 1 - drivers/mtd/nand/raw/nand_base.c | 2 +- drivers/mtd/nand/raw/nand_bch.c | 219 ------------------------------------ drivers/mtd/nand/raw/nandsim.c | 2 +- drivers/mtd/nand/raw/omap2.c | 2 +- include/linux/mtd/nand-ecc-sw-bch.h | 66 +++++++++++ include/linux/mtd/nand_bch.h | 66 ----------- 11 files changed, 300 insertions(+), 299 deletions(-) create mode 100644 drivers/mtd/nand/ecc-sw-bch.c delete mode 100644 drivers/mtd/nand/raw/nand_bch.c create mode 100644 include/linux/mtd/nand-ecc-sw-bch.h delete mode 100644 include/linux/mtd/nand_bch.h (limited to 'include/linux') diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 4a9aed4f0104..55c17fb0dee1 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -15,6 +15,17 @@ config MTD_NAND_ECC bool depends on MTD_NAND_CORE +config MTD_NAND_ECC_SW_BCH + bool "Software BCH ECC engine" + select BCH + select MTD_NAND_ECC + default n + help + This enables support for software BCH error correction. Binary BCH + codes are more powerful and cpu intensive than traditional Hamming + ECC codes. They are used with NAND devices requiring more than 1 bit + of error correction. + endmenu endmenu diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index 981372953b56..c7179ff23753 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -8,3 +8,4 @@ obj-y += raw/ obj-y += spi/ nandcore-$(CONFIG_MTD_NAND_ECC) += ecc.o +nandcore-$(CONFIG_MTD_NAND_ECC_SW_BCH) += ecc-sw-bch.o diff --git a/drivers/mtd/nand/ecc-sw-bch.c b/drivers/mtd/nand/ecc-sw-bch.c new file mode 100644 index 000000000000..6da14d0263d9 --- /dev/null +++ b/drivers/mtd/nand/ecc-sw-bch.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file provides ECC correction for more than 1 bit per block of data, + * using binary BCH codes. It relies on the generic BCH library lib/bch.c. 
+ * + * Copyright © 2011 Ivan Djelic + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * struct nand_bch_control - private NAND BCH control structure + * @bch: BCH control structure + * @errloc: error location array + * @eccmask: XOR ecc mask, allows erased pages to be decoded as valid + */ +struct nand_bch_control { + struct bch_control *bch; + unsigned int *errloc; + unsigned char *eccmask; +}; + +/** + * nand_bch_calculate_ecc - [NAND Interface] Calculate ECC for data block + * @chip: NAND chip object + * @buf: input buffer with raw data + * @code: output buffer with ECC + */ +int nand_bch_calculate_ecc(struct nand_chip *chip, const unsigned char *buf, + unsigned char *code) +{ + struct nand_bch_control *nbc = chip->ecc.priv; + unsigned int i; + + memset(code, 0, chip->ecc.bytes); + bch_encode(nbc->bch, buf, chip->ecc.size, code); + + /* apply mask so that an erased page is a valid codeword */ + for (i = 0; i < chip->ecc.bytes; i++) + code[i] ^= nbc->eccmask[i]; + + return 0; +} +EXPORT_SYMBOL(nand_bch_calculate_ecc); + +/** + * nand_bch_correct_data - [NAND Interface] Detect and correct bit error(s) + * @chip: NAND chip object + * @buf: raw data read from the chip + * @read_ecc: ECC from the chip + * @calc_ecc: the ECC calculated from raw data + * + * Detect and correct bit errors for a data byte block + */ +int nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc) +{ + struct nand_bch_control *nbc = chip->ecc.priv; + unsigned int *errloc = nbc->errloc; + int i, count; + + count = bch_decode(nbc->bch, NULL, chip->ecc.size, read_ecc, calc_ecc, + NULL, errloc); + if (count > 0) { + for (i = 0; i < count; i++) { + if (errloc[i] < (chip->ecc.size*8)) + /* error is located in data, correct it */ + buf[errloc[i] >> 3] ^= (1 << (errloc[i] & 7)); + /* else error in ecc, no action needed */ + + pr_debug("%s: corrected bitflip %u\n", __func__, + errloc[i]); + } + } else if (count < 0) { + pr_err("ecc unrecoverable error\n"); + count = -EBADMSG; + } + return count; +} +EXPORT_SYMBOL(nand_bch_correct_data); + +/** + * nand_bch_init - [NAND Interface] Initialize NAND BCH error correction + * @mtd: MTD block structure + * + * Returns: + * a pointer to a new NAND BCH control structure, or NULL upon failure + * + * Initialize NAND BCH error correction. Parameters @eccsize and @eccbytes + * are used to compute BCH parameters m (Galois field order) and t (error + * correction capability). @eccbytes should be equal to the number of bytes + * required to store m*t bits, where m is such that 2^m-1 > @eccsize*8. 
+ * + * Example: to configure 4 bit correction per 512 bytes, you should pass + * @eccsize = 512 (thus, m=13 is the smallest integer such that 2^m-1 > 512*8) + * @eccbytes = 7 (7 bytes are required to store m*t = 13*4 = 52 bits) + */ +struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) +{ + struct nand_chip *nand = mtd_to_nand(mtd); + unsigned int m, t, eccsteps, i; + struct nand_bch_control *nbc = NULL; + unsigned char *erased_page; + unsigned int eccsize = nand->ecc.size; + unsigned int eccbytes = nand->ecc.bytes; + unsigned int eccstrength = nand->ecc.strength; + + if (!eccbytes && eccstrength) { + eccbytes = DIV_ROUND_UP(eccstrength * fls(8 * eccsize), 8); + nand->ecc.bytes = eccbytes; + } + + if (!eccsize || !eccbytes) { + pr_warn("ecc parameters not supplied\n"); + goto fail; + } + + m = fls(1+8*eccsize); + t = (eccbytes*8)/m; + + nbc = kzalloc(sizeof(*nbc), GFP_KERNEL); + if (!nbc) + goto fail; + + nbc->bch = bch_init(m, t, 0, false); + if (!nbc->bch) + goto fail; + + /* verify that eccbytes has the expected value */ + if (nbc->bch->ecc_bytes != eccbytes) { + pr_warn("invalid eccbytes %u, should be %u\n", + eccbytes, nbc->bch->ecc_bytes); + goto fail; + } + + eccsteps = mtd->writesize/eccsize; + + /* Check that we have an oob layout description. */ + if (!mtd->ooblayout) { + pr_warn("missing oob scheme"); + goto fail; + } + + /* sanity checks */ + if (8*(eccsize+eccbytes) >= (1 << m)) { + pr_warn("eccsize %u is too large\n", eccsize); + goto fail; + } + + /* + * ecc->steps and ecc->total might be used by mtd->ooblayout->ecc(), + * which is called by mtd_ooblayout_count_eccbytes(). + * Make sure they are properly initialized before calling + * mtd_ooblayout_count_eccbytes(). + * FIXME: we should probably rework the sequencing in nand_scan_tail() + * to avoid setting those fields twice. 
+ */ + nand->ecc.steps = eccsteps; + nand->ecc.total = eccsteps * eccbytes; + nand->base.ecc.ctx.total = nand->ecc.total; + if (mtd_ooblayout_count_eccbytes(mtd) != (eccsteps*eccbytes)) { + pr_warn("invalid ecc layout\n"); + goto fail; + } + + nbc->eccmask = kzalloc(eccbytes, GFP_KERNEL); + nbc->errloc = kmalloc_array(t, sizeof(*nbc->errloc), GFP_KERNEL); + if (!nbc->eccmask || !nbc->errloc) + goto fail; + /* + * compute and store the inverted ecc of an erased ecc block + */ + erased_page = kmalloc(eccsize, GFP_KERNEL); + if (!erased_page) + goto fail; + + memset(erased_page, 0xff, eccsize); + bch_encode(nbc->bch, erased_page, eccsize, nbc->eccmask); + kfree(erased_page); + + for (i = 0; i < eccbytes; i++) + nbc->eccmask[i] ^= 0xff; + + if (!eccstrength) + nand->ecc.strength = (eccbytes * 8) / fls(8 * eccsize); + + return nbc; +fail: + nand_bch_free(nbc); + return NULL; +} +EXPORT_SYMBOL(nand_bch_init); + +/** + * nand_bch_free - [NAND Interface] Release NAND BCH ECC resources + * @nbc: NAND BCH control structure + */ +void nand_bch_free(struct nand_bch_control *nbc) +{ + if (nbc) { + bch_free(nbc->bch); + kfree(nbc->errloc); + kfree(nbc->eccmask); + kfree(nbc); + } +} +EXPORT_SYMBOL(nand_bch_free); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ivan Djelic "); +MODULE_DESCRIPTION("NAND software BCH ECC support"); diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index 6c46f25b57e2..b73860aa77c6 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -22,16 +22,6 @@ menuconfig MTD_RAW_NAND if MTD_RAW_NAND -config MTD_NAND_ECC_SW_BCH - bool "Support software BCH ECC" - select BCH - default n - help - This enables support for software BCH error correction. Binary BCH - codes are more powerful and cpu intensive than traditional Hamming - ECC codes. They are used with NAND devices requiring more than 1 bit - of error correction. - comment "Raw/parallel NAND flash controllers" config MTD_NAND_DENALI diff --git a/drivers/mtd/nand/raw/Makefile b/drivers/mtd/nand/raw/Makefile index 2930f5b9015d..76904305d091 100644 --- a/drivers/mtd/nand/raw/Makefile +++ b/drivers/mtd/nand/raw/Makefile @@ -2,7 +2,6 @@ obj-$(CONFIG_MTD_RAW_NAND) += nand.o obj-$(CONFIG_MTD_NAND_ECC_SW_HAMMING) += nand_ecc.o -nand-$(CONFIG_MTD_NAND_ECC_SW_BCH) += nand_bch.o obj-$(CONFIG_MTD_SM_COMMON) += sm_common.o obj-$(CONFIG_MTD_NAND_CAFE) += cafe_nand.o diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 1f0d542d5923..7edcf08a2e81 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/nand_bch.c b/drivers/mtd/nand/raw/nand_bch.c deleted file mode 100644 index 9d19ac14c196..000000000000 --- a/drivers/mtd/nand/raw/nand_bch.c +++ /dev/null @@ -1,219 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * This file provides ECC correction for more than 1 bit per block of data, - * using binary BCH codes. It relies on the generic BCH library lib/bch.c. 
- * - * Copyright © 2011 Ivan Djelic - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * struct nand_bch_control - private NAND BCH control structure - * @bch: BCH control structure - * @errloc: error location array - * @eccmask: XOR ecc mask, allows erased pages to be decoded as valid - */ -struct nand_bch_control { - struct bch_control *bch; - unsigned int *errloc; - unsigned char *eccmask; -}; - -/** - * nand_bch_calculate_ecc - [NAND Interface] Calculate ECC for data block - * @chip: NAND chip object - * @buf: input buffer with raw data - * @code: output buffer with ECC - */ -int nand_bch_calculate_ecc(struct nand_chip *chip, const unsigned char *buf, - unsigned char *code) -{ - struct nand_bch_control *nbc = chip->ecc.priv; - unsigned int i; - - memset(code, 0, chip->ecc.bytes); - bch_encode(nbc->bch, buf, chip->ecc.size, code); - - /* apply mask so that an erased page is a valid codeword */ - for (i = 0; i < chip->ecc.bytes; i++) - code[i] ^= nbc->eccmask[i]; - - return 0; -} -EXPORT_SYMBOL(nand_bch_calculate_ecc); - -/** - * nand_bch_correct_data - [NAND Interface] Detect and correct bit error(s) - * @chip: NAND chip object - * @buf: raw data read from the chip - * @read_ecc: ECC from the chip - * @calc_ecc: the ECC calculated from raw data - * - * Detect and correct bit errors for a data byte block - */ -int nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf, - unsigned char *read_ecc, unsigned char *calc_ecc) -{ - struct nand_bch_control *nbc = chip->ecc.priv; - unsigned int *errloc = nbc->errloc; - int i, count; - - count = bch_decode(nbc->bch, NULL, chip->ecc.size, read_ecc, calc_ecc, - NULL, errloc); - if (count > 0) { - for (i = 0; i < count; i++) { - if (errloc[i] < (chip->ecc.size*8)) - /* error is located in data, correct it */ - buf[errloc[i] >> 3] ^= (1 << (errloc[i] & 7)); - /* else error in ecc, no action needed */ - - pr_debug("%s: corrected bitflip %u\n", __func__, - errloc[i]); - } - } else if (count < 0) { - pr_err("ecc unrecoverable error\n"); - count = -EBADMSG; - } - return count; -} -EXPORT_SYMBOL(nand_bch_correct_data); - -/** - * nand_bch_init - [NAND Interface] Initialize NAND BCH error correction - * @mtd: MTD block structure - * - * Returns: - * a pointer to a new NAND BCH control structure, or NULL upon failure - * - * Initialize NAND BCH error correction. Parameters @eccsize and @eccbytes - * are used to compute BCH parameters m (Galois field order) and t (error - * correction capability). @eccbytes should be equal to the number of bytes - * required to store m*t bits, where m is such that 2^m-1 > @eccsize*8. 
- * - * Example: to configure 4 bit correction per 512 bytes, you should pass - * @eccsize = 512 (thus, m=13 is the smallest integer such that 2^m-1 > 512*8) - * @eccbytes = 7 (7 bytes are required to store m*t = 13*4 = 52 bits) - */ -struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) -{ - struct nand_chip *nand = mtd_to_nand(mtd); - unsigned int m, t, eccsteps, i; - struct nand_bch_control *nbc = NULL; - unsigned char *erased_page; - unsigned int eccsize = nand->ecc.size; - unsigned int eccbytes = nand->ecc.bytes; - unsigned int eccstrength = nand->ecc.strength; - - if (!eccbytes && eccstrength) { - eccbytes = DIV_ROUND_UP(eccstrength * fls(8 * eccsize), 8); - nand->ecc.bytes = eccbytes; - } - - if (!eccsize || !eccbytes) { - pr_warn("ecc parameters not supplied\n"); - goto fail; - } - - m = fls(1+8*eccsize); - t = (eccbytes*8)/m; - - nbc = kzalloc(sizeof(*nbc), GFP_KERNEL); - if (!nbc) - goto fail; - - nbc->bch = bch_init(m, t, 0, false); - if (!nbc->bch) - goto fail; - - /* verify that eccbytes has the expected value */ - if (nbc->bch->ecc_bytes != eccbytes) { - pr_warn("invalid eccbytes %u, should be %u\n", - eccbytes, nbc->bch->ecc_bytes); - goto fail; - } - - eccsteps = mtd->writesize/eccsize; - - /* Check that we have an oob layout description. */ - if (!mtd->ooblayout) { - pr_warn("missing oob scheme"); - goto fail; - } - - /* sanity checks */ - if (8*(eccsize+eccbytes) >= (1 << m)) { - pr_warn("eccsize %u is too large\n", eccsize); - goto fail; - } - - /* - * ecc->steps and ecc->total might be used by mtd->ooblayout->ecc(), - * which is called by mtd_ooblayout_count_eccbytes(). - * Make sure they are properly initialized before calling - * mtd_ooblayout_count_eccbytes(). - * FIXME: we should probably rework the sequencing in nand_scan_tail() - * to avoid setting those fields twice. 
- */ - nand->ecc.steps = eccsteps; - nand->ecc.total = eccsteps * eccbytes; - nand->base.ecc.ctx.total = nand->ecc.total; - if (mtd_ooblayout_count_eccbytes(mtd) != (eccsteps*eccbytes)) { - pr_warn("invalid ecc layout\n"); - goto fail; - } - - nbc->eccmask = kzalloc(eccbytes, GFP_KERNEL); - nbc->errloc = kmalloc_array(t, sizeof(*nbc->errloc), GFP_KERNEL); - if (!nbc->eccmask || !nbc->errloc) - goto fail; - /* - * compute and store the inverted ecc of an erased ecc block - */ - erased_page = kmalloc(eccsize, GFP_KERNEL); - if (!erased_page) - goto fail; - - memset(erased_page, 0xff, eccsize); - bch_encode(nbc->bch, erased_page, eccsize, nbc->eccmask); - kfree(erased_page); - - for (i = 0; i < eccbytes; i++) - nbc->eccmask[i] ^= 0xff; - - if (!eccstrength) - nand->ecc.strength = (eccbytes * 8) / fls(8 * eccsize); - - return nbc; -fail: - nand_bch_free(nbc); - return NULL; -} -EXPORT_SYMBOL(nand_bch_init); - -/** - * nand_bch_free - [NAND Interface] Release NAND BCH ECC resources - * @nbc: NAND BCH control structure - */ -void nand_bch_free(struct nand_bch_control *nbc) -{ - if (nbc) { - bch_free(nbc->bch); - kfree(nbc->errloc); - kfree(nbc->eccmask); - kfree(nbc); - } -} -EXPORT_SYMBOL(nand_bch_free); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ivan Djelic "); -MODULE_DESCRIPTION("NAND software BCH ECC support"); diff --git a/drivers/mtd/nand/raw/nandsim.c b/drivers/mtd/nand/raw/nandsim.c index a8048cb8d220..9c940ead66fd 100644 --- a/drivers/mtd/nand/raw/nandsim.c +++ b/drivers/mtd/nand/raw/nandsim.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c index 512f60780a50..0ef209e1cd87 100644 --- a/drivers/mtd/nand/raw/omap2.c +++ b/drivers/mtd/nand/raw/omap2.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include diff --git a/include/linux/mtd/nand-ecc-sw-bch.h b/include/linux/mtd/nand-ecc-sw-bch.h new file mode 100644 index 000000000000..1e1ee3af82b1 --- /dev/null +++ b/include/linux/mtd/nand-ecc-sw-bch.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright © 2011 Ivan Djelic + * + * This file is the header for the NAND BCH ECC implementation. 
+ */ + +#ifndef __MTD_NAND_ECC_SW_BCH_H__ +#define __MTD_NAND_ECC_SW_BCH_H__ + +struct mtd_info; +struct nand_chip; +struct nand_bch_control; + +#if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH) + +static inline int mtd_nand_has_bch(void) { return 1; } + +/* + * Calculate BCH ecc code + */ +int nand_bch_calculate_ecc(struct nand_chip *chip, const u_char *dat, + u_char *ecc_code); + +/* + * Detect and correct bit errors + */ +int nand_bch_correct_data(struct nand_chip *chip, u_char *dat, + u_char *read_ecc, u_char *calc_ecc); +/* + * Initialize BCH encoder/decoder + */ +struct nand_bch_control *nand_bch_init(struct mtd_info *mtd); +/* + * Release BCH encoder/decoder resources + */ +void nand_bch_free(struct nand_bch_control *nbc); + +#else /* !CONFIG_MTD_NAND_ECC_SW_BCH */ + +static inline int mtd_nand_has_bch(void) { return 0; } + +static inline int +nand_bch_calculate_ecc(struct nand_chip *chip, const u_char *dat, + u_char *ecc_code) +{ + return -1; +} + +static inline int +nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc) +{ + return -ENOTSUPP; +} + +static inline struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) +{ + return NULL; +} + +static inline void nand_bch_free(struct nand_bch_control *nbc) {} + +#endif /* CONFIG_MTD_NAND_ECC_SW_BCH */ + +#endif /* __MTD_NAND_ECC_SW_BCH_H__ */ diff --git a/include/linux/mtd/nand_bch.h b/include/linux/mtd/nand_bch.h deleted file mode 100644 index d5956cc48ba9..000000000000 --- a/include/linux/mtd/nand_bch.h +++ /dev/null @@ -1,66 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright © 2011 Ivan Djelic - * - * This file is the header for the NAND BCH ECC implementation. - */ - -#ifndef __MTD_NAND_BCH_H__ -#define __MTD_NAND_BCH_H__ - -struct mtd_info; -struct nand_chip; -struct nand_bch_control; - -#if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH) - -static inline int mtd_nand_has_bch(void) { return 1; } - -/* - * Calculate BCH ecc code - */ -int nand_bch_calculate_ecc(struct nand_chip *chip, const u_char *dat, - u_char *ecc_code); - -/* - * Detect and correct bit errors - */ -int nand_bch_correct_data(struct nand_chip *chip, u_char *dat, - u_char *read_ecc, u_char *calc_ecc); -/* - * Initialize BCH encoder/decoder - */ -struct nand_bch_control *nand_bch_init(struct mtd_info *mtd); -/* - * Release BCH encoder/decoder resources - */ -void nand_bch_free(struct nand_bch_control *nbc); - -#else /* !CONFIG_MTD_NAND_ECC_SW_BCH */ - -static inline int mtd_nand_has_bch(void) { return 0; } - -static inline int -nand_bch_calculate_ecc(struct nand_chip *chip, const u_char *dat, - u_char *ecc_code) -{ - return -1; -} - -static inline int -nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf, - unsigned char *read_ecc, unsigned char *calc_ecc) -{ - return -ENOTSUPP; -} - -static inline struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) -{ - return NULL; -} - -static inline void nand_bch_free(struct nand_bch_control *nbc) {} - -#endif /* CONFIG_MTD_NAND_ECC_SW_BCH */ - -#endif /* __MTD_NAND_BCH_H__ */ -- cgit From 3c0fe36abebee55821badaa9d6cecd03799f7843 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:08 +0200 Subject: mtd: nand: ecc-bch: Stop exporting the private structure The NAND BCH control structure has nothing to do outside of this driver, all users of the nand_bch_init/free() functions just save it to chip->ecc.priv so do it in this driver directly and return a regular error code instead. 
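The resulting calling convention, reduced to its bones (hypothetical my_ecc_* names, purely illustrative):

  #include <linux/errno.h>
  #include <linux/mtd/rawnand.h>
  #include <linux/slab.h>

  struct my_ecc_ctrl {
          /* engine-private state, opaque to every caller */
          unsigned int *errloc;
  };

  /* init allocates and stashes the private data itself ... */
  static int my_ecc_init(struct nand_chip *chip)
  {
          struct my_ecc_ctrl *ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);

          if (!ctrl)
                  return -ENOMEM;

          chip->ecc.priv = ctrl;
          return 0;
  }

  /* ... and cleanup digs it back out, so no caller ever names the type */
  static void my_ecc_free(struct nand_chip *chip)
  {
          struct my_ecc_ctrl *ctrl = chip->ecc.priv;

          kfree(ctrl);
          chip->ecc.priv = NULL;
  }

Callers now test a plain int instead of a NULL pointer, which also lets the engine forward a meaningful errno (-ENOMEM rather than a blanket -EINVAL), as the diff below does.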
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-5-miquel.raynal@bootlin.com --- drivers/mtd/nand/ecc-sw-bch.c | 38 ++++++++++++++++++++----------------- drivers/mtd/nand/raw/fsmc_nand.c | 2 +- drivers/mtd/nand/raw/nand_base.c | 12 +++++++----- drivers/mtd/nand/raw/omap2.c | 16 ++++++++-------- include/linux/mtd/nand-ecc-sw-bch.h | 11 +++++------ 5 files changed, 42 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/ecc-sw-bch.c b/drivers/mtd/nand/ecc-sw-bch.c index f98a03e8efae..b6bfee9805ae 100644 --- a/drivers/mtd/nand/ecc-sw-bch.c +++ b/drivers/mtd/nand/ecc-sw-bch.c @@ -90,7 +90,7 @@ EXPORT_SYMBOL(nand_bch_correct_data); /** * nand_bch_init - Initialize software BCH ECC engine - * @mtd: MTD device + * @chip: NAND chip object * * Returns: a pointer to a new NAND BCH control structure, or NULL upon failure * @@ -105,24 +105,24 @@ EXPORT_SYMBOL(nand_bch_correct_data); * @eccsize = 512 (thus, m = 13 is the smallest integer such that 2^m - 1 > 512 * 8) * @eccbytes = 7 (7 bytes are required to store m * t = 13 * 4 = 52 bits) */ -struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) +int nand_bch_init(struct nand_chip *chip) { - struct nand_chip *nand = mtd_to_nand(mtd); + struct mtd_info *mtd = nand_to_mtd(chip); unsigned int m, t, eccsteps, i; struct nand_bch_control *nbc = NULL; unsigned char *erased_page; - unsigned int eccsize = nand->ecc.size; - unsigned int eccbytes = nand->ecc.bytes; - unsigned int eccstrength = nand->ecc.strength; + unsigned int eccsize = chip->ecc.size; + unsigned int eccbytes = chip->ecc.bytes; + unsigned int eccstrength = chip->ecc.strength; if (!eccbytes && eccstrength) { eccbytes = DIV_ROUND_UP(eccstrength * fls(8 * eccsize), 8); - nand->ecc.bytes = eccbytes; + chip->ecc.bytes = eccbytes; } if (!eccsize || !eccbytes) { pr_warn("ecc parameters not supplied\n"); - goto fail; + return -EINVAL; } m = fls(1+8*eccsize); @@ -130,7 +130,9 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) nbc = kzalloc(sizeof(*nbc), GFP_KERNEL); if (!nbc) - goto fail; + return -ENOMEM; + + chip->ecc.priv = nbc; nbc->bch = bch_init(m, t, 0, false); if (!nbc->bch) @@ -165,9 +167,9 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) * FIXME: we should probably rework the sequencing in nand_scan_tail() * to avoid setting those fields twice. 
*/ - nand->ecc.steps = eccsteps; - nand->ecc.total = eccsteps * eccbytes; - nand->base.ecc.ctx.total = nand->ecc.total; + chip->ecc.steps = eccsteps; + chip->ecc.total = eccsteps * eccbytes; + nand->base.ecc.ctx.total = chip->ecc.total; if (mtd_ooblayout_count_eccbytes(mtd) != (eccsteps*eccbytes)) { pr_warn("invalid ecc layout\n"); goto fail; @@ -193,12 +195,12 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) nbc->eccmask[i] ^= 0xff; if (!eccstrength) - nand->ecc.strength = (eccbytes * 8) / fls(8 * eccsize); + chip->ecc.strength = (eccbytes * 8) / fls(8 * eccsize); - return nbc; + return 0; fail: - nand_bch_free(nbc); - return NULL; + nand_bch_free(chip); + return -EINVAL; } EXPORT_SYMBOL(nand_bch_init); @@ -206,8 +208,10 @@ EXPORT_SYMBOL(nand_bch_init); * nand_bch_free - Release NAND BCH ECC resources * @nbc: NAND BCH control structure */ -void nand_bch_free(struct nand_bch_control *nbc) +void nand_bch_free(struct nand_chip *chip) { + struct nand_bch_control *nbc = chip->ecc.priv; + if (nbc) { bch_free(nbc->bch); kfree(nbc->errloc); diff --git a/drivers/mtd/nand/raw/fsmc_nand.c b/drivers/mtd/nand/raw/fsmc_nand.c index c88421a1c078..984b05e6bd38 100644 --- a/drivers/mtd/nand/raw/fsmc_nand.c +++ b/drivers/mtd/nand/raw/fsmc_nand.c @@ -942,7 +942,7 @@ static int fsmc_nand_attach_chip(struct nand_chip *nand) /* * Don't set layout for BCH4 SW ECC. This will be - * generated later in nand_bch_init() later. + * generated later during BCH initialization. */ if (nand->ecc.engine_type == NAND_ECC_ENGINE_TYPE_ON_HOST) { switch (mtd->oobsize) { diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 7edcf08a2e81..287c521a3d43 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5203,6 +5203,7 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip) struct mtd_info *mtd = nand_to_mtd(chip); struct nand_device *nanddev = mtd_to_nanddev(mtd); struct nand_ecc_ctrl *ecc = &chip->ecc; + int ret; if (WARN_ON(ecc->engine_type != NAND_ECC_ENGINE_TYPE_SOFT)) return -EINVAL; @@ -5289,13 +5290,14 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip) ecc->strength = bytes * 8 / fls(8 * ecc->size); } - /* See nand_bch_init() for details. 
*/ + /* See the software BCH ECC initialization for details */ ecc->bytes = 0; - ecc->priv = nand_bch_init(mtd); - if (!ecc->priv) { + ret = nand_bch_init(chip); + if (ret) { WARN(1, "BCH ECC initialization failed!\n"); - return -EINVAL; + return ret; } + return 0; default: WARN(1, "Unsupported ECC algorithm!\n"); @@ -5955,7 +5957,7 @@ void nand_cleanup(struct nand_chip *chip) { if (chip->ecc.engine_type == NAND_ECC_ENGINE_TYPE_SOFT && chip->ecc.algo == NAND_ECC_ALGO_BCH) - nand_bch_free((struct nand_bch_control *)chip->ecc.priv); + nand_bch_free(chip); nanddev_cleanup(&chip->base); diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c index 0ef209e1cd87..6aab57336690 100644 --- a/drivers/mtd/nand/raw/omap2.c +++ b/drivers/mtd/nand/raw/omap2.c @@ -2047,10 +2047,10 @@ static int omap_nand_attach_chip(struct nand_chip *chip) /* Reserve one byte for the OMAP marker */ oobbytes_per_step = chip->ecc.bytes + 1; /* Software BCH library is used for locating errors */ - chip->ecc.priv = nand_bch_init(mtd); - if (!chip->ecc.priv) { + err = nand_bch_init(chip); + if (err) { dev_err(dev, "Unable to use BCH library\n"); - return -EINVAL; + return err; } break; @@ -2089,10 +2089,10 @@ static int omap_nand_attach_chip(struct nand_chip *chip) /* Reserve one byte for the OMAP marker */ oobbytes_per_step = chip->ecc.bytes + 1; /* Software BCH library is used for locating errors */ - chip->ecc.priv = nand_bch_init(mtd); - if (!chip->ecc.priv) { + err = nand_bch_init(chip); + if (err) { dev_err(dev, "unable to use BCH library\n"); - return -EINVAL; + return err; } break; @@ -2272,7 +2272,7 @@ return_error: if (!IS_ERR_OR_NULL(info->dma)) dma_release_channel(info->dma); if (nand_chip->ecc.priv) { - nand_bch_free(nand_chip->ecc.priv); + nand_bch_free(nand_chip); nand_chip->ecc.priv = NULL; } return err; @@ -2286,7 +2286,7 @@ static int omap_nand_remove(struct platform_device *pdev) int ret; if (nand_chip->ecc.priv) { - nand_bch_free(nand_chip->ecc.priv); + nand_bch_free(nand_chip); nand_chip->ecc.priv = NULL; } if (info->dma) diff --git a/include/linux/mtd/nand-ecc-sw-bch.h b/include/linux/mtd/nand-ecc-sw-bch.h index 1e1ee3af82b1..b62b8bd4669f 100644 --- a/include/linux/mtd/nand-ecc-sw-bch.h +++ b/include/linux/mtd/nand-ecc-sw-bch.h @@ -10,7 +10,6 @@ struct mtd_info; struct nand_chip; -struct nand_bch_control; #if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH) @@ -30,11 +29,11 @@ int nand_bch_correct_data(struct nand_chip *chip, u_char *dat, /* * Initialize BCH encoder/decoder */ -struct nand_bch_control *nand_bch_init(struct mtd_info *mtd); +int nand_bch_init(struct nand_chip *chip); /* * Release BCH encoder/decoder resources */ -void nand_bch_free(struct nand_bch_control *nbc); +void nand_bch_free(struct nand_chip *chip); #else /* !CONFIG_MTD_NAND_ECC_SW_BCH */ @@ -54,12 +53,12 @@ nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf, return -ENOTSUPP; } -static inline struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) +static inline int nand_bch_init(struct nand_chip *chip) { - return NULL; + return -ENOTSUPP; } -static inline void nand_bch_free(struct nand_bch_control *nbc) {} +static inline void nand_bch_free(struct nand_chip *chip) {} #endif /* CONFIG_MTD_NAND_ECC_SW_BCH */ -- cgit From e3010bd3ef1eda13f08155fe43846a64d0990a86 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:09 +0200 Subject: mtd: nand: ecc-bch: Return only valid error codes When a function is not available, returning -ENOTSUPP makes much more sense than returning -1. 
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-6-miquel.raynal@bootlin.com --- include/linux/mtd/nand-ecc-sw-bch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand-ecc-sw-bch.h b/include/linux/mtd/nand-ecc-sw-bch.h index b62b8bd4669f..d92d2abcfcef 100644 --- a/include/linux/mtd/nand-ecc-sw-bch.h +++ b/include/linux/mtd/nand-ecc-sw-bch.h @@ -43,7 +43,7 @@ static inline int nand_bch_calculate_ecc(struct nand_chip *chip, const u_char *dat, u_char *ecc_code) { - return -1; + return -ENOTSUPP; } static inline int -- cgit From 127aae6077562e3926ebad7c782123c2afe95846 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:10 +0200 Subject: mtd: nand: ecc-bch: Drop mtd_nand_has_bch() Like for any other compilation option, use the IS_ENABLED() macro instead of hardcoding it. By droping this helper we can get rid of the BCH header in nandsim.c. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-7-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/nand_base.c | 2 +- drivers/mtd/nand/raw/nandsim.c | 3 +-- include/linux/mtd/nand-ecc-sw-bch.h | 4 ---- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 287c521a3d43..b49483d04755 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5231,7 +5231,7 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip) return 0; case NAND_ECC_ALGO_BCH: - if (!mtd_nand_has_bch()) { + if (!IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH)) { WARN(1, "CONFIG_MTD_NAND_ECC_SW_BCH not enabled\n"); return -EINVAL; } diff --git a/drivers/mtd/nand/raw/nandsim.c b/drivers/mtd/nand/raw/nandsim.c index 9c940ead66fd..f2b9250c0ea8 100644 --- a/drivers/mtd/nand/raw/nandsim.c +++ b/drivers/mtd/nand/raw/nandsim.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -2214,7 +2213,7 @@ static int ns_attach_chip(struct nand_chip *chip) if (!bch) return 0; - if (!mtd_nand_has_bch()) { + if (!IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH)) { NS_ERR("BCH ECC support is disabled\n"); return -EINVAL; } diff --git a/include/linux/mtd/nand-ecc-sw-bch.h b/include/linux/mtd/nand-ecc-sw-bch.h index d92d2abcfcef..7b62996d9e01 100644 --- a/include/linux/mtd/nand-ecc-sw-bch.h +++ b/include/linux/mtd/nand-ecc-sw-bch.h @@ -13,8 +13,6 @@ struct nand_chip; #if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH) -static inline int mtd_nand_has_bch(void) { return 1; } - /* * Calculate BCH ecc code */ @@ -37,8 +35,6 @@ void nand_bch_free(struct nand_chip *chip); #else /* !CONFIG_MTD_NAND_ECC_SW_BCH */ -static inline int mtd_nand_has_bch(void) { return 0; } - static inline int nand_bch_calculate_ecc(struct nand_chip *chip, const u_char *dat, u_char *ecc_code) -- cgit From ea146d7fbf5081b5eb2777df5e30ed70ca68985b Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:11 +0200 Subject: mtd: nand: ecc-bch: Update the prototypes to be more generic These functions must be usable by the main NAND core, so their names must be technology-agnostic as well as the parameters. Hence, we pass a generic nand_device instead of a raw nand_chip structure. As it seems that changing the raw NAND functions to always pass a generic NAND device is a lost of time, we prefer to create dedicated raw NAND wrappers that will be useful in the near future to do the translation. 
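The translation layer in a nutshell (xxx is a placeholder; the real rawnand_sw_bch_* wrappers appear in the diff below):

  /* Generic engine API: technology-agnostic, takes a nand_device */
  int nand_ecc_sw_xxx_correct(struct nand_device *nand, unsigned char *buf,
                              unsigned char *read_ecc,
                              unsigned char *calc_ecc);

  /* Raw NAND wrapper: nand_chip embeds a nand_device as ->base */
  static int rawnand_sw_xxx_correct(struct nand_chip *chip,
                                    unsigned char *buf,
                                    unsigned char *read_ecc,
                                    unsigned char *calc_ecc)
  {
          return nand_ecc_sw_xxx_correct(&chip->base, buf, read_ecc,
                                         calc_ecc);
  }

The wrapper costs one pointer adjustment per call and keeps the raw NAND call sites untouched while the core moves to nand_device.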
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-8-miquel.raynal@bootlin.com --- drivers/mtd/nand/ecc-sw-bch.c | 47 +++++++++++++++++++++---------------- drivers/mtd/nand/raw/nand_base.c | 44 ++++++++++++++++++++++++++++++---- drivers/mtd/nand/raw/omap2.c | 23 +++++++----------- include/linux/mtd/nand-ecc-sw-bch.h | 45 +++++++++++++---------------------- include/linux/mtd/rawnand.h | 5 ++++ 5 files changed, 97 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/ecc-sw-bch.c b/drivers/mtd/nand/ecc-sw-bch.c index b6bfee9805ae..eae81bace01c 100644 --- a/drivers/mtd/nand/ecc-sw-bch.c +++ b/drivers/mtd/nand/ecc-sw-bch.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -29,14 +30,15 @@ struct nand_bch_control { }; /** - * nand_bch_calcuate_ecc - Calculate the ECC corresponding to a data block - * @chip: NAND chip object + * nand_ecc_sw_bch_calculate - Calculate the ECC corresponding to a data block + * @nand: NAND device * @buf: Input buffer with raw data * @code: Output buffer with ECC */ -int nand_bch_calculate_ecc(struct nand_chip *chip, const unsigned char *buf, - unsigned char *code) +int nand_ecc_sw_bch_calculate(struct nand_device *nand, + const unsigned char *buf, unsigned char *code) { + struct nand_chip *chip = mtd_to_nand(nanddev_to_mtd(nand)); struct nand_bch_control *nbc = chip->ecc.priv; unsigned int i; @@ -49,20 +51,21 @@ int nand_bch_calculate_ecc(struct nand_chip *chip, const unsigned char *buf, return 0; } -EXPORT_SYMBOL(nand_bch_calculate_ecc); +EXPORT_SYMBOL(nand_ecc_sw_bch_calculate); /** - * nand_bch_correct_data - Detect, correct and report bit error(s) - * @chip: NAND chip object + * nand_ecc_sw_bch_correct - Detect, correct and report bit error(s) + * @nand: NAND device * @buf: Raw data read from the chip * @read_ecc: ECC bytes from the chip * @calc_ecc: ECC calculated from the raw data * * Detect and correct bit errors for a data block. 
*/ -int nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf, - unsigned char *read_ecc, unsigned char *calc_ecc) +int nand_ecc_sw_bch_correct(struct nand_device *nand, unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc) { + struct nand_chip *chip = mtd_to_nand(nanddev_to_mtd(nand)); struct nand_bch_control *nbc = chip->ecc.priv; unsigned int *errloc = nbc->errloc; int i, count; @@ -86,11 +89,11 @@ int nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf, return count; } -EXPORT_SYMBOL(nand_bch_correct_data); +EXPORT_SYMBOL(nand_ecc_sw_bch_correct); /** - * nand_bch_init - Initialize software BCH ECC engine - * @chip: NAND chip object + * nand_ecc_sw_bch_init - Initialize software BCH ECC engine + * @nand: NAND device * * Returns: a pointer to a new NAND BCH control structure, or NULL upon failure * @@ -105,9 +108,10 @@ EXPORT_SYMBOL(nand_bch_correct_data); * @eccsize = 512 (thus, m = 13 is the smallest integer such that 2^m - 1 > 512 * 8) * @eccbytes = 7 (7 bytes are required to store m * t = 13 * 4 = 52 bits) */ -int nand_bch_init(struct nand_chip *chip) +int nand_ecc_sw_bch_init(struct nand_device *nand) { - struct mtd_info *mtd = nand_to_mtd(chip); + struct mtd_info *mtd = nanddev_to_mtd(nand); + struct nand_chip *chip = mtd_to_nand(mtd); unsigned int m, t, eccsteps, i; struct nand_bch_control *nbc = NULL; unsigned char *erased_page; @@ -198,18 +202,21 @@ int nand_bch_init(struct nand_chip *chip) chip->ecc.strength = (eccbytes * 8) / fls(8 * eccsize); return 0; + fail: - nand_bch_free(chip); + nand_ecc_sw_bch_cleanup(nand); + return -EINVAL; } -EXPORT_SYMBOL(nand_bch_init); +EXPORT_SYMBOL(nand_ecc_sw_bch_init); /** - * nand_bch_free - Release NAND BCH ECC resources - * @nbc: NAND BCH control structure + * nand_ecc_sw_bch_cleanup - Cleanup software BCH ECC resources + * @nand: NAND device */ -void nand_bch_free(struct nand_chip *chip) +void nand_ecc_sw_bch_cleanup(struct nand_device *nand) { + struct nand_chip *chip = mtd_to_nand(nanddev_to_mtd(nand)); struct nand_bch_control *nbc = chip->ecc.priv; if (nbc) { @@ -219,7 +226,7 @@ void nand_bch_free(struct nand_chip *chip) kfree(nbc); } } -EXPORT_SYMBOL(nand_bch_free); +EXPORT_SYMBOL(nand_ecc_sw_bch_cleanup); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ivan Djelic "); diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index b49483d04755..2ef674c10ac8 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5139,6 +5139,42 @@ static void nand_scan_ident_cleanup(struct nand_chip *chip) kfree(chip->parameters.onfi); } +int rawnand_sw_bch_init(struct nand_chip *chip) +{ + struct nand_device *base = &chip->base; + + return nand_ecc_sw_bch_init(base); +} +EXPORT_SYMBOL(rawnand_sw_bch_init); + +static int rawnand_sw_bch_calculate(struct nand_chip *chip, + const unsigned char *buf, + unsigned char *code) +{ + struct nand_device *base = &chip->base; + + return nand_ecc_sw_bch_calculate(base, buf, code); +} + +int rawnand_sw_bch_correct(struct nand_chip *chip, unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc) +{ + struct nand_device *base = &chip->base; + + return nand_ecc_sw_bch_correct(base, buf, read_ecc, calc_ecc); +} +EXPORT_SYMBOL(rawnand_sw_bch_correct); + +void rawnand_sw_bch_cleanup(struct nand_chip *chip) +{ + struct nand_device *base = &chip->base; + + nand_ecc_sw_bch_cleanup(base); + + chip->ecc.priv = NULL; +} +EXPORT_SYMBOL(rawnand_sw_bch_cleanup); + static int nand_set_ecc_on_host_ops(struct nand_chip *chip) { 
struct nand_ecc_ctrl *ecc = &chip->ecc; @@ -5235,8 +5271,8 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip) WARN(1, "CONFIG_MTD_NAND_ECC_SW_BCH not enabled\n"); return -EINVAL; } - ecc->calculate = nand_bch_calculate_ecc; - ecc->correct = nand_bch_correct_data; + ecc->calculate = rawnand_sw_bch_calculate; + ecc->correct = rawnand_sw_bch_correct; ecc->read_page = nand_read_page_swecc; ecc->read_subpage = nand_read_subpage; ecc->write_page = nand_write_page_swecc; @@ -5292,7 +5328,7 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip) /* See the software BCH ECC initialization for details */ ecc->bytes = 0; - ret = nand_bch_init(chip); + ret = rawnand_sw_bch_init(chip); if (ret) { WARN(1, "BCH ECC initialization failed!\n"); return ret; @@ -5957,7 +5993,7 @@ void nand_cleanup(struct nand_chip *chip) { if (chip->ecc.engine_type == NAND_ECC_ENGINE_TYPE_SOFT && chip->ecc.algo == NAND_ECC_ALGO_BCH) - nand_bch_free(chip); + rawnand_sw_bch_cleanup(chip); nanddev_cleanup(&chip->base); diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c index 6aab57336690..4cc47ab7f01a 100644 --- a/drivers/mtd/nand/raw/omap2.c +++ b/drivers/mtd/nand/raw/omap2.c @@ -23,7 +23,6 @@ #include #include -#include #include #include @@ -2041,13 +2040,13 @@ static int omap_nand_attach_chip(struct nand_chip *chip) chip->ecc.bytes = 7; chip->ecc.strength = 4; chip->ecc.hwctl = omap_enable_hwecc_bch; - chip->ecc.correct = nand_bch_correct_data; + chip->ecc.correct = rawnand_sw_bch_correct; chip->ecc.calculate = omap_calculate_ecc_bch_sw; mtd_set_ooblayout(mtd, &omap_sw_ooblayout_ops); /* Reserve one byte for the OMAP marker */ oobbytes_per_step = chip->ecc.bytes + 1; /* Software BCH library is used for locating errors */ - err = nand_bch_init(chip); + err = rawnand_sw_bch_init(chip); if (err) { dev_err(dev, "Unable to use BCH library\n"); return err; @@ -2083,13 +2082,13 @@ static int omap_nand_attach_chip(struct nand_chip *chip) chip->ecc.bytes = 13; chip->ecc.strength = 8; chip->ecc.hwctl = omap_enable_hwecc_bch; - chip->ecc.correct = nand_bch_correct_data; + chip->ecc.correct = rawnand_sw_bch_correct; chip->ecc.calculate = omap_calculate_ecc_bch_sw; mtd_set_ooblayout(mtd, &omap_sw_ooblayout_ops); /* Reserve one byte for the OMAP marker */ oobbytes_per_step = chip->ecc.bytes + 1; /* Software BCH library is used for locating errors */ - err = nand_bch_init(chip); + err = rawnand_sw_bch_init(chip); if (err) { dev_err(dev, "unable to use BCH library\n"); return err; @@ -2195,7 +2194,6 @@ static int omap_nand_probe(struct platform_device *pdev) nand_chip = &info->nand; mtd = nand_to_mtd(nand_chip); mtd->dev.parent = &pdev->dev; - nand_chip->ecc.priv = NULL; nand_set_flash_node(nand_chip, dev->of_node); if (!mtd->name) { @@ -2271,10 +2269,9 @@ cleanup_nand: return_error: if (!IS_ERR_OR_NULL(info->dma)) dma_release_channel(info->dma); - if (nand_chip->ecc.priv) { - nand_bch_free(nand_chip); - nand_chip->ecc.priv = NULL; - } + + rawnand_sw_bch_cleanup(nand_chip); + return err; } @@ -2285,10 +2282,8 @@ static int omap_nand_remove(struct platform_device *pdev) struct omap_nand_info *info = mtd_to_omap(mtd); int ret; - if (nand_chip->ecc.priv) { - nand_bch_free(nand_chip); - nand_chip->ecc.priv = NULL; - } + rawnand_sw_bch_cleanup(nand_chip); + if (info->dma) dma_release_channel(info->dma); ret = mtd_device_unregister(mtd); diff --git a/include/linux/mtd/nand-ecc-sw-bch.h b/include/linux/mtd/nand-ecc-sw-bch.h index 7b62996d9e01..f0caee3b03d0 100644 --- a/include/linux/mtd/nand-ecc-sw-bch.h +++ 
b/include/linux/mtd/nand-ecc-sw-bch.h @@ -8,53 +8,40 @@ #ifndef __MTD_NAND_ECC_SW_BCH_H__ #define __MTD_NAND_ECC_SW_BCH_H__ -struct mtd_info; -struct nand_chip; +#include #if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH) -/* - * Calculate BCH ecc code - */ -int nand_bch_calculate_ecc(struct nand_chip *chip, const u_char *dat, - u_char *ecc_code); - -/* - * Detect and correct bit errors - */ -int nand_bch_correct_data(struct nand_chip *chip, u_char *dat, - u_char *read_ecc, u_char *calc_ecc); -/* - * Initialize BCH encoder/decoder - */ -int nand_bch_init(struct nand_chip *chip); -/* - * Release BCH encoder/decoder resources - */ -void nand_bch_free(struct nand_chip *chip); +int nand_ecc_sw_bch_calculate(struct nand_device *nand, + const unsigned char *buf, unsigned char *code); +int nand_ecc_sw_bch_correct(struct nand_device *nand, unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc); +int nand_ecc_sw_bch_init(struct nand_device *nand); +void nand_ecc_sw_bch_cleanup(struct nand_device *nand); #else /* !CONFIG_MTD_NAND_ECC_SW_BCH */ -static inline int -nand_bch_calculate_ecc(struct nand_chip *chip, const u_char *dat, - u_char *ecc_code) +static inline int nand_ecc_sw_bch_calculate(struct nand_device *nand, + const unsigned char *buf, + unsigned char *code) { return -ENOTSUPP; } -static inline int -nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf, - unsigned char *read_ecc, unsigned char *calc_ecc) +static inline int nand_ecc_sw_bch_correct(struct nand_device *nand, + unsigned char *buf, + unsigned char *read_ecc, + unsigned char *calc_ecc) { return -ENOTSUPP; } -static inline int nand_bch_init(struct nand_chip *chip) +static inline int nand_ecc_sw_bch_init(struct nand_device *nand) { return -ENOTSUPP; } -static inline void nand_bch_free(struct nand_chip *chip) {} +static inline void nand_ecc_sw_bch_cleanup(struct nand_device *nand) {} #endif /* CONFIG_MTD_NAND_ECC_SW_BCH */ diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index aac07940de09..23623beaad1d 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1303,6 +1303,11 @@ static inline int nand_opcode_8bits(unsigned int command) return 0; } +int rawnand_sw_bch_init(struct nand_chip *chip); +int rawnand_sw_bch_correct(struct nand_chip *chip, unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc); +void rawnand_sw_bch_cleanup(struct nand_chip *chip); + int nand_check_erased_ecc_chunk(void *data, int datalen, void *ecc, int ecclen, void *extraoob, int extraooblen, -- cgit From 80fe603160a4732a08f0f08f3e3312a3f3a79eee Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:12 +0200 Subject: mtd: nand: ecc-bch: Stop using raw NAND structures This code is meant to be reused by the SPI-NAND core. Now that the driver has been cleaned and reorganized, use a generic ECC engine object to store the driver's data instead of accessing members of the nand_chip structure. 
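To make the intended end state concrete before the diff below, here is a minimal sketch of how a non-raw NAND user (for example a future SPI-NAND path) could drive the software BCH engine purely through struct nand_device. The function name and the hard-coded geometry are illustrative only; struct nand_ecc_sw_bch_conf is the engine-private data this patch introduces, and the MTD OOB layout is assumed to already reserve the ECC bytes:

static int example_setup_sw_bch(struct nand_device *nand)
{
	struct nand_ecc_sw_bch_conf *engine_conf;
	int ret;

	/* 4-bit correction per 512-byte step, i.e. 7 ECC bytes per step */
	nand->ecc.ctx.conf.step_size = 512;
	nand->ecc.ctx.conf.strength = 4;

	engine_conf = kzalloc(sizeof(*engine_conf), GFP_KERNEL);
	if (!engine_conf)
		return -ENOMEM;
	engine_conf->code_size = 7;
	nand->ecc.ctx.priv = engine_conf;

	ret = nand_ecc_sw_bch_init(nand);
	if (ret)
		kfree(engine_conf);
	return ret;
}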
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-9-miquel.raynal@bootlin.com --- drivers/mtd/nand/ecc-sw-bch.c | 114 +++++++++++++----------------------- drivers/mtd/nand/raw/nand_base.c | 38 ++++++++++-- include/linux/mtd/nand-ecc-sw-bch.h | 25 ++++++++ 3 files changed, 98 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/ecc-sw-bch.c b/drivers/mtd/nand/ecc-sw-bch.c index eae81bace01c..16a54bd2ca31 100644 --- a/drivers/mtd/nand/ecc-sw-bch.c +++ b/drivers/mtd/nand/ecc-sw-bch.c @@ -11,23 +11,8 @@ #include #include #include -#include -#include #include #include -#include - -/** - * struct nand_bch_control - private NAND BCH control structure - * @bch: BCH control structure - * @errloc: error location array - * @eccmask: XOR ecc mask, allows erased pages to be decoded as valid - */ -struct nand_bch_control { - struct bch_control *bch; - unsigned int *errloc; - unsigned char *eccmask; -}; /** * nand_ecc_sw_bch_calculate - Calculate the ECC corresponding to a data block @@ -38,16 +23,15 @@ struct nand_bch_control { int nand_ecc_sw_bch_calculate(struct nand_device *nand, const unsigned char *buf, unsigned char *code) { - struct nand_chip *chip = mtd_to_nand(nanddev_to_mtd(nand)); - struct nand_bch_control *nbc = chip->ecc.priv; + struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv; unsigned int i; - memset(code, 0, chip->ecc.bytes); - bch_encode(nbc->bch, buf, chip->ecc.size, code); + memset(code, 0, engine_conf->code_size); + bch_encode(engine_conf->bch, buf, nand->ecc.ctx.conf.step_size, code); /* apply mask so that an erased page is a valid codeword */ - for (i = 0; i < chip->ecc.bytes; i++) - code[i] ^= nbc->eccmask[i]; + for (i = 0; i < engine_conf->code_size; i++) + code[i] ^= engine_conf->eccmask[i]; return 0; } @@ -65,16 +49,16 @@ EXPORT_SYMBOL(nand_ecc_sw_bch_calculate); int nand_ecc_sw_bch_correct(struct nand_device *nand, unsigned char *buf, unsigned char *read_ecc, unsigned char *calc_ecc) { - struct nand_chip *chip = mtd_to_nand(nanddev_to_mtd(nand)); - struct nand_bch_control *nbc = chip->ecc.priv; - unsigned int *errloc = nbc->errloc; + struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv; + unsigned int step_size = nand->ecc.ctx.conf.step_size; + unsigned int *errloc = engine_conf->errloc; int i, count; - count = bch_decode(nbc->bch, NULL, chip->ecc.size, read_ecc, calc_ecc, - NULL, errloc); + count = bch_decode(engine_conf->bch, NULL, step_size, read_ecc, + calc_ecc, NULL, errloc); if (count > 0) { for (i = 0; i < count; i++) { - if (errloc[i] < (chip->ecc.size * 8)) + if (errloc[i] < (step_size * 8)) /* The error is in the data area: correct it */ buf[errloc[i] >> 3] ^= (1 << (errloc[i] & 7)); @@ -97,31 +81,30 @@ EXPORT_SYMBOL(nand_ecc_sw_bch_correct); * * Returns: a pointer to a new NAND BCH control structure, or NULL upon failure * - * Initialize NAND BCH error correction. Parameters @eccsize and @eccbytes - * are used to compute the following BCH parameters: + * Initialize NAND BCH error correction. @nand.ecc parameters 'step_size' and + * 'bytes' are used to compute the following BCH parameters: * m, the Galois field order * t, the error correction capability - * @eccbytes should be equal to the number of bytes required to store m * t + * 'bytes' should be equal to the number of bytes required to store m * t * bits, where m is such that 2^m - 1 > step_size * 8. 
* * Example: to configure 4 bit correction per 512 bytes, you should pass - * @eccsize = 512 (thus, m = 13 is the smallest integer such that 2^m - 1 > 512 * 8) - * @eccbytes = 7 (7 bytes are required to store m * t = 13 * 4 = 52 bits) + * step_size = 512 (thus, m = 13 is the smallest integer such that 2^m - 1 > 512 * 8) + * bytes = 7 (7 bytes are required to store m * t = 13 * 4 = 52 bits) */ int nand_ecc_sw_bch_init(struct nand_device *nand) { struct mtd_info *mtd = nanddev_to_mtd(nand); - struct nand_chip *chip = mtd_to_nand(mtd); unsigned int m, t, eccsteps, i; - struct nand_bch_control *nbc = NULL; + struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv; unsigned char *erased_page; - unsigned int eccsize = chip->ecc.size; - unsigned int eccbytes = chip->ecc.bytes; - unsigned int eccstrength = chip->ecc.strength; + unsigned int eccsize = nand->ecc.ctx.conf.step_size; + unsigned int eccbytes = engine_conf->code_size; + unsigned int eccstrength = nand->ecc.ctx.conf.strength; if (!eccbytes && eccstrength) { eccbytes = DIV_ROUND_UP(eccstrength * fls(8 * eccsize), 8); - chip->ecc.bytes = eccbytes; + engine_conf->code_size = eccbytes; } if (!eccsize || !eccbytes) { @@ -132,20 +115,14 @@ int nand_ecc_sw_bch_init(struct nand_device *nand) m = fls(1+8*eccsize); t = (eccbytes*8)/m; - nbc = kzalloc(sizeof(*nbc), GFP_KERNEL); - if (!nbc) - return -ENOMEM; - - chip->ecc.priv = nbc; - - nbc->bch = bch_init(m, t, 0, false); - if (!nbc->bch) - goto fail; + engine_conf->bch = bch_init(m, t, 0, false); + if (!engine_conf->bch) + return -EINVAL; /* verify that eccbytes has the expected value */ - if (nbc->bch->ecc_bytes != eccbytes) { + if (engine_conf->bch->ecc_bytes != eccbytes) { pr_warn("invalid eccbytes %u, should be %u\n", - eccbytes, nbc->bch->ecc_bytes); + eccbytes, engine_conf->bch->ecc_bytes); goto fail; } @@ -163,25 +140,15 @@ int nand_ecc_sw_bch_init(struct nand_device *nand) goto fail; } - /* - * ecc->steps and ecc->total might be used by mtd->ooblayout->ecc(), - * which is called by mtd_ooblayout_count_eccbytes(). - * Make sure they are properly initialized before calling - * mtd_ooblayout_count_eccbytes(). - * FIXME: we should probably rework the sequencing in nand_scan_tail() - * to avoid setting those fields twice. 
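For concreteness, plugging the kernel-doc example above into the code's own formulas (page size assumed to be 4096 bytes for illustration):

	m        = fls(1 + 8 * 512)  = 13
	t        = (7 * 8) / 13      = 4
	eccsteps = 4096 / 512        = 8
	total    = eccsteps * 7      = 56 ECC bytes

which is exactly what the mtd_ooblayout_count_eccbytes() check below must account for.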
- */ - chip->ecc.steps = eccsteps; - chip->ecc.total = eccsteps * eccbytes; - nand->base.ecc.ctx.total = chip->ecc.total; if (mtd_ooblayout_count_eccbytes(mtd) != (eccsteps*eccbytes)) { pr_warn("invalid ecc layout\n"); goto fail; } - nbc->eccmask = kzalloc(eccbytes, GFP_KERNEL); - nbc->errloc = kmalloc_array(t, sizeof(*nbc->errloc), GFP_KERNEL); - if (!nbc->eccmask || !nbc->errloc) + engine_conf->eccmask = kzalloc(eccbytes, GFP_KERNEL); + engine_conf->errloc = kmalloc_array(t, sizeof(*engine_conf->errloc), + GFP_KERNEL); + if (!engine_conf->eccmask || !engine_conf->errloc) goto fail; /* @@ -192,14 +159,15 @@ int nand_ecc_sw_bch_init(struct nand_device *nand) goto fail; memset(erased_page, 0xff, eccsize); - bch_encode(nbc->bch, erased_page, eccsize, nbc->eccmask); + bch_encode(engine_conf->bch, erased_page, eccsize, + engine_conf->eccmask); kfree(erased_page); for (i = 0; i < eccbytes; i++) - nbc->eccmask[i] ^= 0xff; + engine_conf->eccmask[i] ^= 0xff; if (!eccstrength) - chip->ecc.strength = (eccbytes * 8) / fls(8 * eccsize); + nand->ecc.ctx.conf.strength = (eccbytes * 8) / fls(8 * eccsize); return 0; @@ -216,14 +184,12 @@ EXPORT_SYMBOL(nand_ecc_sw_bch_init); */ void nand_ecc_sw_bch_cleanup(struct nand_device *nand) { - struct nand_chip *chip = mtd_to_nand(nanddev_to_mtd(nand)); - struct nand_bch_control *nbc = chip->ecc.priv; - - if (nbc) { - bch_free(nbc->bch); - kfree(nbc->errloc); - kfree(nbc->eccmask); - kfree(nbc); + struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv; + + if (engine_conf) { + bch_free(engine_conf->bch); + kfree(engine_conf->errloc); + kfree(engine_conf->eccmask); } } EXPORT_SYMBOL(nand_ecc_sw_bch_cleanup); diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 2ef674c10ac8..03106bf629dd 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5142,8 +5142,33 @@ static void nand_scan_ident_cleanup(struct nand_chip *chip) int rawnand_sw_bch_init(struct nand_chip *chip) { struct nand_device *base = &chip->base; + struct nand_ecc_sw_bch_conf *engine_conf; + int ret; + + base->ecc.user_conf.engine_type = NAND_ECC_ENGINE_TYPE_SOFT; + base->ecc.user_conf.algo = NAND_ECC_ALGO_BCH; + base->ecc.user_conf.step_size = chip->ecc.size; + base->ecc.user_conf.strength = chip->ecc.strength; + + engine_conf = kzalloc(sizeof(*engine_conf), GFP_KERNEL); + if (!engine_conf) + return -ENOMEM; + + engine_conf->code_size = chip->ecc.bytes; + + base->ecc.ctx.priv = engine_conf; - return nand_ecc_sw_bch_init(base); + ret = nand_ecc_sw_bch_init(base); + if (ret) + kfree(base->ecc.ctx.priv); + + chip->ecc.size = base->ecc.ctx.conf.step_size; + chip->ecc.strength = base->ecc.ctx.conf.strength; + chip->ecc.total = base->ecc.ctx.total; + chip->ecc.steps = engine_conf->nsteps; + chip->ecc.bytes = engine_conf->code_size; + + return ret; } EXPORT_SYMBOL(rawnand_sw_bch_init); @@ -5171,7 +5196,7 @@ void rawnand_sw_bch_cleanup(struct nand_chip *chip) nand_ecc_sw_bch_cleanup(base); - chip->ecc.priv = NULL; + kfree(base->ecc.ctx.priv); } EXPORT_SYMBOL(rawnand_sw_bch_cleanup); @@ -5794,15 +5819,18 @@ static int nand_scan_tail(struct nand_chip *chip) * Set the number of read / write steps for one page depending on ECC * mode. 
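From a raw NAND driver's point of view the wrappers above preserve the old calling convention. A hypothetical attach-chip hook, modelled on the omap2 conversion earlier in this series (the example_* names are placeholders, not part of this series):

static int example_attach_chip(struct nand_chip *chip)
{
	chip->ecc.size = 512;
	chip->ecc.bytes = 7;
	chip->ecc.strength = 4;
	chip->ecc.correct = rawnand_sw_bch_correct;
	chip->ecc.calculate = example_calculate_ecc_hw; /* placeholder */

	/* The software BCH library is used for locating errors */
	return rawnand_sw_bch_init(chip);
}

Note that on failure the wrapper frees its private context itself, so a caller only needs to propagate the error.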
*/ - ecc->steps = mtd->writesize / ecc->size; + if (!ecc->steps) + ecc->steps = mtd->writesize / ecc->size; if (ecc->steps * ecc->size != mtd->writesize) { WARN(1, "Invalid ECC parameters\n"); ret = -EINVAL; goto err_nand_manuf_cleanup; } - ecc->total = ecc->steps * ecc->bytes; - chip->base.ecc.ctx.total = ecc->total; + if (!ecc->total) { + ecc->total = ecc->steps * ecc->bytes; + chip->base.ecc.ctx.total = ecc->total; + } if (ecc->total > mtd->oobsize) { WARN(1, "Total number of ECC bytes exceeded oobsize\n"); diff --git a/include/linux/mtd/nand-ecc-sw-bch.h b/include/linux/mtd/nand-ecc-sw-bch.h index f0caee3b03d0..ce005528e55f 100644 --- a/include/linux/mtd/nand-ecc-sw-bch.h +++ b/include/linux/mtd/nand-ecc-sw-bch.h @@ -9,6 +9,31 @@ #define __MTD_NAND_ECC_SW_BCH_H__ #include +#include + +/** + * struct nand_ecc_sw_bch_conf - private software BCH ECC engine structure + * @reqooblen: Save the actual user OOB length requested before overwriting it + * @spare_oobbuf: Spare OOB buffer if none is provided + * @code_size: Number of bytes needed to store a code (one code per step) + * @nsteps: Number of steps + * @calc_buf: Buffer to use when calculating ECC bytes + * @code_buf: Buffer to use when reading (raw) ECC bytes from the chip + * @bch: BCH control structure + * @errloc: error location array + * @eccmask: XOR ecc mask, allows erased pages to be decoded as valid + */ +struct nand_ecc_sw_bch_conf { + unsigned int reqooblen; + void *spare_oobbuf; + unsigned int code_size; + unsigned int nsteps; + u8 *calc_buf; + u8 *code_buf; + struct bch_control *bch; + unsigned int *errloc; + unsigned char *eccmask; +}; #if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH) -- cgit From 57e3cebd022fbc035dcf190ac789fd2ffc747f5b Mon Sep 17 00:00:00 2001 From: Shenming Lu Date: Sat, 28 Nov 2020 22:18:57 +0800 Subject: KVM: arm64: Delay the polling of the GICR_VPENDBASER.Dirty bit In order to reduce the impact of the VPT parsing happening on the GIC, we can split the vcpu residency in two phases: - programming GICR_VPENDBASER: this still happens in vcpu_load() - checking for the VPT parsing to be complete: this can happen on vcpu entry (in kvm_vgic_flush_hwstate()) This allows the GIC and the CPU to work in parallel, removing some of the entry overhead. Suggested-by: Marc Zyngier Signed-off-by: Shenming Lu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201128141857.983-3-lushenming@huawei.com --- arch/arm64/kvm/vgic/vgic-v4.c | 12 ++++++++++++ arch/arm64/kvm/vgic/vgic.c | 3 +++ drivers/irqchip/irq-gic-v3-its.c | 12 ++++++++---- drivers/irqchip/irq-gic-v4.c | 19 +++++++++++++++++++ include/kvm/arm_vgic.h | 1 + include/linux/irqchip/arm-gic-v4.h | 4 ++++ 6 files changed, 47 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index b5fa73c9fd35..66508b03094f 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -353,6 +353,18 @@ int vgic_v4_load(struct kvm_vcpu *vcpu) return err; } +void vgic_v4_commit(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + /* + * No need to wait for the vPE to be ready across a shallow guest + * exit, as only a vcpu_put will invalidate it.
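Laid out end to end, the sequence this patch produces looks as follows (a schematic of the call flow, not verbatim kernel code):

/*
 * vcpu_load()
 *   vgic_v4_load()           SCHEDULE_VPE: program GICR_VPENDBASER;
 *                            the GIC starts parsing the VPT
 *   ...                      entry work proceeds in parallel
 * kvm_vgic_flush_hwstate()
 *   vgic_v4_commit()
 *     its_commit_vpe()       COMMIT_VPE: poll GICR_VPENDBASER.Dirty,
 *                            then set vpe->ready
 * vcpu_put()
 *   vgic_v4_put()            makes the vPE non-resident and clears
 *                            vpe->ready
 */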
+ */ + if (!vpe->ready) + its_commit_vpe(vpe); +} + static struct vgic_its *vgic_get_its(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *irq_entry) { diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index c3643b7f101b..1c597c9885fa 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -915,6 +915,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) if (can_access_vgic_from_kernel()) vgic_restore_state(vcpu); + + if (vgic_supports_direct_msis(vcpu->kvm)) + vgic_v4_commit(vcpu); } void kvm_vgic_load(struct kvm_vcpu *vcpu) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0fec31931e11..7db602434ac5 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3842,8 +3842,6 @@ static void its_vpe_schedule(struct its_vpe *vpe) val |= vpe->idai ? GICR_VPENDBASER_IDAI : 0; val |= GICR_VPENDBASER_Valid; gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); - - its_wait_vpt_parse_complete(); } static void its_vpe_deschedule(struct its_vpe *vpe) @@ -3891,6 +3889,10 @@ static int its_vpe_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) its_vpe_deschedule(vpe); return 0; + case COMMIT_VPE: + its_wait_vpt_parse_complete(); + return 0; + case INVALL_VPE: its_vpe_invall(vpe); return 0; @@ -4052,8 +4054,6 @@ static void its_vpe_4_1_schedule(struct its_vpe *vpe, val |= FIELD_PREP(GICR_VPENDBASER_4_1_VPEID, vpe->vpe_id); gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); - - its_wait_vpt_parse_complete(); } static void its_vpe_4_1_deschedule(struct its_vpe *vpe, @@ -4128,6 +4128,10 @@ static int its_vpe_4_1_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) its_vpe_4_1_deschedule(vpe, info); return 0; + case COMMIT_VPE: + its_wait_vpt_parse_complete(); + return 0; + case INVALL_VPE: its_vpe_4_1_invall(vpe); return 0; diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 0c18714ae13e..5d1dc9915272 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -232,6 +232,8 @@ int its_make_vpe_non_resident(struct its_vpe *vpe, bool db) if (!ret) vpe->resident = false; + vpe->ready = false; + return ret; } @@ -258,6 +260,23 @@ int its_make_vpe_resident(struct its_vpe *vpe, bool g0en, bool g1en) return ret; } +int its_commit_vpe(struct its_vpe *vpe) +{ + struct its_cmd_info info = { + .cmd_type = COMMIT_VPE, + }; + int ret; + + WARN_ON(preemptible()); + + ret = its_send_vpe_cmd(vpe, &info); + if (!ret) + vpe->ready = true; + + return ret; +} + + int its_invall_vpe(struct its_vpe *vpe) { struct its_cmd_info info = { diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index a8d8fdcd3723..3d74f1060bd1 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -402,6 +402,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq, struct kvm_kernel_irq_routing_entry *irq_entry); int vgic_v4_load(struct kvm_vcpu *vcpu); +void vgic_v4_commit(struct kvm_vcpu *vcpu); int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db); #endif /* __KVM_ARM_VGIC_H */ diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 6976b8331b60..943c3411ca10 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -39,6 +39,8 @@ struct its_vpe { irq_hw_number_t vpe_db_lpi; /* VPE resident */ bool resident; + /* VPT parse complete */ + bool ready; union { /* GICv4.0 implementations */ struct { @@ -104,6 +106,7 @@ enum its_vcpu_info_cmd_type { PROP_UPDATE_AND_INV_VLPI, SCHEDULE_VPE, 
DESCHEDULE_VPE, + COMMIT_VPE, INVALL_VPE, PROP_UPDATE_VSGI, }; @@ -129,6 +132,7 @@ int its_alloc_vcpu_irqs(struct its_vm *vm); void its_free_vcpu_irqs(struct its_vm *vm); int its_make_vpe_resident(struct its_vpe *vpe, bool g0en, bool g1en); int its_make_vpe_non_resident(struct its_vpe *vpe, bool db); +int its_commit_vpe(struct its_vpe *vpe); int its_invall_vpe(struct its_vpe *vpe); int its_map_vlpi(int irq, struct its_vlpi_map *map); int its_get_vlpi(int irq, struct its_vlpi_map *map); -- cgit From bb4c6910c8b41623104c2e64a30615682689a54d Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Thu, 26 Nov 2020 09:28:51 +0100 Subject: genirq/irqdomain: Add an irq_create_mapping_affinity() function There is currently no way to convey the affinity of an interrupt via irq_create_mapping(), which creates issues for devices that expect that affinity to be managed by the kernel. In order to sort this out, rename irq_create_mapping() to irq_create_mapping_affinity() with an additional affinity parameter that can be passed down to irq_domain_alloc_descs(). irq_create_mapping() is re-implemented as a wrapper around irq_create_mapping_affinity(). No functional change. Fixes: e75eafb9b039 ("genirq/msi: Switch to new irq spreading infrastructure") Signed-off-by: Laurent Vivier Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kurz Cc: Michael Ellerman Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201126082852.1178497-2-lvivier@redhat.com --- include/linux/irqdomain.h | 12 ++++++++++-- kernel/irq/irqdomain.c | 13 ++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 71535e87109f..ea5a337e0f8b 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -384,11 +384,19 @@ extern void irq_domain_associate_many(struct irq_domain *domain, extern void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq); -extern unsigned int irq_create_mapping(struct irq_domain *host, - irq_hw_number_t hwirq); +extern unsigned int irq_create_mapping_affinity(struct irq_domain *host, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity); extern unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); extern void irq_dispose_mapping(unsigned int virq); +static inline unsigned int irq_create_mapping(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + return irq_create_mapping_affinity(host, hwirq, NULL); +} + + /** * irq_linear_revmap() - Find a linux irq from a hw irq number. * @domain: domain owning this hardware interrupt diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf8b374b892d..e4ca69608f3b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -624,17 +624,19 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) EXPORT_SYMBOL_GPL(irq_create_direct_mapping); /** - * irq_create_mapping() - Map a hardware interrupt into linux irq space + * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space * @domain: domain owning this hardware interrupt or NULL for default domain * @hwirq: hardware irq number in that domain space + * @affinity: irq affinity * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. * If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. 
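A minimal sketch of a consumer, assuming a driver that wants one managed interrupt per queue pinned to a given CPU; the function name is made up, and struct irq_affinity_desc is the existing descriptor from linux/interrupt.h:

static unsigned int example_map_queue_irq(struct irq_domain *domain,
					  irq_hw_number_t hwirq, int cpu)
{
	struct irq_affinity_desc desc = { .is_managed = 1 };

	cpumask_copy(&desc.mask, cpumask_of(cpu));

	/* Returns 0 on failure, like irq_create_mapping() */
	return irq_create_mapping_affinity(domain, hwirq, &desc);
}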
*/ -unsigned int irq_create_mapping(struct irq_domain *domain, - irq_hw_number_t hwirq) +unsigned int irq_create_mapping_affinity(struct irq_domain *domain, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity) { struct device_node *of_node; int virq; @@ -660,7 +662,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), + affinity); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -676,7 +679,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return virq; } -EXPORT_SYMBOL_GPL(irq_create_mapping); +EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); /** * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs -- cgit From 63e2fffa59a9dd91e443b08832656399fd80b7f0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 15 Nov 2020 17:37:37 -0500 Subject: pNFS/flexfiles: Fix array overflow when flexfiles mirroring is enabled If the flexfiles mirroring is enabled, then the read code expects to be able to set pgio->pg_mirror_idx to point to the data server that is being used for this particular read. However it does not change the pg_mirror_count because we only need to send a single read. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/flexfilelayout/flexfilelayout.c | 27 +++++++++++++++++++------ fs/nfs/pagelist.c | 36 +++++++++++++++++++++++++--------- include/linux/nfs_page.h | 4 ++++ 3 files changed, 52 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index a163533446fa..24bf5797f88a 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -838,7 +838,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - u32 ds_idx, i; + u32 ds_idx; retry: ff_layout_pg_check_layout(pgio, req); @@ -864,11 +864,9 @@ retry: goto retry; } - for (i = 0; i < pgio->pg_mirror_count; i++) { - mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); - pgm = &pgio->pg_mirrors[i]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; - } + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); + pgm = &pgio->pg_mirrors[0]; + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; pgio->pg_mirror_idx = ds_idx; @@ -985,6 +983,21 @@ out: return 1; } +static u32 +ff_layout_pg_set_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) +{ + u32 old = desc->pg_mirror_idx; + + desc->pg_mirror_idx = idx; + return old; +} + +static struct nfs_pgio_mirror * +ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) +{ + return &desc->pg_mirrors[idx]; +} + static const struct nfs_pageio_ops ff_layout_pg_read_ops = { .pg_init = ff_layout_pg_init_read, .pg_test = pnfs_generic_pg_test, @@ -998,6 +1011,8 @@ static const struct nfs_pageio_ops ff_layout_pg_write_ops = { .pg_doio = pnfs_generic_pg_writepages, .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, .pg_cleanup = pnfs_generic_pg_cleanup, + .pg_get_mirror = ff_layout_pg_get_mirror_write, + .pg_set_mirror = ff_layout_pg_set_mirror_write, }; static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6985cacf4700..78c9c4bdef2b 100644 --- a/fs/nfs/pagelist.c 
+++ b/fs/nfs/pagelist.c @@ -31,13 +31,29 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; +static struct nfs_pgio_mirror * +nfs_pgio_get_mirror(struct nfs_pageio_descriptor *desc, u32 idx) +{ + if (desc->pg_ops->pg_get_mirror) + return desc->pg_ops->pg_get_mirror(desc, idx); + return &desc->pg_mirrors[0]; +} + struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) { - return &desc->pg_mirrors[desc->pg_mirror_idx]; + return nfs_pgio_get_mirror(desc, desc->pg_mirror_idx); } EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror); +static u32 +nfs_pgio_set_current_mirror(struct nfs_pageio_descriptor *desc, u32 idx) +{ + if (desc->pg_ops->pg_set_mirror) + return desc->pg_ops->pg_set_mirror(desc, idx); + return desc->pg_mirror_idx; +} + void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)) @@ -1259,7 +1275,7 @@ static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc) return; for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; + mirror = nfs_pgio_get_mirror(desc, midx); desc->pg_completion_ops->error_cleanup(&mirror->pg_list, desc->pg_error); } @@ -1293,12 +1309,12 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, goto out_failed; } - desc->pg_mirror_idx = midx; + nfs_pgio_set_current_mirror(desc, midx); if (!nfs_pageio_add_request_mirror(desc, dupreq)) goto out_cleanup_subreq; } - desc->pg_mirror_idx = 0; + nfs_pgio_set_current_mirror(desc, 0); if (!nfs_pageio_add_request_mirror(desc, req)) goto out_failed; @@ -1320,10 +1336,12 @@ out_failed: static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, u32 mirror_idx) { - struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx]; - u32 restore_idx = desc->pg_mirror_idx; + struct nfs_pgio_mirror *mirror; + u32 restore_idx; + + restore_idx = nfs_pgio_set_current_mirror(desc, mirror_idx); + mirror = nfs_pgio_current_mirror(desc); - desc->pg_mirror_idx = mirror_idx; for (;;) { nfs_pageio_doio(desc); if (desc->pg_error < 0 || !mirror->pg_recoalesce) @@ -1331,7 +1349,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, if (!nfs_do_recoalesce(desc)) break; } - desc->pg_mirror_idx = restore_idx; + nfs_pgio_set_current_mirror(desc, restore_idx); } /* @@ -1405,7 +1423,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) u32 midx; for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; + mirror = nfs_pgio_get_mirror(desc, midx); if (!list_empty(&mirror->pg_list)) { prev = nfs_list_entry(mirror->pg_list.prev); if (index != prev->wb_index + 1) { diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index c32c15216da3..f0373a6cb5fb 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -55,6 +55,7 @@ struct nfs_page { unsigned short wb_nio; /* Number of I/O attempts */ }; +struct nfs_pgio_mirror; struct nfs_pageio_descriptor; struct nfs_pageio_ops { void (*pg_init)(struct nfs_pageio_descriptor *, struct nfs_page *); @@ -64,6 +65,9 @@ struct nfs_pageio_ops { unsigned int (*pg_get_mirror_count)(struct nfs_pageio_descriptor *, struct nfs_page *); void (*pg_cleanup)(struct nfs_pageio_descriptor *); + struct nfs_pgio_mirror * + (*pg_get_mirror)(struct nfs_pageio_descriptor *, u32); + u32 (*pg_set_mirror)(struct nfs_pageio_descriptor *, u32); }; struct nfs_rw_ops { -- cgit From 
5a7e702670adc368caa1b64c2138956d8cba0d4f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 13 Mar 2020 10:42:09 -0400 Subject: SUNRPC: Adjust synopsis of xdr_buf_subsegment() Clean up: This enables xdr_buf_subsegment()'s callers to pass in a const pointer to that buffer. Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 3 ++- net/sunrpc/xdr.c | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 9548d075e06d..ec2a22ccdc2a 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -183,7 +183,8 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) */ extern void xdr_shift_buf(struct xdr_buf *, size_t); extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); -extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); +extern int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, + unsigned int base, unsigned int len); extern void xdr_buf_trim(struct xdr_buf *, unsigned int); extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 71e03b930b70..28f81769a27c 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1379,9 +1379,8 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov); * * Returns -1 if base of length are out of bounds. */ -int -xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, - unsigned int base, unsigned int len) +int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, + unsigned int base, unsigned int len) { subbuf->buflen = subbuf->len = len; if (base < buf->head[0].iov_len) { -- cgit From 03493bca084fdca48abc59b00e06ce733aa9eb7d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 10 Jun 2020 10:36:42 -0400 Subject: SUNRPC: Rename svc_encode_read_payload() Clean up: "result payload" is a less confusing name for these payloads. "READ payload" reflects only the NFS usage. 
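For most transports the renamed op is a formality. A sketch of the minimal implementation (the socket transports in the diff below wire up exactly this kind of stub):

static int example_xpo_result_payload(struct svc_rqst *rqstp,
				      unsigned int offset,
				      unsigned int length)
{
	/*
	 * Sockets transmit the payload in-line within rq_res, so there
	 * is nothing to record; RDMA instead notes the byte range so
	 * the payload can be conveyed by an RDMA Write.
	 */
	return 0;
}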
Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 2 +- include/linux/sunrpc/svc.h | 6 +++--- include/linux/sunrpc/svc_rdma.h | 4 ++-- include/linux/sunrpc/svc_xprt.h | 4 ++-- net/sunrpc/svc.c | 11 ++++++----- net/sunrpc/svcsock.c | 8 ++++---- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 8 ++++---- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 8 files changed, 23 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 833a2c64dfe8..7e24fb3ca36e 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3829,7 +3829,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, read->rd_length = maxcount; if (nfserr) return nfserr; - if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount)) + if (svc_encode_result_payload(resp->rqstp, starting_len + 8, maxcount)) return nfserr_io; xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount)); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 386628b36bc7..c220b734fa69 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -519,9 +519,9 @@ void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char * svc_print_addr(struct svc_rqst *, char *, size_t); -int svc_encode_read_payload(struct svc_rqst *rqstp, - unsigned int offset, - unsigned int length); +int svc_encode_result_payload(struct svc_rqst *rqstp, + unsigned int offset, + unsigned int length); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages, struct kvec *first, size_t total); diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 9dc3a3b88391..2b870a3f391b 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -207,8 +207,8 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *rctxt, int status); extern int svc_rdma_sendto(struct svc_rqst *); -extern int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset, - unsigned int length); +extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length); /* svc_rdma_transport.c */ extern struct svc_xprt_class svc_rdma_class; diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index aca35ab5cff2..92455e0d5244 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -21,8 +21,8 @@ struct svc_xprt_ops { int (*xpo_has_wspace)(struct svc_xprt *); int (*xpo_recvfrom)(struct svc_rqst *); int (*xpo_sendto)(struct svc_rqst *); - int (*xpo_read_payload)(struct svc_rqst *, unsigned int, - unsigned int); + int (*xpo_result_payload)(struct svc_rqst *, unsigned int, + unsigned int); void (*xpo_release_rqst)(struct svc_rqst *); void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c211b607239e..b41500645c3f 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1622,7 +1622,7 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) EXPORT_SYMBOL_GPL(svc_max_payload); /** - * svc_encode_read_payload - mark a range of bytes as a READ payload + * svc_encode_result_payload - mark a range of bytes as a result payload * @rqstp: svc_rqst to operate on * @offset: payload's byte offset in rqstp->rq_res * @length: size of payload, in bytes @@ -1630,12 +1630,13 @@ EXPORT_SYMBOL_GPL(svc_max_payload); * Returns zero on success, or a 
negative errno if a permanent * error occurred. */ -int svc_encode_read_payload(struct svc_rqst *rqstp, unsigned int offset, - unsigned int length) +int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { - return rqstp->rq_xprt->xpt_ops->xpo_read_payload(rqstp, offset, length); + return rqstp->rq_xprt->xpt_ops->xpo_result_payload(rqstp, offset, + length); } -EXPORT_SYMBOL_GPL(svc_encode_read_payload); +EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** * svc_fill_write_vector - Construct data argument for VFS write call diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c2752e2b9ce3..b248f2349437 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -181,8 +181,8 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) } } -static int svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int offset, - unsigned int length) +static int svc_sock_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { return 0; } @@ -635,7 +635,7 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, - .xpo_read_payload = svc_sock_read_payload, + .xpo_result_payload = svc_sock_result_payload, .xpo_release_rqst = svc_udp_release_rqst, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, @@ -1123,7 +1123,7 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, - .xpo_read_payload = svc_sock_read_payload, + .xpo_result_payload = svc_sock_result_payload, .xpo_release_rqst = svc_tcp_release_rqst, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index c3d588b149aa..c8411b4f3492 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -979,19 +979,19 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) } /** - * svc_rdma_read_payload - special processing for a READ payload + * svc_rdma_result_payload - special processing for a result payload * @rqstp: svc_rqst to operate on * @offset: payload's byte offset in @xdr * @length: size of payload, in bytes * * Returns zero on success. * - * For the moment, just record the xdr_buf location of the READ + * For the moment, just record the xdr_buf location of the result * payload. svc_rdma_sendto will use that location later when * we actually send the payload. 
*/ -int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset, - unsigned int length) +int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index fb044792b571..afba4e9d5425 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -80,7 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, - .xpo_read_payload = svc_rdma_read_payload, + .xpo_result_payload = svc_rdma_result_payload, .xpo_release_rqst = svc_rdma_release_rqst, .xpo_detach = svc_rdma_detach, .xpo_free = svc_rdma_free, -- cgit From f6ad77590a5d432589a5d8a211c4e8e50cd8bb63 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 13 Mar 2020 10:42:10 -0400 Subject: svcrdma: Post RDMA Writes while XDR encoding replies The only RPC/RDMA ordering requirement between RDMA Writes and RDMA Sends is that the responder must post the Writes on the Send queue before posting the Send that conveys the RPC Reply for that Write payload. The Linux NFS server implementation now has a transport method that can post result Payload Writes earlier than svc_rdma_sendto: ->xpo_result_payload() This gets RDMA Writes going earlier so they are more likely to be complete at the remote end before the Send completes. Some care must be taken with pulled-up Replies. We don't want to push the Write chunk and then send the same payload data via Send. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 4 +-- net/sunrpc/xprtrdma/svc_rdma_rw.c | 52 ++++++++++++++++++++++++------ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 60 ++++++++++++++++++++--------------- 3 files changed, 77 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 2b870a3f391b..f5a3c852bb90 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -183,9 +183,7 @@ extern int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head, __be32 *p); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - __be32 *wr_ch, struct xdr_buf *xdr, - unsigned int offset, - unsigned long length); + __be32 *wr_ch, const struct xdr_buf *xdr); extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, struct xdr_buf *xdr); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index a75f8a133bef..f2ed1bf50251 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -538,13 +538,49 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info, length); } +/** + * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf + * @xdr: xdr_buf to write + * @info: pointer to write arguments + * + * Returns: + * On succes, returns zero + * %-E2BIG if the client-provided Write chunk is too small + * %-ENOMEM if a resource has been exhausted + * %-EIO if an rdma-rw error occurred + */ +static int svc_rdma_xb_write(const struct xdr_buf *xdr, + struct svc_rdma_write_info *info) +{ + int ret; + + if (xdr->head[0].iov_len) { + ret = svc_rdma_iov_write(info, &xdr->head[0]); + if (ret < 0) + return ret; + } + + if (xdr->page_len) { + ret = svc_rdma_pages_write(info, xdr, 
xdr->head[0].iov_len, + xdr->page_len); + if (ret < 0) + return ret; + } + + if (xdr->tail[0].iov_len) { + ret = svc_rdma_iov_write(info, &xdr->tail[0]); + if (ret < 0) + return ret; + } + + return xdr->len; +} + /** * svc_rdma_send_write_chunk - Write all segments in a Write chunk * @rdma: controlling RDMA transport * @wr_ch: Write chunk provided by client * @xdr: xdr_buf containing the data payload - * @offset: payload's byte offset in @xdr - * @length: size of payload, in bytes * * Returns a non-negative number of bytes the chunk consumed, or * %-E2BIG if the payload was larger than the Write chunk, @@ -554,21 +590,17 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info, * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, - struct xdr_buf *xdr, - unsigned int offset, unsigned long length) + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; int ret; - if (!length) - return 0; - info = svc_rdma_write_info_alloc(rdma, wr_ch); if (!info) return -ENOMEM; - ret = svc_rdma_pages_write(info, xdr, offset, length); - if (ret < 0) + ret = svc_rdma_xb_write(xdr, info); + if (ret != xdr->len) goto out_err; ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); @@ -576,7 +608,7 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, goto out_err; trace_svcrdma_send_write_chunk(xdr->page_len); - return length; + return xdr->len; out_err: svc_rdma_write_info_free(info); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index d6436c13d5c4..e8b0d030e1e6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -468,11 +468,14 @@ svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt, { ssize_t len, ret; - ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, - rctxt->rc_read_payload_length); - if (ret < 0) - return ret; - len = ret; + len = 0; + if (rctxt->rc_write_list) { + ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, + rctxt->rc_read_payload_length); + if (ret < 0) + return ret; + len = ret; + } /* Terminate the Write list */ ret = xdr_stream_encode_item_absent(&sctxt->sc_stream); @@ -556,11 +559,13 @@ static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, struct xdr_buf *xdr) { + bool write_chunk_present = rctxt && rctxt->rc_write_list; int elements; /* For small messages, copying bytes is cheaper than DMA mapping. 
*/ - if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH) + if (!write_chunk_present && + sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH) return true; /* Check whether the xdr_buf has more elements than can @@ -893,9 +898,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) container_of(xprt, struct svcxprt_rdma, sc_xprt); struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; __be32 *rdma_argp = rctxt->rc_recv_buf; - __be32 *wr_lst = rctxt->rc_write_list; __be32 *rp_ch = rctxt->rc_reply_chunk; - struct xdr_buf *xdr = &rqstp->rq_res; struct svc_rdma_send_ctxt *sctxt; __be32 *p; int ret; @@ -920,19 +923,8 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (svc_rdma_encode_read_list(sctxt) < 0) goto err0; - if (wr_lst) { - /* XXX: Presume the client sent only one Write chunk */ - ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, - rctxt->rc_read_payload_offset, - rctxt->rc_read_payload_length); - if (ret < 0) - goto err2; - if (svc_rdma_encode_write_list(rctxt, sctxt) < 0) - goto err0; - } else { - if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0) - goto err0; - } + if (svc_rdma_encode_write_list(rctxt, sctxt) < 0) + goto err0; if (rp_ch) { ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); if (ret < 0) @@ -974,16 +966,25 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) * @offset: payload's byte offset in @xdr * @length: size of payload, in bytes * - * Returns zero on success. - * - * For the moment, just record the xdr_buf location of the result - * payload. svc_rdma_sendto will use that location later when - * we actually send the payload. + * Return values: + * %0 if successful or nothing needed to be done + * %-EMSGSIZE on XDR buffer overflow + * %-E2BIG if the payload was larger than the Write chunk + * %-EINVAL if client provided too many segments + * %-ENOMEM if rdma_rw context pool was exhausted + * %-ENOTCONN if posting failed (connection is lost) + * %-EIO if rdma_rw initialization failed (DMA mapping, etc) */ int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length) { struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; + struct svcxprt_rdma *rdma; + struct xdr_buf subbuf; + int ret; + + if (!rctxt->rc_write_list || !length) + return 0; /* XXX: Just one READ payload slot for now, since our * transport implementation currently supports only one @@ -992,5 +993,12 @@ int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, rctxt->rc_read_payload_offset = offset; rctxt->rc_read_payload_length = length; + if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length)) + return -EMSGSIZE; + + rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); + ret = svc_rdma_send_write_chunk(rdma, rctxt->rc_write_list, &subbuf); + if (ret < 0) + return ret; return 0; } -- cgit From 78147ca8b4a9b6cf0e597ddd6bf17959e08376c2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 22 Jun 2020 10:15:41 -0400 Subject: svcrdma: Add a "parsed chunk list" data structure This simple data structure binds the location of each data payload inside of an RPC message to the chunk that will be used to push it to or pull it from the client. There are several benefits to this small additional overhead: * It enables support for more than one chunk in incoming Read and Write lists. * It translates the version-specific on-the-wire format into a generic in-memory structure, enabling support for multiple versions of the RPC/RDMA transport protocol. 
* It enables the server to re-organize a chunk list if it needs to adjust where Read chunk data lands in server memory without altering the contents of the XDR-encoded Receive buffer. Construction of these lists is done while sanity checking each incoming RPC/RDMA header. Subsequent patches will make use of the generated data structures. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 12 ++ include/linux/sunrpc/svc_rdma_pcl.h | 128 +++++++++++++ include/trace/events/rpcrdma.h | 75 +++++++- net/sunrpc/xprtrdma/Makefile | 2 +- net/sunrpc/xprtrdma/svc_rdma_pcl.c | 306 ++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 196 +++++++++++--------- 6 files changed, 635 insertions(+), 84 deletions(-) create mode 100644 include/linux/sunrpc/svc_rdma_pcl.h create mode 100644 net/sunrpc/xprtrdma/svc_rdma_pcl.c (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f5a3c852bb90..a89d4209fe2a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -47,6 +47,8 @@ #include #include #include +#include + #include #include @@ -142,8 +144,18 @@ struct svc_rdma_recv_ctxt { unsigned int rc_page_count; unsigned int rc_hdr_count; u32 rc_inv_rkey; + + struct svc_rdma_pcl rc_call_pcl; + + struct svc_rdma_pcl rc_read_pcl; + __be32 *rc_write_list; + struct svc_rdma_chunk *rc_cur_result_payload; + struct svc_rdma_pcl rc_write_pcl; + __be32 *rc_reply_chunk; + struct svc_rdma_pcl rc_reply_pcl; + unsigned int rc_read_payload_offset; unsigned int rc_read_payload_length; struct page *rc_pages[RPCSVC_MAXPAGES]; diff --git a/include/linux/sunrpc/svc_rdma_pcl.h b/include/linux/sunrpc/svc_rdma_pcl.h new file mode 100644 index 000000000000..7516ad0fae80 --- /dev/null +++ b/include/linux/sunrpc/svc_rdma_pcl.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Oracle and/or its affiliates + */ + +#ifndef SVC_RDMA_PCL_H +#define SVC_RDMA_PCL_H + +#include + +struct svc_rdma_segment { + u32 rs_handle; + u32 rs_length; + u64 rs_offset; +}; + +struct svc_rdma_chunk { + struct list_head ch_list; + + u32 ch_position; + u32 ch_length; + u32 ch_payload_length; + + u32 ch_segcount; + struct svc_rdma_segment ch_segments[]; +}; + +struct svc_rdma_pcl { + unsigned int cl_count; + struct list_head cl_chunks; +}; + +/** + * pcl_init - Initialize a parsed chunk list + * @pcl: parsed chunk list to initialize + * + */ +static inline void pcl_init(struct svc_rdma_pcl *pcl) +{ + INIT_LIST_HEAD(&pcl->cl_chunks); +} + +/** + * pcl_is_empty - Return true if parsed chunk list is empty + * @pcl: parsed chunk list + * + */ +static inline bool pcl_is_empty(const struct svc_rdma_pcl *pcl) +{ + return list_empty(&pcl->cl_chunks); +} + +/** + * pcl_first_chunk - Return first chunk in a parsed chunk list + * @pcl: parsed chunk list + * + * Returns the first chunk in the list, or NULL if the list is empty. + */ +static inline struct svc_rdma_chunk * +pcl_first_chunk(const struct svc_rdma_pcl *pcl) +{ + if (pcl_is_empty(pcl)) + return NULL; + return list_first_entry(&pcl->cl_chunks, struct svc_rdma_chunk, + ch_list); +} + +/** + * pcl_next_chunk - Return next chunk in a parsed chunk list + * @pcl: a parsed chunk list + * @chunk: chunk in @pcl + * + * Returns the next chunk in the list, or NULL if @chunk is already last. 
+ */ +static inline struct svc_rdma_chunk * +pcl_next_chunk(const struct svc_rdma_pcl *pcl, struct svc_rdma_chunk *chunk) +{ + if (list_is_last(&chunk->ch_list, &pcl->cl_chunks)) + return NULL; + return list_next_entry(chunk, ch_list); +} + +/** + * pcl_for_each_chunk - Iterate over chunks in a parsed chunk list + * @pos: the loop cursor + * @pcl: a parsed chunk list + */ +#define pcl_for_each_chunk(pos, pcl) \ + for (pos = list_first_entry(&(pcl)->cl_chunks, struct svc_rdma_chunk, ch_list); \ + &pos->ch_list != &(pcl)->cl_chunks; \ + pos = list_next_entry(pos, ch_list)) + +/** + * pcl_for_each_segment - Iterate over segments in a parsed chunk + * @pos: the loop cursor + * @chunk: a parsed chunk + */ +#define pcl_for_each_segment(pos, chunk) \ + for (pos = &(chunk)->ch_segments[0]; \ + pos <= &(chunk)->ch_segments[(chunk)->ch_segcount - 1]; \ + pos++) + +/** + * pcl_chunk_end_offset - Return offset of byte range following @chunk + * @chunk: chunk in @pcl + * + * Returns starting offset of the region just after @chunk + */ +static inline unsigned int +pcl_chunk_end_offset(const struct svc_rdma_chunk *chunk) +{ + return xdr_align_size(chunk->ch_position + chunk->ch_payload_length); +} + +struct svc_rdma_recv_ctxt; + +extern void pcl_free(struct svc_rdma_pcl *pcl); +extern bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p); +extern bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p); +extern bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_pcl *pcl, __be32 *p); +extern int pcl_process_nonpayloads(const struct svc_rdma_pcl *pcl, + const struct xdr_buf *xdr, + int (*actor)(const struct xdr_buf *, + void *), + void *data); + +#endif /* SVC_RDMA_PCL_H */ diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index bf1065772228..72b941aef43b 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1446,12 +1446,83 @@ DECLARE_EVENT_CLASS(svcrdma_segment_event, ), \ TP_ARGS(handle, length, offset)) -DEFINE_SEGMENT_EVENT(decode_wseg); -DEFINE_SEGMENT_EVENT(encode_rseg); DEFINE_SEGMENT_EVENT(send_rseg); DEFINE_SEGMENT_EVENT(encode_wseg); DEFINE_SEGMENT_EVENT(send_wseg); +TRACE_EVENT(svcrdma_decode_rseg, + TP_PROTO( + const struct rpc_rdma_cid *cid, + const struct svc_rdma_chunk *chunk, + const struct svc_rdma_segment *segment + ), + + TP_ARGS(cid, chunk, segment), + + TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) + __field(u32, segno) + __field(u32, position) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + ), + + TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; + __entry->segno = chunk->ch_segcount; + __entry->position = chunk->ch_position; + __entry->handle = segment->rs_handle; + __entry->length = segment->rs_length; + __entry->offset = segment->rs_offset; + ), + + TP_printk("cq_id=%u cid=%d segno=%u position=%u %u@0x%016llx:0x%08x", + __entry->cq_id, __entry->completion_id, + __entry->segno, __entry->position, __entry->length, + (unsigned long long)__entry->offset, __entry->handle + ) +); + +TRACE_EVENT(svcrdma_decode_wseg, + TP_PROTO( + const struct rpc_rdma_cid *cid, + const struct svc_rdma_chunk *chunk, + u32 segno + ), + + TP_ARGS(cid, chunk, segno), + + TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) + __field(u32, segno) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + ), + + TP_fast_assign( + const struct svc_rdma_segment *segment = + 
&chunk->ch_segments[segno]; + + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; + __entry->segno = segno; + __entry->handle = segment->rs_handle; + __entry->length = segment->rs_length; + __entry->offset = segment->rs_offset; + ), + + TP_printk("cq_id=%u cid=%d segno=%u %u@0x%016llx:0x%08x", + __entry->cq_id, __entry->completion_id, + __entry->segno, __entry->length, + (unsigned long long)__entry->offset, __entry->handle + ) +); + DECLARE_EVENT_CLASS(svcrdma_chunk_event, TP_PROTO( u32 length diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 8ed0377d7a18..55b21bae866d 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ - module.o + svc_rdma_pcl.o module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c new file mode 100644 index 000000000000..b63cfeaa2923 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Oracle. All rights reserved. + */ + +#include +#include + +#include "xprt_rdma.h" +#include + +/** + * pcl_free - Release all memory associated with a parsed chunk list + * @pcl: parsed chunk list + * + */ +void pcl_free(struct svc_rdma_pcl *pcl) +{ + while (!list_empty(&pcl->cl_chunks)) { + struct svc_rdma_chunk *chunk; + + chunk = pcl_first_chunk(pcl); + list_del(&chunk->ch_list); + kfree(chunk); + } +} + +static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position) +{ + struct svc_rdma_chunk *chunk; + + chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->ch_position = position; + chunk->ch_length = 0; + chunk->ch_payload_length = 0; + chunk->ch_segcount = 0; + return chunk; +} + +static struct svc_rdma_chunk * +pcl_lookup_position(struct svc_rdma_pcl *pcl, u32 position) +{ + struct svc_rdma_chunk *pos; + + pcl_for_each_chunk(pos, pcl) { + if (pos->ch_position == position) + return pos; + } + return NULL; +} + +static void pcl_insert_position(struct svc_rdma_pcl *pcl, + struct svc_rdma_chunk *chunk) +{ + struct svc_rdma_chunk *pos; + + pcl_for_each_chunk(pos, pcl) { + if (pos->ch_position > chunk->ch_position) + break; + } + __list_add(&chunk->ch_list, pos->ch_list.prev, &pos->ch_list); + pcl->cl_count++; +} + +static void pcl_set_read_segment(const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_chunk *chunk, + u32 handle, u32 length, u64 offset) +{ + struct svc_rdma_segment *segment; + + segment = &chunk->ch_segments[chunk->ch_segcount]; + segment->rs_handle = handle; + segment->rs_length = length; + segment->rs_offset = offset; + + trace_svcrdma_decode_rseg(&rctxt->rc_cid, chunk, segment); + + chunk->ch_length += length; + chunk->ch_segcount++; +} + +/** + * pcl_alloc_call - Construct a parsed chunk list for the Call body + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Assumptions: + * - The incoming Read list has already been sanity checked. + * - cl_count is already set to the number of segments in + * the un-decoded list. + * - The list might not be in order by position. 
+ * + * Return values: + * %true: Parsed chunk list was successfully constructed, and + * cl_count is updated to be the number of chunks (ie. + * unique positions) in the Read list. + * %false: Memory allocation failed. + */ +bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + struct svc_rdma_pcl *pcl = &rctxt->rc_call_pcl; + unsigned int i, segcount = pcl->cl_count; + + pcl->cl_count = 0; + for (i = 0; i < segcount; i++) { + struct svc_rdma_chunk *chunk; + u32 position, handle, length; + u64 offset; + + p++; /* skip the list discriminator */ + p = xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position != 0) + continue; + + if (pcl_is_empty(pcl)) { + chunk = pcl_alloc_chunk(segcount, position); + if (!chunk) + return false; + pcl_insert_position(pcl, chunk); + } else { + chunk = list_first_entry(&pcl->cl_chunks, + struct svc_rdma_chunk, + ch_list); + } + + pcl_set_read_segment(rctxt, chunk, handle, length, offset); + } + + return true; +} + +/** + * pcl_alloc_read - Construct a parsed chunk list for normal Read chunks + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Assumptions: + * - The incoming Read list has already been sanity checked. + * - cl_count is already set to the number of segments in + * the un-decoded list. + * - The list might not be in order by position. + * + * Return values: + * %true: Parsed chunk list was successfully constructed, and + * cl_count is updated to be the number of chunks (ie. + * unique position values) in the Read list. + * %false: Memory allocation failed. + * + * TODO: + * - Check for chunk range overlaps + */ +bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + struct svc_rdma_pcl *pcl = &rctxt->rc_read_pcl; + unsigned int i, segcount = pcl->cl_count; + + pcl->cl_count = 0; + for (i = 0; i < segcount; i++) { + struct svc_rdma_chunk *chunk; + u32 position, handle, length; + u64 offset; + + p++; /* skip the list discriminator */ + p = xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position == 0) + continue; + + chunk = pcl_lookup_position(pcl, position); + if (!chunk) { + chunk = pcl_alloc_chunk(segcount, position); + if (!chunk) + return false; + pcl_insert_position(pcl, chunk); + } + + pcl_set_read_segment(rctxt, chunk, handle, length, offset); + } + + return true; +} + +/** + * pcl_alloc_write - Construct a parsed chunk list from a Write list + * @rctxt: Ingress receive context + * @pcl: Parsed chunk list to populate + * @p: Start of an un-decoded Write list + * + * Assumptions: + * - The incoming Write list has already been sanity checked, and + * - cl_count is set to the number of chunks in the un-decoded list. + * + * Return values: + * %true: Parsed chunk list was successfully constructed. + * %false: Memory allocation failed. 
+ */ +bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_pcl *pcl, __be32 *p) +{ + struct svc_rdma_segment *segment; + struct svc_rdma_chunk *chunk; + unsigned int i, j; + u32 segcount; + + for (i = 0; i < pcl->cl_count; i++) { + p++; /* skip the list discriminator */ + segcount = be32_to_cpup(p++); + + chunk = pcl_alloc_chunk(segcount, 0); + if (!chunk) + return false; + list_add_tail(&chunk->ch_list, &pcl->cl_chunks); + + for (j = 0; j < segcount; j++) { + segment = &chunk->ch_segments[j]; + p = xdr_decode_rdma_segment(p, &segment->rs_handle, + &segment->rs_length, + &segment->rs_offset); + trace_svcrdma_decode_wseg(&rctxt->rc_cid, chunk, j); + + chunk->ch_length += segment->rs_length; + chunk->ch_segcount++; + } + } + return true; +} + +static int pcl_process_region(const struct xdr_buf *xdr, + unsigned int offset, unsigned int length, + int (*actor)(const struct xdr_buf *, void *), + void *data) +{ + struct xdr_buf subbuf; + + if (!length) + return 0; + if (xdr_buf_subsegment(xdr, &subbuf, offset, length)) + return -EMSGSIZE; + return actor(&subbuf, data); +} + +/** + * pcl_process_nonpayloads - Process non-payload regions inside @xdr + * @pcl: Chunk list to process + * @xdr: xdr_buf to process + * @actor: Function to invoke on each non-payload region + * @data: Arguments for @actor + * + * This mechanism must ignore not only result payloads that were already + * sent via RDMA Write, but also XDR padding for those payloads that + * the upper layer has added. + * + * Assumptions: + * The xdr->len and ch_position fields are aligned to 4-byte multiples. + * + * Returns: + * On success, zero, + * %-EMSGSIZE on XDR buffer overflow, or + * The return value of @actor + */ +int pcl_process_nonpayloads(const struct svc_rdma_pcl *pcl, + const struct xdr_buf *xdr, + int (*actor)(const struct xdr_buf *, void *), + void *data) +{ + struct svc_rdma_chunk *chunk, *next; + unsigned int start; + int ret; + + chunk = pcl_first_chunk(pcl); + + /* No result payloads were generated */ + if (!chunk || !chunk->ch_payload_length) + return actor(xdr, data); + + /* Process the region before the first result payload */ + ret = pcl_process_region(xdr, 0, chunk->ch_position, actor, data); + if (ret < 0) + return ret; + + /* Process the regions between each middle result payload */ + while ((next = pcl_next_chunk(pcl, chunk))) { + if (!next->ch_payload_length) + break; + + start = pcl_chunk_end_offset(chunk); + ret = pcl_process_region(xdr, start, next->ch_position - start, + actor, data); + if (ret < 0) + return ret; + + chunk = next; + } + + /* Process the region after the last result payload */ + start = pcl_chunk_end_offset(chunk); + ret = pcl_process_region(xdr, start, xdr->len - start, actor, data); + if (ret < 0) + return ret; + + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index c6ea2903c21a..ec9d259b149c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -93,6 +93,7 @@ * (see rdma_read_complete() below). 
*/ +#include #include #include #include @@ -143,6 +144,10 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) goto fail2; svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid); + pcl_init(&ctxt->rc_call_pcl); + pcl_init(&ctxt->rc_read_pcl); + pcl_init(&ctxt->rc_write_pcl); + pcl_init(&ctxt->rc_reply_pcl); ctxt->rc_recv_wr.next = NULL; ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; @@ -226,6 +231,11 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, for (i = 0; i < ctxt->rc_page_count; i++) put_page(ctxt->rc_pages[i]); + pcl_free(&ctxt->rc_call_pcl); + pcl_free(&ctxt->rc_read_pcl); + pcl_free(&ctxt->rc_write_pcl); + pcl_free(&ctxt->rc_reply_pcl); + if (!ctxt->rc_temp) llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); else @@ -385,100 +395,123 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, arg->len = ctxt->rc_byte_len; } -/* This accommodates the largest possible Write chunk. - */ -#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) - -/* This accommodates the largest possible Position-Zero - * Read chunk or Reply chunk. - */ -#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) - -/* Sanity check the Read list. +/** + * xdr_count_read_segments - Count number of Read segments in Read list + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list * - * Implementation limits: - * - This implementation supports only one Read chunk. + * Before allocating anything, ensure the ingress Read list is safe + * to use. * - * Sanity checks: - * - Read list does not overflow Receive buffer. - * - Segment size limited by largest NFS data payload. - * - * The segment count is limited to how many segments can - * fit in the transport header without overflowing the - * buffer. That's about 40 Read segments for a 1KB inline - * threshold. + * The segment count is limited to how many segments can fit in the + * transport header without overflowing the buffer. That's about 40 + * Read segments for a 1KB inline threshold. * * Return values: - * %true: Read list is valid. @rctxt's xdr_stream is updated - * to point to the first byte past the Read list. - * %false: Read list is corrupt. @rctxt's xdr_stream is left - * in an unknown state. + * %true: Read list is valid. @rctxt's xdr_stream is updated to point + * to the first byte past the Read list. rc_read_pcl and + * rc_call_pcl cl_count fields are set to the number of + * Read segments in the list. + * %false: Read list is corrupt. @rctxt's xdr_stream is left in an + * unknown state. 
*/ -static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) +static bool xdr_count_read_segments(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) { - u32 position, len; - bool first; - __be32 *p; - - p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); - if (!p) - return false; - - len = 0; - first = true; + rctxt->rc_call_pcl.cl_count = 0; + rctxt->rc_read_pcl.cl_count = 0; while (xdr_item_is_present(p)) { + u32 position, handle, length; + u64 offset; + p = xdr_inline_decode(&rctxt->rc_stream, rpcrdma_readseg_maxsz * sizeof(*p)); if (!p) return false; - if (first) { - position = be32_to_cpup(p); - first = false; - } else if (be32_to_cpup(p) != position) { - return false; + xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position) { + if (position & 3) + return false; + ++rctxt->rc_read_pcl.cl_count; + } else { + ++rctxt->rc_call_pcl.cl_count; } - p += 2; - len += be32_to_cpup(p); p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; } - return len <= MAX_BYTES_SPECIAL_CHUNK; + return true; } -/* The segment count is limited to how many segments can - * fit in the transport header without overflowing the - * buffer. That's about 60 Write segments for a 1KB inline - * threshold. +/* Sanity check the Read list. + * + * Sanity checks: + * - Read list does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Read list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Read list. + * %false: Read list is corrupt. @rctxt's xdr_stream is left + * in an unknown state. */ -static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen) +static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 i, segcount, total; __be32 *p; p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; - segcount = be32_to_cpup(p); + if (!xdr_count_read_segments(rctxt, p)) + return false; + if (!pcl_alloc_call(rctxt, p)) + return false; + return pcl_alloc_read(rctxt, p); +} - total = 0; - for (i = 0; i < segcount; i++) { - u32 handle, length; - u64 offset; +static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) +{ + u32 segcount; + __be32 *p; - p = xdr_inline_decode(&rctxt->rc_stream, - rpcrdma_segment_maxsz * sizeof(*p)); - if (!p) - return false; + if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount)) + return false; - xdr_decode_rdma_segment(p, &handle, &length, &offset); - trace_svcrdma_decode_wseg(handle, length, offset); + /* A bogus segcount causes this buffer overflow check to fail. */ + p = xdr_inline_decode(&rctxt->rc_stream, + segcount * rpcrdma_segment_maxsz * sizeof(*p)); + return p != NULL; +} - total += length; +/** + * xdr_count_write_chunks - Count number of Write chunks in Write list + * @rctxt: Received header and decoding state + * @p: start of an un-decoded Write list + * + * Before allocating anything, ensure the ingress Write list is + * safe to use. + * + * Return values: + * %true: Write list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Write list, and + * the number of Write chunks is in rc_write_pcl.cl_count. + * %false: Write list is corrupt. @rctxt's xdr_stream is left + * in an indeterminate state. 
+ */ +static bool xdr_count_write_chunks(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + rctxt->rc_write_pcl.cl_count = 0; + while (xdr_item_is_present(p)) { + if (!xdr_check_write_chunk(rctxt)) + return false; + ++rctxt->rc_write_pcl.cl_count; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; } - return total <= maxlen; + return true; } /* Sanity check the Write list. @@ -498,24 +531,22 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen) */ static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 chcount = 0; __be32 *p; p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; - rctxt->rc_write_list = p; - while (xdr_item_is_present(p)) { - if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK)) - return false; - ++chcount; - p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); - if (!p) - return false; - } - if (!chcount) - rctxt->rc_write_list = NULL; - return chcount < 2; + + rctxt->rc_write_list = NULL; + if (!xdr_count_write_chunks(rctxt, p)) + return false; + if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p)) + return false; + + if (!pcl_is_empty(&rctxt->rc_write_pcl)) + rctxt->rc_write_list = p; + rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl); + return rctxt->rc_write_pcl.cl_count < 2; } /* Sanity check the Reply chunk. @@ -537,13 +568,16 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; + rctxt->rc_reply_chunk = NULL; - if (xdr_item_is_present(p)) { - if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK)) - return false; - rctxt->rc_reply_chunk = p; - } - return true; + if (!xdr_item_is_present(p)) + return true; + if (!xdr_check_write_chunk(rctxt)) + return false; + + rctxt->rc_reply_chunk = p; + rctxt->rc_reply_pcl.cl_count = 1; + return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p); } /* RPC-over-RDMA Version One private extension: Remote Invalidation. -- cgit From 58b2e0fefa891c99f297120c8c062a35005dc562 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 22 Mar 2020 13:06:55 -0400 Subject: svcrdma: Use parsed chunk lists to detect reverse direction replies Refactor: Don't duplicate header decoding smarts here. Instead, use the new parsed chunk lists. Note that the XID sanity test is also removed. The XID is already looked up by the cb handler, and is rejected if it's not recognized. 
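With the chunk lists already parsed, the reverse-direction test no longer needs to walk raw header words. A condensed, illustrative sketch of the new predicate (the committed svc_rdma_is_reverse_direction_reply() in the diff below additionally checks xpt_bc_xprt and the RPC call direction word; the short name here is only for illustration):

    /* A reverse-direction reply is an RDMA_MSG that carries no chunks. */
    static bool is_reverse_reply(const struct svc_rdma_recv_ctxt *rctxt)
    {
            return rctxt->rc_msgtype == rdma_msg &&
                   pcl_is_empty(&rctxt->rc_call_pcl) &&
                   pcl_is_empty(&rctxt->rc_read_pcl) &&
                   pcl_is_empty(&rctxt->rc_write_pcl) &&
                   pcl_is_empty(&rctxt->rc_reply_pcl);
    }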
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 29 ++++++++++++++--------------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index a89d4209fe2a..74247a33b6c6 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -144,6 +144,7 @@ struct svc_rdma_recv_ctxt { unsigned int rc_page_count; unsigned int rc_hdr_count; u32 rc_inv_rkey; + __be32 rc_msgtype; struct svc_rdma_pcl rc_call_pcl; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2755ca178b09..72b07e8aa3c9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -668,7 +668,8 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg, if (*p != rpcrdma_version) goto out_version; p += 2; - switch (*p) { + rctxt->rc_msgtype = *p; + switch (rctxt->rc_msgtype) { case rdma_msg: break; case rdma_nomsg: @@ -762,30 +763,28 @@ static void svc_rdma_send_error(struct svcxprt_rdma *rdma, * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. */ -static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, - __be32 *rdma_resp) +static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, + struct svc_rdma_recv_ctxt *rctxt) { - __be32 *p; + __be32 *p = rctxt->rc_recv_buf; if (!xprt->xpt_bc_xprt) return false; - p = rdma_resp + 3; - if (*p++ != rdma_msg) + if (rctxt->rc_msgtype != rdma_msg) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_call_pcl)) + return false; + if (!pcl_is_empty(&rctxt->rc_read_pcl)) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_write_pcl)) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) return false; - /* XID sanity */ - if (*p++ != *rdma_resp) - return false; - /* call direction */ - if (*p == cpu_to_be32(RPC_CALL)) + /* RPC call direction */ + if (*(p + 8) == cpu_to_be32(RPC_CALL)) return false; return true; @@ -868,7 +867,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_drop; rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, p)) + if (svc_rdma_is_reverse_direction_reply(xprt, ctxt)) goto out_backchannel; svc_rdma_get_inv_rkey(rdma_xprt, ctxt); -- cgit From 7a1cbfa18059a40d4752dab057384c3ca2de326c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 17 Jun 2020 11:07:00 -0400 Subject: svcrdma: Use parsed chunk lists to construct RDMA Writes Refactor: Instead of re-parsing the ingress RPC Call transport header when constructing RDMA Writes, use the new parsed chunk lists for the Write list and Reply chunk, which are version-agnostic and already XDR-decoded. 
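Because the Write list is decoded once into struct svc_rdma_chunk and struct svc_rdma_segment, the RDMA Write builder can index segments directly in host byte order instead of calling xdr_decode_rdma_segment() on every pass. An illustrative fragment of the resulting access pattern (field names as in the diff below):

    /* Illustrative only: look up the current segment of the Write chunk. */
    const struct svc_rdma_segment *seg =
            &info->wi_chunk->ch_segments[info->wi_seg_no];
    unsigned int write_len = min(remaining, seg->rs_length - info->wi_seg_off);
    u64 offset = seg->rs_offset + info->wi_seg_off;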
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 5 ++-- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 1 - net/sunrpc/xprtrdma/svc_rdma_rw.c | 47 ++++++++++++++++----------------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 22 ++++++++------- 4 files changed, 38 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 74247a33b6c6..d9148787efff 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -157,8 +157,6 @@ struct svc_rdma_recv_ctxt { __be32 *rc_reply_chunk; struct svc_rdma_pcl rc_reply_pcl; - unsigned int rc_read_payload_offset; - unsigned int rc_read_payload_length; struct page *rc_pages[RPCSVC_MAXPAGES]; }; @@ -196,7 +194,8 @@ extern int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head, __be32 *p); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - __be32 *wr_ch, const struct xdr_buf *xdr); + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr); extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, struct xdr_buf *xdr); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 72b07e8aa3c9..7d44e9d2e7a3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -207,7 +207,6 @@ svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) out: ctxt->rc_page_count = 0; - ctxt->rc_read_payload_length = 0; return ctxt; out_empty: diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index e6050230b49f..ed7def7b801b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -190,11 +190,11 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, * - Stores arguments for the SGL constructor functions */ struct svc_rdma_write_info { + const struct svc_rdma_chunk *wi_chunk; + /* write state of this chunk */ unsigned int wi_seg_off; unsigned int wi_seg_no; - unsigned int wi_nsegs; - __be32 *wi_segs; /* SGL constructor arguments */ const struct xdr_buf *wi_xdr; @@ -205,7 +205,8 @@ struct svc_rdma_write_info { }; static struct svc_rdma_write_info * -svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) +svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk) { struct svc_rdma_write_info *info; @@ -213,10 +214,9 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) if (!info) return info; + info->wi_chunk = chunk; info->wi_seg_off = 0; info->wi_seg_no = 0; - info->wi_nsegs = be32_to_cpup(++chunk); - info->wi_segs = ++chunk; svc_rdma_cc_init(rdma, &info->wi_cc); info->wi_cc.cc_cqe.done = svc_rdma_write_done; return info; @@ -443,23 +443,19 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, { struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; struct svcxprt_rdma *rdma = cc->cc_rdma; + const struct svc_rdma_segment *seg; struct svc_rdma_rw_ctxt *ctxt; - __be32 *seg; int ret; - seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; do { unsigned int write_len; - u32 handle, length; u64 offset; - if (info->wi_seg_no >= info->wi_nsegs) + seg = &info->wi_chunk->ch_segments[info->wi_seg_no]; + if (!seg) goto out_overflow; - xdr_decode_rdma_segment(seg, &handle, &length, &offset); - offset += info->wi_seg_off; - - write_len = min(remaining, length - info->wi_seg_off); + write_len = min(remaining, seg->rs_length - 
info->wi_seg_off); if (!write_len) goto out_overflow; ctxt = svc_rdma_get_rw_ctxt(rdma, @@ -468,17 +464,17 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, return -ENOMEM; constructor(info, write_len, ctxt); - ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle, + offset = seg->rs_offset + info->wi_seg_off; + ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, DMA_TO_DEVICE); if (ret < 0) return -EIO; - trace_svcrdma_send_wseg(handle, write_len, offset); + trace_svcrdma_send_wseg(seg->rs_handle, write_len, offset); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; - if (write_len == length - info->wi_seg_off) { - seg += 4; + if (write_len == seg->rs_length - info->wi_seg_off) { info->wi_seg_no++; info->wi_seg_off = 0; } else { @@ -491,7 +487,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, out_overflow: trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no, - info->wi_nsegs); + info->wi_chunk->ch_segcount); return -E2BIG; } @@ -579,7 +575,7 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, /** * svc_rdma_send_write_chunk - Write all segments in a Write chunk * @rdma: controlling RDMA transport - * @wr_ch: Write chunk provided by client + * @chunk: Write chunk provided by the client * @xdr: xdr_buf containing the data payload * * Returns a non-negative number of bytes the chunk consumed, or @@ -589,13 +585,14 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, +int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk, const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; int ret; - info = svc_rdma_write_info_alloc(rdma, wr_ch); + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; @@ -633,12 +630,14 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, struct xdr_buf *xdr) { struct svc_rdma_write_info *info; + struct svc_rdma_chunk *chunk; int consumed, ret; - if (!rctxt->rc_reply_chunk) + if (pcl_is_empty(&rctxt->rc_reply_pcl)) return 0; - info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk); + chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; @@ -650,7 +649,7 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, /* Send the page list in the Reply chunk only if the * client did not provide Write chunks. 
*/ - if (!rctxt->rc_write_list && xdr->page_len) { + if (pcl_is_empty(&rctxt->rc_write_pcl) && xdr->page_len) { ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len, xdr->page_len); if (ret < 0) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 922dbe573570..9a6d3dee69ed 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -466,12 +466,14 @@ static ssize_t svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt, struct svc_rdma_send_ctxt *sctxt) { + struct svc_rdma_chunk *chunk; ssize_t len, ret; len = 0; if (rctxt->rc_write_list) { + chunk = pcl_first_chunk(&rctxt->rc_write_pcl); ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, - rctxt->rc_read_payload_length); + chunk->ch_payload_length); if (ret < 0) return ret; len = ret; @@ -978,25 +980,27 @@ int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length) { struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; + struct svc_rdma_chunk *chunk; struct svcxprt_rdma *rdma; struct xdr_buf subbuf; int ret; - if (!rctxt->rc_write_list || !length) + chunk = rctxt->rc_cur_result_payload; + if (!length || !chunk) return 0; + rctxt->rc_cur_result_payload = + pcl_next_chunk(&rctxt->rc_write_pcl, chunk); + if (length > chunk->ch_length) + return -E2BIG; - /* XXX: Just one READ payload slot for now, since our - * transport implementation currently supports only one - * Write chunk. - */ - rctxt->rc_read_payload_offset = offset; - rctxt->rc_read_payload_length = length; + chunk->ch_position = offset; + chunk->ch_payload_length = length; if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length)) return -EMSGSIZE; rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); - ret = svc_rdma_send_write_chunk(rdma, rctxt->rc_write_list, &subbuf); + ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf); if (ret < 0) return ret; return 0; -- cgit From 9d0b09d5ef0c842592a5df3a5b8b59124485ff1b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 13 Mar 2020 10:42:11 -0400 Subject: svcrdma: Support multiple write chunks when pulling up When counting the number of SGEs needed to construct a Send request, do not count result payloads. And, when copying the Reply message into the pull-up buffer, result payloads are not to be copied to the Send buffer. 
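Both the SGE-counting pass and the linearization pass are structured as actors driven by pcl_process_nonpayloads(), which invokes the callback once per region of rq_res that is not a result payload. A minimal sketch of the calling convention, using a hypothetical actor (count_bytes is illustrative and not part of this patch):

    /* Hypothetical actor: total the bytes that would be sent inline. */
    static int count_bytes(const struct xdr_buf *xdr, void *data)
    {
            *(unsigned int *)data += xdr->len;
            return 0;
    }

    unsigned int inline_bytes = 0;

    pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
                            count_bytes, &inline_bytes);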
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 + include/trace/events/rpcrdma.h | 20 ++- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 14 ++- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 9 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 188 +++++++++++++++++------------ 5 files changed, 146 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d9148787efff..7090af1a9791 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -182,6 +182,8 @@ extern void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp, /* svc_rdma_recvfrom.c */ extern void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma); extern bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma); +extern struct svc_rdma_recv_ctxt * + svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma); extern void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt); extern void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma); diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 5218e0f9596a..afc58accb9cf 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1805,20 +1805,30 @@ TRACE_EVENT(svcrdma_small_wrch_err, TRACE_EVENT(svcrdma_send_pullup, TP_PROTO( - unsigned int len + const struct svc_rdma_send_ctxt *ctxt, + unsigned int msglen ), - TP_ARGS(len), + TP_ARGS(ctxt, msglen), TP_STRUCT__entry( - __field(unsigned int, len) + __field(u32, cq_id) + __field(int, completion_id) + __field(unsigned int, hdrlen) + __field(unsigned int, msglen) ), TP_fast_assign( - __entry->len = len; + __entry->cq_id = ctxt->sc_cid.ci_queue_id; + __entry->completion_id = ctxt->sc_cid.ci_completion_id; + __entry->hdrlen = ctxt->sc_hdrbuf.len, + __entry->msglen = msglen; ), - TP_printk("len=%u", __entry->len) + TP_printk("cq_id=%u cid=%d hdr=%u msg=%u (total %u)", + __entry->cq_id, __entry->completion_id, + __entry->hdrlen, __entry->msglen, + __entry->hdrlen + __entry->msglen) ); TRACE_EVENT(svcrdma_send_err, diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 5e7c4ba9e147..63f8be974df2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -74,11 +74,17 @@ out_unlock: */ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst, - struct svc_rdma_send_ctxt *ctxt) + struct svc_rdma_send_ctxt *sctxt) { + struct svc_rdma_recv_ctxt *rctxt; int ret; - ret = svc_rdma_map_reply_msg(rdma, ctxt, NULL, &rqst->rq_snd_buf); + rctxt = svc_rdma_recv_ctxt_get(rdma); + if (!rctxt) + return -EIO; + + ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf); + svc_rdma_recv_ctxt_put(rdma, rctxt); if (ret < 0) return -EIO; @@ -86,8 +92,8 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, * the rq_buffer before all retransmits are complete. 
*/ get_page(virt_to_page(rqst->rq_buffer)); - ctxt->sc_send_wr.opcode = IB_WR_SEND; - return svc_rdma_send(rdma, ctxt); + sctxt->sc_send_wr.opcode = IB_WR_SEND; + return svc_rdma_send(rdma, sctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 7d44e9d2e7a3..af32c3ad45a6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -194,8 +194,13 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) } } -static struct svc_rdma_recv_ctxt * -svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) +/** + * svc_rdma_recv_ctxt_get - Allocate a recv_ctxt + * @rdma: controlling svcxprt_rdma + * + * Returns a recv_ctxt or (rarely) NULL if none are available. + */ +struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; struct llist_node *node; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 1fdbbad3f7dc..50f0216fb08c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -531,6 +531,45 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, offset_in_page(base), len); } +struct svc_rdma_pullup_data { + u8 *pd_dest; + unsigned int pd_length; + unsigned int pd_num_sges; +}; + +/** + * svc_rdma_xb_count_sges - Count how many SGEs will be needed + * @xdr: xdr_buf containing portion of an RPC message to transmit + * @data: pointer to arguments + * + * Returns: + * Number of SGEs needed to Send the contents of @xdr inline + */ +static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr, + void *data) +{ + struct svc_rdma_pullup_data *args = data; + unsigned int remaining; + unsigned long offset; + + if (xdr->head[0].iov_len) + ++args->pd_num_sges; + + offset = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + ++args->pd_num_sges; + remaining -= min_t(u32, PAGE_SIZE - offset, remaining); + offset = 0; + } + + if (xdr->tail[0].iov_len) + ++args->pd_num_sges; + + args->pd_length += xdr->len; + return 0; +} + /** * svc_rdma_pull_up_needed - Determine whether to use pull-up * @rdma: controlling transport @@ -539,50 +578,71 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, * @xdr: xdr_buf containing RPC message to transmit * * Returns: - * %true if pull-up must be used - * %false otherwise + * %true if pull-up must be used + * %false otherwise */ -static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *sctxt, +static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma, + const struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, - struct xdr_buf *xdr) + const struct xdr_buf *xdr) { - bool write_chunk_present = rctxt && rctxt->rc_write_list; - int elements; + /* Resources needed for the transport header */ + struct svc_rdma_pullup_data args = { + .pd_length = sctxt->sc_hdrbuf.len, + .pd_num_sges = 1, + }; + int ret; - /* For small messages, copying bytes is cheaper than DMA mapping. 
- */ - if (!write_chunk_present && - sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH) + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_count_sges, &args); + if (ret < 0) + return false; + + if (args.pd_length < RPCRDMA_PULLUP_THRESH) return true; + return args.pd_num_sges >= rdma->sc_max_send_sges; +} - /* Check whether the xdr_buf has more elements than can - * fit in a single RDMA Send. - */ - /* xdr->head */ - elements = 1; - - /* xdr->pages */ - if (!rctxt || !rctxt->rc_write_list) { - unsigned int remaining; - unsigned long pageoff; - - pageoff = xdr->page_base & ~PAGE_MASK; - remaining = xdr->page_len; - while (remaining) { - ++elements; - remaining -= min_t(u32, PAGE_SIZE - pageoff, - remaining); - pageoff = 0; - } +/** + * svc_rdma_xb_linearize - Copy region of xdr_buf to flat buffer + * @xdr: xdr_buf containing portion of an RPC message to copy + * @data: pointer to arguments + * + * Returns: + * Always zero. + */ +static int svc_rdma_xb_linearize(const struct xdr_buf *xdr, + void *data) +{ + struct svc_rdma_pullup_data *args = data; + unsigned int len, remaining; + unsigned long pageoff; + struct page **ppages; + + if (xdr->head[0].iov_len) { + memcpy(args->pd_dest, xdr->head[0].iov_base, xdr->head[0].iov_len); + args->pd_dest += xdr->head[0].iov_len; } - /* xdr->tail */ - if (xdr->tail[0].iov_len) - ++elements; + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + pageoff = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - pageoff, remaining); + memcpy(args->pd_dest, page_address(*ppages) + pageoff, len); + remaining -= len; + args->pd_dest += len; + pageoff = 0; + ppages++; + } - /* assume 1 SGE is needed for the transport header */ - return elements >= rdma->sc_max_send_sges; + if (xdr->tail[0].iov_len) { + memcpy(args->pd_dest, xdr->tail[0].iov_base, xdr->tail[0].iov_len); + args->pd_dest += xdr->tail[0].iov_len; + } + + args->pd_length += xdr->len; + return 0; } /** @@ -595,54 +655,30 @@ static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, * The device is not capable of sending the reply directly. * Assemble the elements of @xdr into the transport header buffer. * - * Returns zero on success, or a negative errno on failure. + * Assumptions: + * pull_up_needed has determined that @xdr will fit in the buffer. 
+ * + * Returns: + * %0 if pull-up was successful + * %-EMSGSIZE if a buffer manipulation problem occurred */ -static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, +static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, const struct xdr_buf *xdr) { - unsigned char *dst, *tailbase; - unsigned int taillen; - - dst = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len; - memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len); - dst += xdr->head[0].iov_len; - - tailbase = xdr->tail[0].iov_base; - taillen = xdr->tail[0].iov_len; - if (rctxt && rctxt->rc_write_list) { - u32 xdrpad; - - xdrpad = xdr_pad_size(xdr->page_len); - if (taillen && xdrpad) { - tailbase += xdrpad; - taillen -= xdrpad; - } - } else { - unsigned int len, remaining; - unsigned long pageoff; - struct page **ppages; - - ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - pageoff = xdr->page_base & ~PAGE_MASK; - remaining = xdr->page_len; - while (remaining) { - len = min_t(u32, PAGE_SIZE - pageoff, remaining); - - memcpy(dst, page_address(*ppages) + pageoff, len); - remaining -= len; - dst += len; - pageoff = 0; - ppages++; - } - } + struct svc_rdma_pullup_data args = { + .pd_dest = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len, + }; + int ret; - if (taillen) - memcpy(dst, tailbase, taillen); + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_linearize, &args); + if (ret < 0) + return ret; - sctxt->sc_sges[0].length += xdr->len; - trace_svcrdma_send_pullup(sctxt->sc_sges[0].length); + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len + args.pd_length; + trace_svcrdma_send_pullup(sctxt, args.pd_length); return 0; } -- cgit From 2371bcc056647327445150d6df0502d92ad68439 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Mar 2020 13:29:27 -0400 Subject: svcrdma: Support multiple Write chunks in svc_rdma_map_reply_msg() Refactor: svc_rdma_map_reply_msg() is restructured to DMA map only the parts of rq_res that do not contain a result payload. This change has been tested to confirm that it does not cause a regression in the no Write chunk and single Write chunk cases. Multiple Write chunks have not been tested. 
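As a worked example of the region arithmetic: suppose rq_res is 400 bytes long and the client provided a single Write chunk whose result payload is 99 bytes at position 100. pcl_chunk_end_offset() rounds 100 + 99 up to the next XDR boundary, 200, so the DMA-mapping actor is invoked on bytes [0, 100) and [200, 400). Neither the payload nor its XDR pad byte is mapped for the Send.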
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 +- include/trace/events/rpcrdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 174 +++++++++++++++++++--------------- 3 files changed, 100 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 7090af1a9791..e09fafba00d7 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -213,7 +213,7 @@ extern int svc_rdma_send(struct svcxprt_rdma *rdma, extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, - struct xdr_buf *xdr); + const struct xdr_buf *xdr); extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, struct svc_rdma_recv_ctxt *rctxt, diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index afc58accb9cf..054dedd0280c 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1687,6 +1687,7 @@ DECLARE_EVENT_CLASS(svcrdma_dma_map_class, TP_ARGS(rdma, dma_addr, length)) DEFINE_SVC_DMA_EVENT(dma_map_page); +DEFINE_SVC_DMA_EVENT(dma_map_err); DEFINE_SVC_DMA_EVENT(dma_unmap_page); TRACE_EVENT(svcrdma_dma_map_rw_err, diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 50f0216fb08c..78fa57ce424f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -496,39 +496,111 @@ svc_rdma_encode_reply_chunk(struct svc_rdma_recv_ctxt *rctxt, return svc_rdma_encode_write_chunk(sctxt, chunk); } -static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct page *page, - unsigned long offset, - unsigned int len) +struct svc_rdma_map_data { + struct svcxprt_rdma *md_rdma; + struct svc_rdma_send_ctxt *md_ctxt; +}; + +/** + * svc_rdma_page_dma_map - DMA map one page + * @data: pointer to arguments + * @page: struct page to DMA map + * @offset: offset into the page + * @len: number of bytes to map + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if the page cannot be DMA mapped + */ +static int svc_rdma_page_dma_map(void *data, struct page *page, + unsigned long offset, unsigned int len) { + struct svc_rdma_map_data *args = data; + struct svcxprt_rdma *rdma = args->md_rdma; + struct svc_rdma_send_ctxt *ctxt = args->md_ctxt; struct ib_device *dev = rdma->sc_cm_id->device; dma_addr_t dma_addr; + ++ctxt->sc_cur_sge_no; + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); - trace_svcrdma_dma_map_page(rdma, dma_addr, len); if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; + trace_svcrdma_dma_map_page(rdma, dma_addr, len); ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; ctxt->sc_send_wr.num_sge++; return 0; out_maperr: + trace_svcrdma_dma_map_err(rdma, dma_addr, len); return -EIO; } -/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() +/** + * svc_rdma_iov_dma_map - DMA map an iovec + * @data: pointer to arguments + * @iov: kvec to DMA map + * + * ib_dma_map_page() is used here because svc_rdma_dma_unmap() * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively. 
+ * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if the iovec cannot be DMA mapped */ -static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - unsigned char *base, - unsigned int len) +static int svc_rdma_iov_dma_map(void *data, const struct kvec *iov) { - return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base), - offset_in_page(base), len); + if (!iov->iov_len) + return 0; + return svc_rdma_page_dma_map(data, virt_to_page(iov->iov_base), + offset_in_page(iov->iov_base), + iov->iov_len); +} + +/** + * svc_rdma_xb_dma_map - DMA map all segments of an xdr_buf + * @xdr: xdr_buf containing portion of an RPC message to transmit + * @data: pointer to arguments + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if DMA mapping failed + * + * On failure, any DMA mappings that have been already done must be + * unmapped by the caller. + */ +static int svc_rdma_xb_dma_map(const struct xdr_buf *xdr, void *data) +{ + unsigned int len, remaining; + unsigned long pageoff; + struct page **ppages; + int ret; + + ret = svc_rdma_iov_dma_map(data, &xdr->head[0]); + if (ret < 0) + return ret; + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + pageoff = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - pageoff, remaining); + + ret = svc_rdma_page_dma_map(data, *ppages++, pageoff, len); + if (ret < 0) + return ret; + + remaining -= len; + pageoff = 0; + } + + ret = svc_rdma_iov_dma_map(data, &xdr->tail[0]); + if (ret < 0) + return ret; + + return xdr->len; } struct svc_rdma_pullup_data { @@ -688,22 +760,22 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, * @rctxt: Write and Reply chunks provided by client * @xdr: prepared xdr_buf containing RPC message * - * Load the xdr_buf into the ctxt's sge array, and DMA map each - * element as it is added. The Send WR's num_sge field is set. + * Returns: + * %0 if DMA mapping was successful. + * %-EMSGSIZE if a buffer manipulation problem occurred + * %-EIO if DMA mapping failed * - * Returns zero on success, or a negative errno on failure. + * The Send WR's num_sge field is set in all cases. */ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, - struct xdr_buf *xdr) + const struct xdr_buf *xdr) { - unsigned int len, remaining; - unsigned long page_off; - struct page **ppages; - unsigned char *base; - u32 xdr_pad; - int ret; + struct svc_rdma_map_data args = { + .md_rdma = rdma, + .md_ctxt = sctxt, + }; /* Set up the (persistently-mapped) transport header SGE. */ sctxt->sc_send_wr.num_sge = 1; @@ -712,7 +784,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, /* If there is a Reply chunk, nothing follows the transport * header, and we're done here. */ - if (rctxt && rctxt->rc_reply_chunk) + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) return 0; /* For pull-up, svc_rdma_send() will sync the transport header. @@ -721,58 +793,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr)) return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr); - ++sctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_buf(rdma, sctxt, - xdr->head[0].iov_base, - xdr->head[0].iov_len); - if (ret < 0) - return ret; - - /* If a Write chunk is present, the xdr_buf's page list - * is not included inline. 
However the Upper Layer may
- * have added XDR padding in the tail buffer, and that
- * should not be included inline.
- */
- if (rctxt && rctxt->rc_write_list) {
- base = xdr->tail[0].iov_base;
- len = xdr->tail[0].iov_len;
- xdr_pad = xdr_pad_size(xdr->page_len);
-
- if (len && xdr_pad) {
- base += xdr_pad;
- len -= xdr_pad;
- }
-
- goto tail;
- }
-
- ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- page_off = xdr->page_base & ~PAGE_MASK;
- remaining = xdr->page_len;
- while (remaining) {
- len = min_t(u32, PAGE_SIZE - page_off, remaining);
-
- ++sctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_page(rdma, sctxt, *ppages++,
- page_off, len);
- if (ret < 0)
- return ret;
-
- remaining -= len;
- page_off = 0;
- }
-
- base = xdr->tail[0].iov_base;
- len = xdr->tail[0].iov_len;
-tail:
- if (len) {
- ++sctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, sctxt, base, len);
- if (ret < 0)
- return ret;
- }
-
- return 0;
+ return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_dma_map, &args);
 }

 /* The svc_rqst and all resources it owns are released as soon as
-- cgit

From 41bc163ffe0fe67cba3fff2f5e8c58caa9e46a1e Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 9 Mar 2020 13:29:28 -0400
Subject: svcrdma: Support multiple Write chunks in svc_rdma_send_reply_chunk

Refactor svc_rdma_send_reply_chunk() so that it Sends only the parts
of rq_res that do not contain a result payload.

Signed-off-by: Chuck Lever
---
 include/linux/sunrpc/svc_rdma.h | 2 +-
 net/sunrpc/xprtrdma/svc_rdma_rw.c | 36 +++++++++---------------------------
 2 files changed, 10 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index e09fafba00d7..85fbec47d4b5 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -200,7 +200,7 @@ extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
 const struct xdr_buf *xdr);
 extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
 const struct svc_rdma_recv_ctxt *rctxt,
- struct xdr_buf *xdr);
+ const struct xdr_buf *xdr);

 /* svc_rdma_sendto.c */
 extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index ed7def7b801b..4a0ece9aa385 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -537,7 +537,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
 /**
 * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf
 * @xdr: xdr_buf to write
- * @info: pointer to write arguments
+ * @data: pointer to write arguments
 *
 * Returns:
 * On success, returns zero
 * %-ENOTCONN if posting failed (connection is lost),
 * %-ENOMEM if a resource has been exhausted
 * %-EIO if an rdma-rw error occurred
 */
-static int svc_rdma_xb_write(const struct xdr_buf *xdr,
- struct svc_rdma_write_info *info)
+static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
 {
+ struct svc_rdma_write_info *info = data;
 int ret;

 if (xdr->head[0].iov_len) {
@@ -627,11 +627,11 @@ out_err:
 */
 int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
 const struct svc_rdma_recv_ctxt *rctxt,
- struct xdr_buf *xdr)
+ const struct xdr_buf *xdr)
 {
 struct svc_rdma_write_info *info;
 struct svc_rdma_chunk *chunk;
- int consumed, ret;
+ int ret;

 if (pcl_is_empty(&rctxt->rc_reply_pcl))
 return 0;
@@ -641,35 +641,17 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
 if (!info)
 return
-ENOMEM; - ret = svc_rdma_iov_write(info, &xdr->head[0]); + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_write, info); if (ret < 0) goto out_err; - consumed = xdr->head[0].iov_len; - - /* Send the page list in the Reply chunk only if the - * client did not provide Write chunks. - */ - if (pcl_is_empty(&rctxt->rc_write_pcl) && xdr->page_len) { - ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len, - xdr->page_len); - if (ret < 0) - goto out_err; - consumed += xdr->page_len; - } - - if (xdr->tail[0].iov_len) { - ret = svc_rdma_iov_write(info, &xdr->tail[0]); - if (ret < 0) - goto out_err; - consumed += xdr->tail[0].iov_len; - } ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); if (ret < 0) goto out_err; - trace_svcrdma_send_reply_chunk(consumed); - return consumed; + trace_svcrdma_send_reply_chunk(xdr->len); + return xdr->len; out_err: svc_rdma_write_info_free(info); -- cgit From 7954c8503b8709660d93505a40f1847634d9c3ba Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 17 Jun 2020 15:19:36 -0400 Subject: svcrdma: Remove chunk list pointers Clean up: These pointers are no longer used. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 4 ---- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 8 +------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 85fbec47d4b5..6f247d043731 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -149,12 +149,8 @@ struct svc_rdma_recv_ctxt { struct svc_rdma_pcl rc_call_pcl; struct svc_rdma_pcl rc_read_pcl; - - __be32 *rc_write_list; struct svc_rdma_chunk *rc_cur_result_payload; struct svc_rdma_pcl rc_write_pcl; - - __be32 *rc_reply_chunk; struct svc_rdma_pcl rc_reply_pcl; struct page *rc_pages[RPCSVC_MAXPAGES]; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index af32c3ad45a6..dd10b1de227d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -540,17 +540,13 @@ static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; - - rctxt->rc_write_list = NULL; if (!xdr_count_write_chunks(rctxt, p)) return false; if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p)) return false; - if (!pcl_is_empty(&rctxt->rc_write_pcl)) - rctxt->rc_write_list = p; rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl); - return rctxt->rc_write_pcl.cl_count < 2; + return true; } /* Sanity check the Reply chunk. @@ -573,13 +569,11 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) if (!p) return false; - rctxt->rc_reply_chunk = NULL; if (!xdr_item_is_present(p)) return true; if (!xdr_check_write_chunk(rctxt)) return false; - rctxt->rc_reply_chunk = p; rctxt->rc_reply_pcl.cl_count = 1; return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p); } -- cgit From d96962e6d0e281bab6a48e83b42f5dce6eb28bf4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Sep 2020 13:04:17 -0400 Subject: svcrdma: Use the new parsed chunk list when pulling Read chunks As a pre-requisite for handling multiple Read chunks in each Read list, convert svc_rdma_recv_read_chunk() to use the new parsed Read chunk list. 
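Because each Read chunk's segments are now pre-decoded, pulling a chunk reduces to a plain iteration. This is the loop at the heart of the new svc_rdma_build_read_chunk(), shown in full in the diff below:

    const struct svc_rdma_segment *segment;
    int ret = -EINVAL;

    pcl_for_each_segment(segment, chunk) {
            ret = svc_rdma_build_read_segment(info, segment);
            if (ret < 0)
                    break;
            info->ri_totalbytes += segment->rs_length;
    }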
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 6 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 16 ++-- net/sunrpc/xprtrdma/svc_rdma_rw.c | 160 ++++++++++++++++++++------------ 3 files changed, 111 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 6f247d043731..294b56e61522 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -188,15 +188,15 @@ extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); -extern int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, - struct svc_rqst *rqstp, - struct svc_rdma_recv_ctxt *head, __be32 *p); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk, const struct xdr_buf *xdr); extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, const struct xdr_buf *xdr); +extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head); /* svc_rdma_sendto.c */ extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index dd10b1de227d..cbdb71247755 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -824,7 +824,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svcxprt_rdma *rdma_xprt = container_of(xprt, struct svcxprt_rdma, sc_xprt); struct svc_rdma_recv_ctxt *ctxt; - __be32 *p; int ret; rqstp->rq_xprt_ctxt = NULL; @@ -857,7 +856,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_next_page = rqstp->rq_respages; - p = (__be32 *)rqstp->rq_arg.head[0].iov_base; ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); if (ret < 0) goto out_err; @@ -870,9 +868,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) svc_rdma_get_inv_rkey(rdma_xprt, ctxt); - p += rpcrdma_fixed_maxsz; - if (*p != xdr_zero) - goto out_readchunk; + if (!pcl_is_empty(&ctxt->rc_read_pcl) || + !pcl_is_empty(&ctxt->rc_call_pcl)) + goto out_readlist; complete: rqstp->rq_xprt_ctxt = ctxt; @@ -880,10 +878,10 @@ complete: svc_xprt_copy_addrs(rqstp, xprt); return rqstp->rq_arg.len; -out_readchunk: - ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p); +out_readlist: + ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); if (ret < 0) - goto out_postfail; + goto out_readfail; return 0; out_err: @@ -891,7 +889,7 @@ out_err: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; -out_postfail: +out_readfail: if (ret == -EINVAL) svc_rdma_send_error(rdma_xprt, ctxt, ret); svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 3154c7ab1ca8..2d454e37b2fe 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -258,8 +258,8 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) /* State for pulling a Read chunk. 
*/
struct svc_rdma_read_info {
+ struct svc_rqst *ri_rqst;
 struct svc_rdma_recv_ctxt *ri_readctxt;
- unsigned int ri_position;
 unsigned int ri_pageno;
 unsigned int ri_pageoff;
 unsigned int ri_totalbytes;
@@ -658,17 +658,29 @@ out_err:
 return ret;
 }

+/**
+ * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
+ * @info: context for ongoing I/O
+ * @segment: co-ordinates of remote memory to be read
+ *
+ * Returns:
+ * %0: the Read WR chain was constructed successfully
+ * %-EINVAL: there were not enough rq_pages to finish
+ * %-ENOMEM: allocating local resources failed
+ * %-EIO: a DMA mapping error occurred
+ */
 static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
- struct svc_rqst *rqstp,
- u32 rkey, u32 len, u64 offset)
+ const struct svc_rdma_segment *segment)
 {
 struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
 struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
+ struct svc_rqst *rqstp = info->ri_rqst;
 struct svc_rdma_rw_ctxt *ctxt;
- unsigned int sge_no, seg_len;
+ unsigned int sge_no, seg_len, len;
 struct scatterlist *sg;
 int ret;

+ len = segment->rs_length;
 sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT;
 ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no);
 if (!ctxt)
@@ -702,8 +714,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
 goto out_overrun;
 }

- ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, offset, rkey,
- DMA_FROM_DEVICE);
+ ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset,
+ segment->rs_handle, DMA_FROM_DEVICE);
 if (ret < 0)
 return -EIO;
@@ -716,48 +728,63 @@ out_overrun:
 return -EINVAL;
 }

-/* Walk the segments in the Read chunk starting at @p and construct
- * RDMA Read operations to pull the chunk to the server.
+/**
+ * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk
+ * @info: context for ongoing I/O
+ * @chunk: Read chunk to pull
+ *
+ * Return values:
+ * %0: the Read WR chain was constructed successfully
+ * %-EINVAL: there were not enough resources to finish
+ * %-ENOMEM: allocating local resources failed
+ * %-EIO: a DMA mapping error occurred
 */
-static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info,
- __be32 *p)
+static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info,
+ const struct svc_rdma_chunk *chunk)
 {
+ const struct svc_rdma_segment *segment;
 int ret;

 ret = -EINVAL;
- while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) {
- u32 handle, length;
- u64 offset;
-
- p = xdr_decode_rdma_segment(p, &handle, &length, &offset);
- ret = svc_rdma_build_read_segment(info, rqstp, handle, length,
- offset);
+ pcl_for_each_segment(segment, chunk) {
+ ret = svc_rdma_build_read_segment(info, segment);
 if (ret < 0)
 break;
-
- info->ri_totalbytes += length;
+ info->ri_totalbytes += segment->rs_length;
 }

 return ret;
 }

-/* Construct RDMA Reads to pull over a normal Read chunk. The chunk
- * data lands in the page list of head->rc_arg.pages.
+/**
+ * svc_rdma_read_data_items - Construct RDMA Reads to pull data item Read chunks
+ * @info: context for RDMA Reads
+ *
+ * The chunk data lands in the page list of head->rc_arg.pages.
 *
 * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
 * Therefore, XDR round-up of the Read chunk and trailing
 * inline content must both be added at the end of the pagelist.
+ * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static int svc_rdma_read_data_items(struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; struct xdr_buf *buf = &head->rc_arg; + struct svc_rdma_chunk *chunk; unsigned int length; int ret; - ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (head->rc_read_pcl.cl_count > 1) + return -EINVAL; + + chunk = pcl_first_chunk(&head->rc_read_pcl); + ret = svc_rdma_build_read_chunk(info, chunk); if (ret < 0) goto out; @@ -768,11 +795,9 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, * chunk is not included in either the pagelist or in * the tail. */ - head->rc_arg.tail[0].iov_base = - head->rc_arg.head[0].iov_base + info->ri_position; - head->rc_arg.tail[0].iov_len = - head->rc_arg.head[0].iov_len - info->ri_position; - head->rc_arg.head[0].iov_len = info->ri_position; + buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; + buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; + buf->head[0].iov_len = chunk->ch_position; /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). * @@ -792,26 +817,36 @@ out: return ret; } -/* Construct RDMA Reads to pull over a Position Zero Read chunk. - * The start of the data lands in the first page just after - * the Transport header, and the rest lands in the page list of +/** + * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message + * @info: context for RDMA Reads + * + * The start of the data lands in the first page just after the + * Transport header, and the rest lands in the page list of * head->rc_arg.pages. * * Assumptions: - * - A PZRC has an XDR-aligned length (no implicit round-up). - * - There can be no trailing inline content (IOW, we assume - * a PZRC is never sent in an RDMA_MSG message, though it's - * allowed by spec). + * - A PZRC is never sent in an RDMA_MSG message, though it's + * allowed by spec. + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). 
*/ -static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static int svc_rdma_read_special(struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; struct xdr_buf *buf = &head->rc_arg; int ret; - ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (head->rc_call_pcl.cl_count > 1) + return -EINVAL; + + ret = svc_rdma_build_read_chunk(info, + pcl_first_chunk(&head->rc_call_pcl)); if (ret < 0) goto out; @@ -848,24 +883,31 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, } /** - * svc_rdma_recv_read_chunk - Pull a Read chunk from the client + * svc_rdma_process_read_list - Pull list of Read chunks from the client * @rdma: controlling RDMA transport * @rqstp: set of pages to use as Read sink buffers * @head: pages under I/O collect here - * @p: pointer to start of Read chunk * - * Returns: - * %0 if all needed RDMA Reads were posted successfully, - * %-EINVAL if client provided too many segments, - * %-ENOMEM if rdma_rw context pool was exhausted, - * %-ENOTCONN if posting failed (connection is lost), - * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * The RPC/RDMA protocol assumes that the upper layer's XDR decoders + * pull each Read chunk as they decode an incoming RPC message. * - * Assumptions: - * - All Read segments in @p have the same Position value. + * On Linux, however, the server needs to have a fully-constructed RPC + * message in rqstp->rq_arg when there is a positive return code from + * ->xpo_recvfrom. So the Read list is safety-checked immediately when + * it is received, then here the whole Read list is pulled all at once. + * The ingress RPC message is fully reconstructed once all associated + * RDMA Reads have completed. + * + * Return values: + * %1: all needed RDMA Reads were posted successfully, + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, - struct svc_rdma_recv_ctxt *head, __be32 *p) +int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { struct svc_rdma_read_info *info; struct svc_rdma_chunk_ctxt *cc; @@ -887,16 +929,16 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, if (!info) return -ENOMEM; cc = &info->ri_cc; + info->ri_rqst = rqstp; info->ri_readctxt = head; info->ri_pageno = 0; info->ri_pageoff = 0; info->ri_totalbytes = 0; - info->ri_position = be32_to_cpup(p + 1); - if (info->ri_position) - ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); + if (pcl_is_empty(&head->rc_call_pcl)) + ret = svc_rdma_read_data_items(info); else - ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); + ret = svc_rdma_read_special(info); if (ret < 0) goto out_err; @@ -905,7 +947,7 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, if (ret < 0) goto out_err; svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count); - return 0; + return 1; out_err: svc_rdma_read_info_free(info); -- cgit From 0ae4c3e8a64ace1b8d7de033b0751afe43024416 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Nov 2020 15:52:47 -0500 Subject: SUNRPC: Add xdr_set_scratch_page() and xdr_reset_scratch_buffer() Clean up: De-duplicate some frequently-used code. 
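For illustration, a hedged sketch of the call-site pattern this de-duplicates; the decoder and its arguments are hypothetical, but the helper calls match the hunks that follow:

	/* Hypothetical decoder: XDR items in a page array may straddle
	 * page boundaries, so a scratch page is attached to let
	 * xdr_inline_decode() hand back linearized data.
	 */
	static int example_decode(struct xdr_buf *buf, struct page **pages,
				  unsigned int len)
	{
		struct xdr_stream xdr;
		struct page *scratch;
		__be32 *p;
		int status = -EIO;

		scratch = alloc_page(GFP_KERNEL);
		if (!scratch)
			return -ENOMEM;

		xdr_init_decode_pages(&xdr, buf, pages, len);
		/* was: xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); */
		xdr_set_scratch_page(&xdr, scratch);

		p = xdr_inline_decode(&xdr, 4);	/* may copy via the scratch page */
		if (p)
			status = 0;
		__free_page(scratch);
		return status;
	}
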
Signed-off-by: Chuck Lever --- fs/nfs/blocklayout/blocklayout.c | 2 +- fs/nfs/blocklayout/dev.c | 2 +- fs/nfs/dir.c | 2 +- fs/nfs/filelayout/filelayout.c | 2 +- fs/nfs/filelayout/filelayoutdev.c | 2 +- fs/nfs/flexfilelayout/flexfilelayout.c | 2 +- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2 +- fs/nfs/nfs42xdr.c | 2 +- fs/nfs/nfs4xdr.c | 6 ++--- fs/nfsd/nfs4proc.c | 2 +- include/linux/sunrpc/xdr.h | 44 ++++++++++++++++++++++++++++++- net/sunrpc/auth_gss/gss_rpc_xdr.c | 2 +- net/sunrpc/xdr.c | 28 +++----------------- 13 files changed, 59 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 08108b6d2fa1..3be6836074ae 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -697,7 +697,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, xdr_init_decode_pages(&xdr, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&xdr, scratch); status = -EIO; p = xdr_inline_decode(&xdr, 4); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index dec5880ac6de..acb1d22907da 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -510,7 +510,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out; xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&xdr, scratch); p = xdr_inline_decode(&xdr, sizeof(__be32)); if (!p) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4e011adaf967..8a24fe20dccf 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -576,7 +576,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en goto out_nopages; xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); do { if (entry->label) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 7f5aa0403e16..d158a500c25c 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -666,7 +666,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, return -ENOMEM; xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), * num_fh (4) */ diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index d913e818858f..86c3f7e69ec4 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -82,7 +82,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err; xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* Get the stripe count (number of stripe index) */ p = xdr_inline_decode(&stream, 4); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index a163533446fa..d7010686d39a 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -378,7 +378,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - 
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* stripe unit and mirror_array_cnt */ rc = -EIO; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 3eda40a320a5..c9b61b818ec1 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -69,7 +69,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, INIT_LIST_HEAD(&dsaddrs); xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* multipath count */ p = xdr_inline_decode(&stream, 4); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6e060a88f98c..cb5d4da2308f 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1540,7 +1540,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_buffer(xdr, page_address(res->scratch), PAGE_SIZE); + xdr_set_scratch_page(xdr, res->scratch); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c6dbfcae7517..2eabe5add344 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6403,10 +6403,8 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct compound_hdr hdr; int status; - if (res->acl_scratch != NULL) { - void *p = page_address(res->acl_scratch); - xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); - } + if (res->acl_scratch != NULL) + xdr_set_scratch_page(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e83b21778816..20772f6b0b2d 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2276,7 +2276,7 @@ static void svcxdr_init_encode(struct svc_rqst *rqstp, xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; /* Tail and page_len should be zero at this point: */ buf->len = buf->head[0].iov_len; - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); xdr->page_ptr = buf->pages - 1; buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) - rqstp->rq_auth_slack; diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index ec2a22ccdc2a..2729d2d6efce 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -248,7 +248,6 @@ extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, struct page **pages, unsigned int len); -extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen); extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); @@ -256,6 +255,49 @@ extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned in extern uint64_t xdr_align_data(struct xdr_stream *, uint64_t, uint32_t); extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); +/** + * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to an empty buffer + * @buflen: size of 'buf' + * + * The scratch buffer is used when decoding from an array of pages. 
+ * If an xdr_inline_decode() call spans across page boundaries, then + * we copy the data into the scratch buffer in order to allow linear + * access. + */ +static inline void +xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) +{ + xdr->scratch.iov_base = buf; + xdr->scratch.iov_len = buflen; +} + +/** + * xdr_set_scratch_page - Attach a scratch buffer for decoding data + * @xdr: pointer to xdr_stream struct + * @page: an anonymous page + * + * See xdr_set_scratch_buffer(). + */ +static inline void +xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page) +{ + xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE); +} + +/** + * xdr_reset_scratch_buffer - Clear scratch buffer information + * @xdr: pointer to xdr_stream struct + * + * See xdr_set_scratch_buffer(). + */ +static inline void +xdr_reset_scratch_buffer(struct xdr_stream *xdr) +{ + xdr_set_scratch_buffer(xdr, NULL, 0); +} + /** * xdr_stream_remaining - Return the number of bytes remaining in the stream * @xdr: pointer to struct xdr_stream diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 2ff7b7083eba..c636c648849b 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -789,7 +789,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, scratch = alloc_page(GFP_KERNEL); if (!scratch) return -ENOMEM; - xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 28f81769a27c..c607744b3ea8 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -669,7 +669,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct kvec *iov = buf->head; int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len; - xdr_set_scratch_buffer(xdr, NULL, 0); + xdr_reset_scratch_buffer(xdr); BUG_ON(scratch_len < 0); xdr->buf = buf; xdr->iov = iov; @@ -713,7 +713,7 @@ inline void xdr_commit_encode(struct xdr_stream *xdr) page = page_address(*xdr->page_ptr); memcpy(xdr->scratch.iov_base, page, shift); memmove(page, page + shift, (void *)xdr->p - page); - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); } EXPORT_SYMBOL_GPL(xdr_commit_encode); @@ -743,8 +743,7 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, * the "scratch" iov to track any temporarily unused fragment of * space at the end of the previous buffer: */ - xdr->scratch.iov_base = xdr->p; - xdr->scratch.iov_len = frag1bytes; + xdr_set_scratch_buffer(xdr, xdr->p, frag1bytes); p = page_address(*xdr->page_ptr); /* * Note this is where the next encode will start after we've @@ -1052,8 +1051,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst) { xdr->buf = buf; - xdr->scratch.iov_base = NULL; - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); xdr->nwords = XDR_QUADLEN(buf->len); if (buf->head[0].iov_len != 0) xdr_set_iov(xdr, buf->head, buf->len); @@ -1101,24 +1099,6 @@ static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) return p; } -/** - * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. - * @xdr: pointer to xdr_stream struct - * @buf: pointer to an empty buffer - * @buflen: size of 'buf' - * - * The scratch buffer is used when decoding from an array of pages. 
- * If an xdr_inline_decode() call spans across page boundaries, then - * we copy the data into the scratch buffer in order to allow linear - * access. - */ -void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) -{ - xdr->scratch.iov_base = buf; - xdr->scratch.iov_len = buflen; -} -EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer); - static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; -- cgit From 5191955d6fc65e6d4efe8f4f10a6028298f57281 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 5 Nov 2020 11:19:42 -0500 Subject: SUNRPC: Prepare for xdr_stream-style decoding on the server-side A "permanent" struct xdr_stream is allocated in struct svc_rqst so that it is usable by all server-side decoders. A per-rqst scratch buffer is also allocated to handle decoding XDR data items that cross page boundaries. To demonstrate how it will be used, add the first call site for the new svcxdr_init_decode() API. As an additional part of the overall conversion, add symbolic constants for successful and failed XDR operations. Returning "0" is overloaded. Sometimes it means something failed, but sometimes it means success. To make it more clear when XDR decoding functions succeed or fail, introduce symbolic constants. Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 2 ++ include/linux/sunrpc/svc.h | 16 ++++++++++++++++ net/sunrpc/svc.c | 5 +++++ 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 27b1ad136150..74d5e58eadb0 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1020,6 +1020,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) * (necessary in the NFSv4.0 compound case) */ rqstp->rq_cachetype = proc->pc_cachetype; + + svcxdr_init_decode(rqstp); if (!proc->pc_decode(rqstp, argv->iov_base)) goto out_decode_err; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index c220b734fa69..34c2a69820e9 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -247,6 +247,8 @@ struct svc_rqst { size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; + struct xdr_stream rq_arg_stream; + struct page *rq_scratch_page; struct xdr_buf rq_res; struct page *rq_pages[RPCSVC_MAXPAGES + 1]; struct page * *rq_respages; /* points into rq_pages */ @@ -557,4 +559,18 @@ static inline void svc_reserve_auth(struct svc_rqst *rqstp, int space) svc_reserve(rqstp, space + rqstp->rq_auth_slack); } +/** + * svcxdr_init_decode - Prepare an xdr_stream for svc Call decoding + * @rqstp: controlling server RPC transaction context + * + */ +static inline void svcxdr_init_decode(struct svc_rqst *rqstp) +{ + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct kvec *argv = rqstp->rq_arg.head; + + xdr_init_decode(xdr, &rqstp->rq_arg, argv->iov_base, NULL); + xdr_set_scratch_page(xdr, rqstp->rq_scratch_page); +} + #endif /* SUNRPC_SVC_H */ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b41500645c3f..4187745887f0 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -614,6 +614,10 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) rqstp->rq_server = serv; rqstp->rq_pool = pool; + rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); + if (!rqstp->rq_scratch_page) + goto out_enomem; + rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) goto out_enomem; @@ -842,6 +846,7 @@ void svc_rqst_free(struct svc_rqst *rqstp) { svc_release_buffer(rqstp); + put_page(rqstp->rq_scratch_page); 
kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); -- cgit From c1346a1216ab5cb04a265380ac9035d91b16b6d5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 3 Nov 2020 11:54:23 -0500 Subject: NFSD: Replace the internals of the READ_BUF() macro Convert the READ_BUF macro in nfs4xdr.c from open code to instead use the new xdr_stream-style decoders already in use by the encode side (and by the in-kernel NFS client implementation). Once this conversion is done, each individual NFSv4 argument decoder can be independently cleaned up to replace these macros with C code. Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 4 +- fs/nfsd/nfs4xdr.c | 181 +++++++-------------------------------------- fs/nfsd/xdr4.h | 10 +-- include/linux/sunrpc/xdr.h | 2 + net/sunrpc/xdr.c | 45 +++++++++++ 5 files changed, 77 insertions(+), 165 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 76300b0441f0..df2d6f70c8d4 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1023,8 +1023,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_how_written = write->wr_stable_how; - nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist, - &write->wr_head, write->wr_buflen); + nvecs = svc_fill_write_vector(rqstp, write->wr_payload.pages, + write->wr_payload.head, write->wr_buflen); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 66edac748272..e3a2912c5e56 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -131,90 +131,13 @@ xdr_error: \ memcpy((x), p, nbytes); \ p += XDR_QUADLEN(nbytes); \ } while (0) - -/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */ -#define READ_BUF(nbytes) do { \ - if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \ - p = argp->p; \ - argp->p += XDR_QUADLEN(nbytes); \ - } else if (!(p = read_buf(argp, nbytes))) { \ - dprintk("NFSD: xdr error (%s:%d)\n", \ - __FILE__, __LINE__); \ - goto xdr_error; \ - } \ -} while (0) - -static void next_decode_page(struct nfsd4_compoundargs *argp) -{ - argp->p = page_address(argp->pagelist[0]); - argp->pagelist++; - if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + XDR_QUADLEN(argp->pagelen); - argp->pagelen = 0; - } else { - argp->end = argp->p + (PAGE_SIZE>>2); - argp->pagelen -= PAGE_SIZE; - } -} - -static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) -{ - /* We want more bytes than seem to be available. - * Maybe we need a new page, maybe we have just run out - */ - unsigned int avail = (char *)argp->end - (char *)argp->p; - __be32 *p; - - if (argp->pagelen == 0) { - struct kvec *vec = &argp->rqstp->rq_arg.tail[0]; - - if (!argp->tail) { - argp->tail = true; - avail = vec->iov_len; - argp->p = vec->iov_base; - argp->end = vec->iov_base + avail; - } - - if (avail < nbytes) - return NULL; - - p = argp->p; - argp->p += XDR_QUADLEN(nbytes); - return p; - } - - if (avail + argp->pagelen < nbytes) - return NULL; - if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */ - return NULL; - /* ok, we can do it with the current plus the next page */ - if (nbytes <= sizeof(argp->tmp)) - p = argp->tmp; - else { - kfree(argp->tmpp); - p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL); - if (!p) - return NULL; - - } - /* - * The following memcpy is safe because read_buf is always - * called with nbytes > avail, and the two cases above both - * guarantee p points to at least nbytes bytes. 
- */ - memcpy(p, argp->p, avail); - next_decode_page(argp); - memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); - argp->p += XDR_QUADLEN(nbytes - avail); - return p; -} - -static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp) -{ - unsigned int this = (char *)argp->end - (char *)argp->p; - - return this + argp->pagelen; -} +#define READ_BUF(nbytes) \ + do { \ + p = xdr_inline_decode(argp->xdr,\ + nbytes); \ + if (!p) \ + goto xdr_error; \ + } while (0) static int zero_clientid(clientid_t *clid) { @@ -261,44 +184,6 @@ svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len) return p; } -static __be32 -svcxdr_construct_vector(struct nfsd4_compoundargs *argp, struct kvec *head, - struct page ***pagelist, u32 buflen) -{ - int avail; - int len; - int pages; - - /* Sorry .. no magic macros for this.. * - * READ_BUF(write->wr_buflen); - * SAVEMEM(write->wr_buf, write->wr_buflen); - */ - avail = (char *)argp->end - (char *)argp->p; - if (avail + argp->pagelen < buflen) { - dprintk("NFSD: xdr error (%s:%d)\n", - __FILE__, __LINE__); - return nfserr_bad_xdr; - } - head->iov_base = argp->p; - head->iov_len = avail; - *pagelist = argp->pagelist; - - len = XDR_QUADLEN(buflen) << 2; - if (len >= avail) { - len -= avail; - - pages = len >> PAGE_SHIFT; - argp->pagelist += pages; - argp->pagelen -= pages * PAGE_SIZE; - len -= pages * PAGE_SIZE; - - next_decode_page(argp); - } - argp->p += XDR_QUADLEN(len); - - return 0; -} - /** * savemem - duplicate a chunk of memory for later processing * @argp: NFSv4 compound argument structure to be freed with @@ -398,7 +283,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(4); len += 4; nace = be32_to_cpup(p++); - if (nace > compoundargs_bytes_left(argp)/20) + if (nace > xdr_stream_remaining(argp->xdr) / sizeof(struct nfs4_ace)) /* * Even with 4-byte names there wouldn't be * space for that many aces; something fishy is @@ -929,7 +814,7 @@ xdr_error: static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) { - __be32 *p; + DECODE_HEAD; READ_BUF(4); o->len = be32_to_cpup(p++); @@ -939,9 +824,8 @@ static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_ne READ_BUF(o->len); SAVEMEM(o->data, o->len); - return nfs_ok; -xdr_error: - return nfserr_bad_xdr; + + DECODE_TAIL; } static __be32 @@ -1319,10 +1203,8 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) goto xdr_error; write->wr_buflen = be32_to_cpup(p++); - status = svcxdr_construct_vector(argp, &write->wr_head, - &write->wr_pagelist, write->wr_buflen); - if (status) - return status; + if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen)) + goto xdr_error; DECODE_TAIL; } @@ -1891,13 +1773,14 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) */ /* - * Decode data into buffer. Uses head and pages constructed by - * svcxdr_construct_vector. + * Decode data into buffer. 
*/ static __be32 -nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head, - struct page **pages, char **bufp, u32 buflen) +nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct xdr_buf *xdr, + char **bufp, u32 buflen) { + struct page **pages = xdr->pages; + struct kvec *head = xdr->head; char *tmp, *dp; u32 len; @@ -2012,8 +1895,6 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, { DECODE_HEAD; u32 flags, maxcount, size; - struct kvec head; - struct page **pagelist; READ_BUF(4); flags = be32_to_cpup(p++); @@ -2036,12 +1917,12 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, setxattr->setxa_len = size; if (size > 0) { - status = svcxdr_construct_vector(argp, &head, &pagelist, size); - if (status) - return status; + struct xdr_buf payload; - status = nfsd4_vbuf_from_vector(argp, &head, pagelist, - &setxattr->setxa_buf, size); + if (!xdr_stream_subsegment(argp->xdr, &payload, size)) + goto xdr_error; + status = nfsd4_vbuf_from_vector(argp, &payload, + &setxattr->setxa_buf, size); } DECODE_TAIL; @@ -5288,8 +5169,6 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp) kfree(args->ops); args->ops = args->iops; } - kfree(args->tmpp); - args->tmpp = NULL; while (args->to_free) { struct svcxdr_tmpbuf *tb = args->to_free; args->to_free = tb->next; @@ -5302,19 +5181,11 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd4_compoundargs *args = rqstp->rq_argp; - if (rqstp->rq_arg.head[0].iov_len % 4) { - /* client is nuts */ - dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)", - __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid)); - return 0; - } - args->p = p; - args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; - args->pagelist = rqstp->rq_arg.pages; - args->pagelen = rqstp->rq_arg.page_len; - args->tail = false; + /* svcxdr_tmp_alloc */ args->tmpp = NULL; args->to_free = NULL; + + args->xdr = &rqstp->rq_arg_stream; args->ops = args->iops; args->rqstp = rqstp; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 37f89ad5e992..0eb13bd603ea 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -419,8 +419,7 @@ struct nfsd4_write { u64 wr_offset; /* request */ u32 wr_stable_how; /* request */ u32 wr_buflen; /* request */ - struct kvec wr_head; - struct page ** wr_pagelist; /* request */ + struct xdr_buf wr_payload; /* request */ u32 wr_bytes_written; /* response */ u32 wr_how_written; /* response */ @@ -696,15 +695,10 @@ struct svcxdr_tmpbuf { struct nfsd4_compoundargs { /* scratch variables for XDR decode */ - __be32 * p; - __be32 * end; - struct page ** pagelist; - int pagelen; - bool tail; __be32 tmp[8]; __be32 * tmpp; + struct xdr_stream *xdr; struct svcxdr_tmpbuf *to_free; - struct svc_rqst *rqstp; u32 taglen; diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 2729d2d6efce..cc64cd0891f1 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -254,6 +254,8 @@ extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern uint64_t xdr_align_data(struct xdr_stream *, uint64_t, uint32_t); extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); +extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, + unsigned int len); /** * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. 
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index c607744b3ea8..757560a3b06b 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1407,6 +1407,51 @@ int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, } EXPORT_SYMBOL_GPL(xdr_buf_subsegment); +/** + * xdr_stream_subsegment - set @subbuf to a portion of @xdr + * @xdr: an xdr_stream set up for decoding + * @subbuf: the result buffer + * @nbytes: length of @xdr to extract, in bytes + * + * Sets up @subbuf to represent a portion of @xdr. The portion + * starts at the current offset in @xdr, and extends for a length + * of @nbytes. If this is successful, @xdr is advanced to the next + * position following that portion. + * + * Return values: + * %true: @subbuf has been initialized, and @xdr has been advanced. + * %false: a bounds error has occurred + */ +bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, + unsigned int nbytes) +{ + unsigned int remaining, offset, len; + + if (xdr_buf_subsegment(xdr->buf, subbuf, xdr_stream_pos(xdr), nbytes)) + return false; + + if (subbuf->head[0].iov_len) + if (!__xdr_inline_decode(xdr, subbuf->head[0].iov_len)) + return false; + + remaining = subbuf->page_len; + offset = subbuf->page_base; + while (remaining) { + len = min_t(unsigned int, remaining, PAGE_SIZE) - offset; + + if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr)) + return false; + if (!__xdr_inline_decode(xdr, len)) + return false; + + remaining -= len; + offset = 0; + } + + return true; +} +EXPORT_SYMBOL_GPL(xdr_stream_subsegment); + /** * xdr_buf_trim - lop at most "len" bytes off the end of "buf" * @buf: buf to be trimmed -- cgit From cbd9abb3706e96563b36af67595707a7054ab693 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 3 Nov 2020 13:19:51 -0500 Subject: NFSD: Replace READ* macros in nfsd4_decode_commit() Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 12 +++++------- include/linux/sunrpc/xdr.h | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1e5188b2f943..ff1e3621d977 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -581,13 +581,11 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) static __be32 nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) { - DECODE_HEAD; - - READ_BUF(12); - p = xdr_decode_hyper(p, &commit->co_offset); - commit->co_count = be32_to_cpup(p++); - - DECODE_TAIL; + if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0) + return nfserr_bad_xdr; + return nfs_ok; } static __be32 diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index cc64cd0891f1..cc669d95c484 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -571,6 +571,27 @@ xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) return 0; } +/** + * xdr_stream_decode_u64 - Decode a 64-bit integer + * @xdr: pointer to xdr_stream + * @ptr: location to store 64-bit integer + * + * Return values: + * %0 on success + * %-EBADMSG on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) +{ + const size_t count = sizeof(*ptr); + __be32 *p = xdr_inline_decode(xdr, count); + + if (unlikely(!p)) + return -EBADMSG; + xdr_decode_hyper(p, ptr); + return 0; +} + /** * xdr_stream_decode_opaque_fixed - Decode fixed length opaque xdr data * @xdr: pointer 
to xdr_stream -- cgit From 8918cc0d2b72db9997390626010b182c4500d749 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 16 Nov 2020 17:16:52 -0500 Subject: NFSD: Add helper for decoding locker4 Refactor for clarity. Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 64 +++++++++++++++++++++++++++++++--------------- include/linux/sunrpc/xdr.h | 21 +++++++++++++++ 2 files changed, 64 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1b214b475d68..b85ccc94bfe0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -833,6 +833,48 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen); } +static __be32 +nfsd4_decode_open_to_lock_owner4(struct nfsd4_compoundargs *argp, + struct nfsd4_lock *lock) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_open_seqid) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lock->lk_new_open_stateid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_lock_seqid) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_state_owner4(argp, &lock->lk_new_clientid, + &lock->lk_new_owner); +} + +static __be32 +nfsd4_decode_exist_lock_owner4(struct nfsd4_compoundargs *argp, + struct nfsd4_lock *lock) +{ + __be32 status; + + status = nfsd4_decode_stateid4(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_old_lock_seqid) < 0) + return nfserr_bad_xdr; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +{ + if (xdr_stream_decode_bool(argp->xdr, &lock->lk_is_new) < 0) + return nfserr_bad_xdr; + if (lock->lk_is_new) + return nfsd4_decode_open_to_lock_owner4(argp, lock); + return nfsd4_decode_exist_lock_owner4(argp, lock); +} + static __be32 nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) { @@ -848,27 +890,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) lock->lk_reclaim = be32_to_cpup(p++); p = xdr_decode_hyper(p, &lock->lk_offset); p = xdr_decode_hyper(p, &lock->lk_length); - lock->lk_is_new = be32_to_cpup(p++); - - if (lock->lk_is_new) { - READ_BUF(4); - lock->lk_new_open_seqid = be32_to_cpup(p++); - status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); - if (status) - return status; - READ_BUF(4); - lock->lk_new_lock_seqid = be32_to_cpup(p++); - status = nfsd4_decode_state_owner4(argp, &lock->lk_new_clientid, - &lock->lk_new_owner); - if (status) - return status; - } else { - status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); - if (status) - return status; - READ_BUF(4); - lock->lk_old_lock_seqid = be32_to_cpup(p++); - } + status = nfsd4_decode_locker4(argp, lock); DECODE_TAIL; } diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index cc669d95c484..9b35ce50cf2b 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -550,6 +550,27 @@ static inline bool xdr_item_is_present(const __be32 *p) return *p != xdr_zero; } +/** + * xdr_stream_decode_bool - Decode a boolean + * @xdr: pointer to xdr_stream + * @ptr: pointer to a u32 in which to store the result + * + * Return values: + * %0 on success + * %-EBADMSG on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_decode_bool(struct xdr_stream *xdr, __u32 *ptr) +{ + const size_t count = sizeof(*ptr); + __be32 *p = 
xdr_inline_decode(xdr, count); + + if (unlikely(!p)) + return -EBADMSG; + *ptr = (*p != xdr_zero); + return 0; +} + /** * xdr_stream_decode_u32 - Decode a 32-bit integer * @xdr: pointer to xdr_stream -- cgit From 7fd3253a7de6a317a0683f83739479fb880bffc8 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 30 Nov 2020 19:51:56 +0100 Subject: net: Introduce preferred busy-polling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing busy-polling mode, enabled by the SO_BUSY_POLL socket option or system-wide using the /proc/sys/net/core/busy_read knob, is opportunistic. That means that if the NAPI context is not scheduled, the busy-polling system call will poll it. If, after busy-polling, the budget is exceeded, the busy-polling logic will schedule the NAPI onto the regular softirq handling. One implication of the behavior above is that a busy/heavily loaded NAPI context will never enter/allow for busy-polling. Some applications prefer that most NAPI processing be done by busy-polling. This series adds a new socket option, SO_PREFER_BUSY_POLL, that works in concert with the napi_defer_hard_irqs and gro_flush_timeout knobs. The napi_defer_hard_irqs and gro_flush_timeout knobs were introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral feature"), and allow a user to defer enabling interrupts and instead schedule the NAPI context from a watchdog timer. When a user enables SO_PREFER_BUSY_POLL, again with the other knobs enabled, and the NAPI context is being processed by a softirq, the softirq NAPI processing will exit early to allow the busy-polling to be performed. If the application stops performing busy-polling via a system call, the watchdog timer defined by gro_flush_timeout will time out, and regular softirq handling will resume. In summary: heavy-traffic applications that prefer busy-polling over softirq processing should use this option. Example usage: $ echo 2 | sudo tee /sys/class/net/ens785f1/napi_defer_hard_irqs $ echo 200000 | sudo tee /sys/class/net/ens785f1/gro_flush_timeout Note that the timeout should be larger than the userspace processing window, otherwise the watchdog will time out and fall back to regular softirq processing. Enable the SO_BUSY_POLL/SO_PREFER_BUSY_POLL options on your socket.
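A minimal userspace sketch, assuming a kernel with this series applied (the fallback define uses the asm-generic value added by this patch; the socket fd and error handling are illustrative):

	#include <sys/socket.h>

	#ifndef SO_PREFER_BUSY_POLL
	#define SO_PREFER_BUSY_POLL 69	/* asm-generic value from this patch */
	#endif

	static int enable_preferred_busy_poll(int fd)
	{
		int usecs = 100;	/* SO_BUSY_POLL: busy-poll up to 100 us */
		int on = 1;

		if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL,
			       &usecs, sizeof(usecs)) < 0)
			return -1;
		/* New in this patch; requires CAP_NET_ADMIN, as enforced
		 * in the net/core/sock.c hunk below.
		 */
		return setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
				  &on, sizeof(on));
	}
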
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20201130185205.196029-2-bjorn.topel@gmail.com --- arch/alpha/include/uapi/asm/socket.h | 2 + arch/mips/include/uapi/asm/socket.h | 2 + arch/parisc/include/uapi/asm/socket.h | 2 + arch/sparc/include/uapi/asm/socket.h | 2 + fs/eventpoll.c | 2 +- include/linux/netdevice.h | 35 +++++++++------- include/net/busy_poll.h | 5 ++- include/net/sock.h | 4 ++ include/uapi/asm-generic/socket.h | 2 + net/core/dev.c | 78 ++++++++++++++++++++++++++++------- net/core/sock.c | 9 ++++ 11 files changed, 111 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index de6c4df61082..538359642554 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -124,6 +124,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index d0a9ed2ca2d6..e406e73b5e6e 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -135,6 +135,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 10173c32195e..1bc46200889d 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -116,6 +116,8 @@ #define SO_DETACH_REUSEPORT_BPF 0x4042 +#define SO_PREFER_BUSY_POLL 0x4043 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 8029b681fc7c..99688cf673a4 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -117,6 +117,8 @@ #define SO_DETACH_REUSEPORT_BPF 0x0047 +#define SO_PREFER_BUSY_POLL 0x0048 + #if !defined(__KERNEL__) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4df61129566d..e11fab3a0b9e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -397,7 +397,7 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); + napi_busy_loop(napi_id, nonblock ? 
NULL : ep_busy_loop_end, ep, false); } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ce648a564f7..52d1cc2bd8a7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -350,23 +350,25 @@ struct napi_struct { }; enum { - NAPI_STATE_SCHED, /* Poll is scheduled */ - NAPI_STATE_MISSED, /* reschedule a napi */ - NAPI_STATE_DISABLE, /* Disable pending */ - NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_LISTED, /* NAPI added to system lists */ - NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ - NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ + NAPI_STATE_SCHED, /* Poll is scheduled */ + NAPI_STATE_MISSED, /* reschedule a napi */ + NAPI_STATE_DISABLE, /* Disable pending */ + NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ + NAPI_STATE_LISTED, /* NAPI added to system lists */ + NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ + NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ + NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ }; enum { - NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), - NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), - NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), - NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), - NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), - NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), - NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), + NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), + NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), + NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), + NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), }; enum gro_result { @@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n) return test_bit(NAPI_STATE_DISABLE, &n->state); } +static inline bool napi_prefer_busy_poll(struct napi_struct *n) +{ + return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); +} + bool napi_schedule_prep(struct napi_struct *n); /** diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index b001fa91c14e..0292b8353d7e 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -43,7 +43,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg); + void *loop_end_arg, bool prefer_busy_poll); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -105,7 +105,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) - napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); + napi_busy_loop(napi_id, nonblock ? 
NULL : sk_busy_loop_end, sk, + READ_ONCE(sk->sk_prefer_busy_poll)); #endif } diff --git a/include/net/sock.h b/include/net/sock.h index a5c6ae78df77..d49b89b071b6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -301,6 +301,7 @@ struct bpf_local_storage; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family @@ -479,6 +480,9 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; +#ifdef CONFIG_NET_RX_BUSY_POLL + u8 sk_prefer_busy_poll; +#endif struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 77f7c1638eb1..7dd02408b7ce 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -119,6 +119,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/dev.c b/net/core/dev.c index 60d325bda0d7..6f8d2cffb7c5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6458,7 +6458,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. @@ -6497,8 +6498,29 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #define BUSY_POLL_BUDGET 8 -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { + if (!skip_schedule) { + gro_normal_list(napi); + __napi_schedule(napi); + return; + } + + if (napi->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(napi, HZ >= 1000); + } + + gro_normal_list(napi); + clear_bit(NAPI_STATE_SCHED, &napi->state); +} + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll) +{ + bool skip_schedule = false; + unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6515,6 +6537,15 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) local_bh_disable(); + if (prefer_busy_poll) { + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } + /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ @@ -6525,19 +6556,14 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) */ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) { - /* As the whole budget was spent, we still own the napi so can - * safely handle the rx_list. 
- */ - gro_normal_list(napi); - __napi_schedule(napi); - } + if (rc == BUSY_POLL_BUDGET) + __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) + void *loop_end_arg, bool prefer_busy_poll) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6565,12 +6591,18 @@ restart: * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | - NAPIF_STATE_IN_BUSY_POLL)) + NAPIF_STATE_IN_BUSY_POLL)) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | - NAPIF_STATE_SCHED) != val) + NAPIF_STATE_SCHED) != val) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } @@ -6588,7 +6620,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6599,7 +6631,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); preempt_enable(); out: rcu_read_unlock(); @@ -6650,8 +6682,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && - !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); + } return HRTIMER_NORESTART; } @@ -6709,6 +6743,7 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6781,6 +6816,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + goto out_unlock; + } + if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. diff --git a/net/core/sock.c b/net/core/sock.c index 727ea1cc633c..e05f2e52b5a8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1159,6 +1159,12 @@ set_sndbuf: sk->sk_ll_usec = val; } break; + case SO_PREFER_BUSY_POLL: + if (valbool && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + break; #endif case SO_MAX_PACING_RATE: @@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; + case SO_PREFER_BUSY_POLL: + v.val = READ_ONCE(sk->sk_prefer_busy_poll); + break; #endif case SO_MAX_PACING_RATE: -- cgit From 147ad605dc12c515c97136899ccb5c70e6c674e1 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:16 +0100 Subject: init: use type alignment for kernel parameters Specify type alignment for kernel parameters instead of sizeof(long). 
The alignment attribute is used to prevent gcc from increasing the alignment of objects with static extent as an optimisation, something which would mess up the __setup array stride. Using __alignof__(struct obs_kernel_param) rather than sizeof(long) is preferred since it better indicates why it is there and doesn't break should the type size or alignment change. Note that on m68k the alignment of struct obs_kernel_param is actually two and that adding a 1- or 2-byte field to the 12-byte struct would cause breakage with the current 4-byte alignment. Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 7b53cb3092ee..e668832ef66a 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -255,7 +255,7 @@ struct obs_kernel_param { __aligned(1) = str; \ static struct obs_kernel_param __setup_##unique_id \ __used __section(".init.setup") \ - __attribute__((aligned((sizeof(long))))) \ + __aligned(__alignof__(struct obs_kernel_param)) \ = { __setup_str_##unique_id, fn, early } #define __setup(str, fn) \ -- cgit From a2abe7cbd8fe2db5ff386c968e2273d9dc6c468d Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 30 Nov 2020 15:34:41 -0800 Subject: scs: switch to vmapped shadow stacks The kernel currently uses kmem_cache to allocate shadow call stacks, which means an overflow may not be immediately detected and can potentially result in another task's shadow stack being overwritten. This change switches SCS to use virtually mapped shadow stacks for tasks, which increases shadow stack size to a full page and provides more robust overflow detection, similarly to VMAP_STACK. Signed-off-by: Sami Tolvanen Acked-by: Will Deacon Link: https://lore.kernel.org/r/20201130233442.2562064-2-samitolvanen@google.com Signed-off-by: Will Deacon --- include/linux/scs.h | 12 ++++----- kernel/scs.c | 71 ++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scs.h b/include/linux/scs.h index 6dec390cf154..2a506c2a16f4 100644 --- a/include/linux/scs.h +++ b/include/linux/scs.h @@ -15,12 +15,8 @@ #ifdef CONFIG_SHADOW_CALL_STACK -/* - * In testing, 1 KiB shadow stack size (i.e. 128 stack frames on a 64-bit - * architecture) provided ~40% safety margin on stack usage while keeping - * memory allocation overhead reasonable. - */ -#define SCS_SIZE SZ_1K +#define SCS_ORDER 0 +#define SCS_SIZE (PAGE_SIZE << SCS_ORDER) #define GFP_SCS (GFP_KERNEL | __GFP_ZERO) /* An illegal pointer value to mark the end of the shadow stack. 
*/ @@ -33,6 +29,8 @@ #define task_scs(tsk) (task_thread_info(tsk)->scs_base) #define task_scs_sp(tsk) (task_thread_info(tsk)->scs_sp) +void *scs_alloc(int node); +void scs_free(void *s); void scs_init(void); int scs_prepare(struct task_struct *tsk, int node); void scs_release(struct task_struct *tsk); @@ -61,6 +59,8 @@ static inline bool task_scs_end_corrupted(struct task_struct *tsk) #else /* CONFIG_SHADOW_CALL_STACK */ +static inline void *scs_alloc(int node) { return NULL; } +static inline void scs_free(void *s) {} static inline void scs_init(void) {} static inline void scs_task_reset(struct task_struct *tsk) {} static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; } diff --git a/kernel/scs.c b/kernel/scs.c index 4ff4a7ba0094..e2a71fc82fa0 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -5,26 +5,49 @@ * Copyright (C) 2019 Google LLC */ +#include #include #include #include -#include +#include #include -static struct kmem_cache *scs_cache; - static void __scs_account(void *s, int account) { - struct page *scs_page = virt_to_page(s); + struct page *scs_page = vmalloc_to_page(s); mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB, account * (SCS_SIZE / SZ_1K)); } -static void *scs_alloc(int node) +/* Matches NR_CACHED_STACKS for VMAP_STACK */ +#define NR_CACHED_SCS 2 +static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]); + +static void *__scs_alloc(int node) { - void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + int i; + void *s; + + for (i = 0; i < NR_CACHED_SCS; i++) { + s = this_cpu_xchg(scs_cache[i], NULL); + if (s) { + kasan_unpoison_vmalloc(s, SCS_SIZE); + memset(s, 0, SCS_SIZE); + return s; + } + } + + return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, + GFP_SCS, PAGE_KERNEL, 0, node, + __builtin_return_address(0)); +} +void *scs_alloc(int node) +{ + void *s; + + s = __scs_alloc(node); if (!s) return NULL; @@ -34,21 +57,47 @@ static void *scs_alloc(int node) * Poison the allocation to catch unintentional accesses to * the shadow stack when KASAN is enabled. */ - kasan_poison_object_data(scs_cache, s); + kasan_poison_vmalloc(s, SCS_SIZE); __scs_account(s, 1); return s; } -static void scs_free(void *s) +void scs_free(void *s) { + int i; + __scs_account(s, -1); - kasan_unpoison_object_data(scs_cache, s); - kmem_cache_free(scs_cache, s); + + /* + * We cannot sleep as this can be called in interrupt context, + * so use this_cpu_cmpxchg to update the cache, and vfree_atomic + * to free the stack. + */ + + for (i = 0; i < NR_CACHED_SCS; i++) + if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) + return; + + vfree_atomic(s); +} + +static int scs_cleanup(unsigned int cpu) +{ + int i; + void **cache = per_cpu_ptr(scs_cache, cpu); + + for (i = 0; i < NR_CACHED_SCS; i++) { + vfree(cache[i]); + cache[i] = NULL; + } + + return 0; } void __init scs_init(void) { - scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); + cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL, + scs_cleanup); } int scs_prepare(struct task_struct *tsk, int node) -- cgit From ac20ffbb0279aae7be48567fb734eae7d050769e Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 30 Nov 2020 15:34:42 -0800 Subject: arm64: scs: use vmapped IRQ and SDEI shadow stacks Use scs_alloc() to also allocate the IRQ and SDEI shadow stacks, instead of using statically allocated stacks. 
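Reduced to its core, the conversion replaces each static DEFINE_SCS() array with a per-CPU pointer populated from scs_alloc() at boot; this condensed sketch of the patch's init_irq_scs() shows the shape (the SDEI path in the hunks below adds error unwinding on top of the same pattern):

	/* Condensed from the irq.c hunk below. Previously:
	 *   DEFINE_PER_CPU(unsigned long [SCS_SIZE/sizeof(long)], irq_shadow_call_stack);
	 * Now a per-CPU pointer filled from the vmapped allocator.
	 */
	DEFINE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr);

	static void init_irq_scs(void)
	{
		int cpu;

		if (!IS_ENABLED(CONFIG_SHADOW_CALL_STACK))
			return;

		for_each_possible_cpu(cpu)
			per_cpu(irq_shadow_call_stack_ptr, cpu) =
				scs_alloc(cpu_to_node(cpu));
	}
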
Signed-off-by: Sami Tolvanen Acked-by: Will Deacon Link: https://lore.kernel.org/r/20201130233442.2562064-3-samitolvanen@google.com [will: Move CONFIG_SHADOW_CALL_STACK check into init_irq_scs()] Signed-off-by: Will Deacon --- arch/arm64/kernel/Makefile | 1 - arch/arm64/kernel/entry.S | 6 ++-- arch/arm64/kernel/irq.c | 21 ++++++++++++++ arch/arm64/kernel/scs.c | 16 ----------- arch/arm64/kernel/sdei.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/scs.h | 4 --- 6 files changed, 94 insertions(+), 24 deletions(-) delete mode 100644 arch/arm64/kernel/scs.c (limited to 'include/linux') diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index bbaf0bc4ad60..86364ab6f13f 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -58,7 +58,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o -obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso/ probes/ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b295fb912b12..5c2ac4b5b2da 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -441,7 +441,7 @@ SYM_CODE_END(__swpan_exit_el0) #ifdef CONFIG_SHADOW_CALL_STACK /* also switch to the irq shadow stack */ - adr_this_cpu scs_sp, irq_shadow_call_stack, x26 + ldr_this_cpu scs_sp, irq_shadow_call_stack_ptr, x26 #endif 9998: @@ -1097,9 +1097,9 @@ SYM_CODE_START(__sdei_asm_handler) #ifdef CONFIG_SHADOW_CALL_STACK /* Use a separate shadow call stack for normal and critical events */ cbnz w4, 3f - adr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal, tmp=x6 + ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal_ptr, tmp=x6 b 4f -3: adr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical, tmp=x6 +3: ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical_ptr, tmp=x6 4: #endif diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 9cf2fb87584a..ac54c21140d4 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,25 @@ DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts); DEFINE_PER_CPU(unsigned long *, irq_stack_ptr); + +DECLARE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr); + +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr); +#endif + +static void init_irq_scs(void) +{ + int cpu; + + if (!IS_ENABLED(CONFIG_SHADOW_CALL_STACK)) + return; + + for_each_possible_cpu(cpu) + per_cpu(irq_shadow_call_stack_ptr, cpu) = + scs_alloc(cpu_to_node(cpu)); +} + #ifdef CONFIG_VMAP_STACK static void init_irq_stacks(void) { @@ -54,6 +74,7 @@ static void init_irq_stacks(void) void __init init_IRQ(void) { init_irq_stacks(); + init_irq_scs(); irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); diff --git a/arch/arm64/kernel/scs.c b/arch/arm64/kernel/scs.c deleted file mode 100644 index e8f7ff45dd8f..000000000000 --- a/arch/arm64/kernel/scs.c +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shadow Call Stack support. 
- * - * Copyright (C) 2019 Google LLC - */ - -#include -#include - -DEFINE_SCS(irq_shadow_call_stack); - -#ifdef CONFIG_ARM_SDE_INTERFACE -DEFINE_SCS(sdei_shadow_call_stack_normal); -DEFINE_SCS(sdei_shadow_call_stack_critical); -#endif diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 7689f2031c0c..d12fd786b267 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -37,6 +38,14 @@ DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); #endif +DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); +DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); + +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); +DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); +#endif + static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu) { unsigned long *p; @@ -90,6 +99,59 @@ static int init_sdei_stacks(void) return err; } +static void _free_sdei_scs(unsigned long * __percpu *ptr, int cpu) +{ + void *s; + + s = per_cpu(*ptr, cpu); + if (s) { + per_cpu(*ptr, cpu) = NULL; + scs_free(s); + } +} + +static void free_sdei_scs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + _free_sdei_scs(&sdei_shadow_call_stack_normal_ptr, cpu); + _free_sdei_scs(&sdei_shadow_call_stack_critical_ptr, cpu); + } +} + +static int _init_sdei_scs(unsigned long * __percpu *ptr, int cpu) +{ + void *s; + + s = scs_alloc(cpu_to_node(cpu)); + if (!s) + return -ENOMEM; + per_cpu(*ptr, cpu) = s; + + return 0; +} + +static int init_sdei_scs(void) +{ + int cpu; + int err = 0; + + for_each_possible_cpu(cpu) { + err = _init_sdei_scs(&sdei_shadow_call_stack_normal_ptr, cpu); + if (err) + break; + err = _init_sdei_scs(&sdei_shadow_call_stack_critical_ptr, cpu); + if (err) + break; + } + + if (err) + free_sdei_scs(); + + return err; +} + static bool on_sdei_normal_stack(unsigned long sp, struct stack_info *info) { unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); @@ -138,6 +200,14 @@ unsigned long sdei_arch_get_entry_point(int conduit) return 0; } + if (IS_ENABLED(CONFIG_SHADOW_CALL_STACK)) { + if (init_sdei_scs()) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) + free_sdei_stacks(); + return 0; + } + } + sdei_exit_mode = (conduit == SMCCC_CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC; #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 diff --git a/include/linux/scs.h b/include/linux/scs.h index 2a506c2a16f4..18122d9e17ff 100644 --- a/include/linux/scs.h +++ b/include/linux/scs.h @@ -22,10 +22,6 @@ /* An illegal pointer value to mark the end of the shadow stack. */ #define SCS_END_MAGIC (0x5f6UL + POISON_POINTER_DELTA) -/* Allocate a static per-CPU shadow stack */ -#define DEFINE_SCS(name) \ - DEFINE_PER_CPU(unsigned long [SCS_SIZE/sizeof(long)], name) \ - #define task_scs(tsk) (task_thread_info(tsk)->scs_base) #define task_scs_sp(tsk) (task_thread_info(tsk)->scs_sp) -- cgit From 57d9352b6c651b090179f8b223b6681275c64a4f Mon Sep 17 00:00:00 2001 From: Moritz Fischer Date: Sun, 15 Nov 2020 11:51:18 -0800 Subject: fpga: fpga-mgr: Add devm_fpga_mgr_register() API Add a devm_fpga_mgr_register() API that can be used to register a FPGA Manager that was created using devm_fpga_mgr_create(). Introduce a struct fpga_mgr_devres that makes the devres allocation a little bit more readable and gets reused for devm_fpga_mgr_create() devm_fpga_mgr_register(). 
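To make the intended pairing of devm_fpga_mgr_create() and devm_fpga_mgr_register() concrete, here is a minimal, hypothetical probe() sketch; demo_mgr_ops and demo_fpga_probe are illustrative names and the mandatory ops callbacks (.state, .write, etc.) are elided:

    #include <linux/fpga/fpga-mgr.h>
    #include <linux/platform_device.h>
    #include <linux/errno.h>

    static const struct fpga_manager_ops demo_mgr_ops; /* callbacks elided */

    static int demo_fpga_probe(struct platform_device *pdev)
    {
            struct fpga_manager *mgr;

            mgr = devm_fpga_mgr_create(&pdev->dev, "demo fpga manager",
                                       &demo_mgr_ops, NULL);
            if (!mgr)
                    return -ENOMEM;

            /* Unregistration happens automatically on device detach,
             * so no remove() counterpart is needed. */
            return devm_fpga_mgr_register(&pdev->dev, mgr);
    }

Note that devm_fpga_mgr_register() insists the manager itself be devres-managed: the devres_find()/WARN_ON check in the hunk below returns -EINVAL otherwise.
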
Reviewed-by: Tom Rix Signed-off-by: Moritz Fischer Link: https://lore.kernel.org/r/20201115195127.284487-2-mdf@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/fpga/fpga-mgr.c | 81 ++++++++++++++++++++++++++++++++++++------- include/linux/fpga/fpga-mgr.h | 2 ++ 2 files changed, 71 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c index f38bab01432e..b85bc47c91a9 100644 --- a/drivers/fpga/fpga-mgr.c +++ b/drivers/fpga/fpga-mgr.c @@ -21,6 +21,10 @@ static DEFINE_IDA(fpga_mgr_ida); static struct class *fpga_mgr_class; +struct fpga_mgr_devres { + struct fpga_manager *mgr; +}; + /** * fpga_image_info_alloc - Allocate a FPGA image info struct * @dev: owning device @@ -625,9 +629,9 @@ EXPORT_SYMBOL_GPL(fpga_mgr_free); static void devm_fpga_mgr_release(struct device *dev, void *res) { - struct fpga_manager *mgr = *(struct fpga_manager **)res; + struct fpga_mgr_devres *dr = res; - fpga_mgr_free(mgr); + fpga_mgr_free(dr->mgr); } /** @@ -651,21 +655,21 @@ struct fpga_manager *devm_fpga_mgr_create(struct device *dev, const char *name, const struct fpga_manager_ops *mops, void *priv) { - struct fpga_manager **ptr, *mgr; + struct fpga_mgr_devres *dr; - ptr = devres_alloc(devm_fpga_mgr_release, sizeof(*ptr), GFP_KERNEL); - if (!ptr) + dr = devres_alloc(devm_fpga_mgr_release, sizeof(*dr), GFP_KERNEL); + if (!dr) return NULL; - mgr = fpga_mgr_create(dev, name, mops, priv); - if (!mgr) { - devres_free(ptr); - } else { - *ptr = mgr; - devres_add(dev, ptr); + dr->mgr = fpga_mgr_create(dev, name, mops, priv); + if (!dr->mgr) { + devres_free(dr); + return NULL; } - return mgr; + devres_add(dev, dr); + + return dr->mgr; } EXPORT_SYMBOL_GPL(devm_fpga_mgr_create); @@ -722,6 +726,59 @@ void fpga_mgr_unregister(struct fpga_manager *mgr) } EXPORT_SYMBOL_GPL(fpga_mgr_unregister); +static int fpga_mgr_devres_match(struct device *dev, void *res, + void *match_data) +{ + struct fpga_mgr_devres *dr = res; + + return match_data == dr->mgr; +} + +static void devm_fpga_mgr_unregister(struct device *dev, void *res) +{ + struct fpga_mgr_devres *dr = res; + + fpga_mgr_unregister(dr->mgr); +} + +/** + * devm_fpga_mgr_register - resource managed variant of fpga_mgr_register() + * @dev: managing device for this FPGA manager + * @mgr: fpga manager struct + * + * This is the devres variant of fpga_mgr_register() for which the unregister + * function will be called automatically when the managing device is detached. + */ +int devm_fpga_mgr_register(struct device *dev, struct fpga_manager *mgr) +{ + struct fpga_mgr_devres *dr; + int ret; + + /* + * Make sure that the struct fpga_manager * that is passed in is + * managed itself. 
+ */ + if (WARN_ON(!devres_find(dev, devm_fpga_mgr_release, + fpga_mgr_devres_match, mgr))) + return -EINVAL; + + dr = devres_alloc(devm_fpga_mgr_unregister, sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + ret = fpga_mgr_register(mgr); + if (ret) { + devres_free(dr); + return ret; + } + + dr->mgr = mgr; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL_GPL(devm_fpga_mgr_register); + static void fpga_mgr_dev_release(struct device *dev) { } diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h index e8ca62b2cb5b..2bc3030a69e5 100644 --- a/include/linux/fpga/fpga-mgr.h +++ b/include/linux/fpga/fpga-mgr.h @@ -198,6 +198,8 @@ void fpga_mgr_free(struct fpga_manager *mgr); int fpga_mgr_register(struct fpga_manager *mgr); void fpga_mgr_unregister(struct fpga_manager *mgr); +int devm_fpga_mgr_register(struct device *dev, struct fpga_manager *mgr); + struct fpga_manager *devm_fpga_mgr_create(struct device *dev, const char *name, const struct fpga_manager_ops *mops, void *priv); -- cgit From fa69ee5aa48b5b52e8028c2eb486906e9998d081 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 25 Nov 2020 23:48:40 +0100 Subject: net: switch to storing KCOV handle directly in sk_buff It turns out that usage of skb extensions can cause memory leaks. Ido Schimmel reported: "[...] there are instances that blindly overwrite 'skb->extensions' by invoking skb_copy_header() after __alloc_skb()." Therefore, give up on using skb extensions for KCOV handle, and instead directly store kcov_handle in sk_buff. Fixes: 6370cc3bbd8a ("net: add kcov handle to skb extensions") Fixes: 85ce50d337d1 ("net: kcov: don't select SKB_EXTENSIONS when there is no NET") Fixes: 97f53a08cba1 ("net: linux/skbuff.h: combine SKB_EXTENSIONS + KCOV handling") Link: https://lore.kernel.org/linux-wireless/20201121160941.GA485907@shredder.lan/ Reported-by: Ido Schimmel Signed-off-by: Marco Elver Link: https://lore.kernel.org/r/20201125224840.2014773-1-elver@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 37 +++++++++++++------------------------ lib/Kconfig.debug | 1 - net/core/skbuff.c | 6 ------ 3 files changed, 13 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0a1239819fd2..333bcdc39635 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -701,6 +701,7 @@ typedef unsigned char *sk_buff_data_t; * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header + * @kcov_handle: KCOV remote handle for remote coverage collection * @tail: Tail pointer * @end: End pointer * @head: Head of buffer @@ -904,6 +905,10 @@ struct sk_buff { __u16 network_header; __u16 mac_header; +#ifdef CONFIG_KCOV + u64 kcov_handle; +#endif + /* private: */ __u32 headers_end[0]; /* public: */ @@ -4150,9 +4155,6 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MPTCP) SKB_EXT_MPTCP, -#endif -#if IS_ENABLED(CONFIG_KCOV) - SKB_EXT_KCOV_HANDLE, #endif SKB_EXT_NUM, /* must be last */ }; @@ -4608,35 +4610,22 @@ static inline void skb_reset_redirect(struct sk_buff *skb) #endif } -#if IS_ENABLED(CONFIG_KCOV) && IS_ENABLED(CONFIG_SKB_EXTENSIONS) static inline void skb_set_kcov_handle(struct sk_buff *skb, const u64 kcov_handle) { - /* Do not allocate skb extensions only to set kcov_handle to zero - * (as it is zero by default). 
However, if the extensions are - * already allocated, update kcov_handle anyway since - * skb_set_kcov_handle can be called to zero a previously set - * value. - */ - if (skb_has_extensions(skb) || kcov_handle) { - u64 *kcov_handle_ptr = skb_ext_add(skb, SKB_EXT_KCOV_HANDLE); - - if (kcov_handle_ptr) - *kcov_handle_ptr = kcov_handle; - } +#ifdef CONFIG_KCOV + skb->kcov_handle = kcov_handle; +#endif } static inline u64 skb_get_kcov_handle(struct sk_buff *skb) { - u64 *kcov_handle = skb_ext_find(skb, SKB_EXT_KCOV_HANDLE); - - return kcov_handle ? *kcov_handle : 0; -} +#ifdef CONFIG_KCOV + return skb->kcov_handle; #else -static inline void skb_set_kcov_handle(struct sk_buff *skb, - const u64 kcov_handle) { } -static inline u64 skb_get_kcov_handle(struct sk_buff *skb) { return 0; } -#endif /* CONFIG_KCOV && CONFIG_SKB_EXTENSIONS */ + return 0; +#endif +} #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 826a205ffd1c..1d15cdaf1b89 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1879,7 +1879,6 @@ config KCOV depends on CC_HAS_SANCOV_TRACE_PC || GCC_PLUGINS select DEBUG_FS select GCC_PLUGIN_SANCOV if !CC_HAS_SANCOV_TRACE_PC - select SKB_EXTENSIONS if NET help KCOV exposes kernel code coverage information in a form suitable for coverage-guided fuzzing (randomized testing). diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ad98265f1dd1..90d3423e6017 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4210,9 +4210,6 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MPTCP) [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), #endif -#if IS_ENABLED(CONFIG_KCOV) - [SKB_EXT_KCOV_HANDLE] = SKB_EXT_CHUNKSIZEOF(u64), -#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -4229,9 +4226,6 @@ static __always_inline unsigned int skb_ext_total_length(void) #endif #if IS_ENABLED(CONFIG_MPTCP) skb_ext_type_len[SKB_EXT_MPTCP] + -#endif -#if IS_ENABLED(CONFIG_KCOV) - skb_ext_type_len[SKB_EXT_KCOV_HANDLE] + #endif 0; } -- cgit From 53ffabfd4ddb3a24c5603ae82eefb5537ecb5c20 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:03 -0800 Subject: block: move blk_rq_bio_prep() to linux/blk-mq.h This is a preparation patch to have minimal block layer request bio append functionality in the context of the NVMeOF Passthru driver which falls in the fast path and doesn't need calls from blk_rq_append_bio(). 
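Since the helper is now visible to any blk-mq user, a passthrough-style driver can attach a caller-built bio directly. A minimal sketch; demo_map_bio() is a hypothetical wrapper, the bio is assumed fully built, and in real callers the segment count usually comes from the mapping code rather than bio_segments():

    #include <linux/blk-mq.h>
    #include <linux/bio.h>

    static void demo_map_bio(struct request *rq, struct bio *bio)
    {
            /* Fills nr_phys_segments, __data_len, bio/biotail, ioprio
             * and rq_disk, i.e. the same fields the moved helper sets. */
            blk_rq_bio_prep(rq, bio, bio_segments(bio));
    }
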
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- block/blk.h | 12 ------------ include/linux/blk-mq.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/blk.h b/block/blk.h index dfab98465db9..e05507a8d1e3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -91,18 +91,6 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } -static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, - unsigned int nr_segs) -{ - rq->nr_phys_segments = nr_segs; - rq->__data_len = bio->bi_iter.bi_size; - rq->bio = rq->biotail = bio; - rq->ioprio = bio_prio(bio); - - if (bio->bi_disk) - rq->rq_disk = bio->bi_disk; -} - #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 794b2a33a2c3..e7482e6ad3ec 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -593,6 +593,18 @@ static inline void blk_mq_cleanup_rq(struct request *rq) rq->q->mq_ops->cleanup_rq(rq); } +static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, + unsigned int nr_segs) +{ + rq->nr_phys_segments = nr_segs; + rq->__data_len = bio->bi_iter.bi_size; + rq->bio = rq->biotail = bio; + rq->ioprio = bio_prio(bio); + + if (bio->bi_disk) + rq->rq_disk = bio->bi_disk; +} + blk_qc_t blk_mq_submit_bio(struct bio *bio); #endif -- cgit From b6f8ed33ab2bbc58e40fb1e2fb0f9c90cab04baf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Oct 2020 15:20:41 +0200 Subject: pstore/blk: remove {un,}register_pstore_blk This interface is entirely unused, so remove them and various bits of unreachable code. Signed-off-by: Christoph Hellwig Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201016132047.3068029-4-hch@lst.de --- Documentation/admin-guide/pstore-blk.rst | 5 +- fs/pstore/blk.c | 83 ++++---------------------------- include/linux/pstore_blk.h | 42 ---------------- 3 files changed, 11 insertions(+), 119 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/pstore-blk.rst b/Documentation/admin-guide/pstore-blk.rst index 83543cb3daa1..49d8149f8d32 100644 --- a/Documentation/admin-guide/pstore-blk.rst +++ b/Documentation/admin-guide/pstore-blk.rst @@ -151,10 +151,7 @@ otherwise KMSG_DUMP_MAX. Configurations for driver ------------------------- -Only a block device driver cares about these configurations. A block device -driver uses ``register_pstore_blk`` to register to pstore/blk. - -A non-block device driver uses ``register_pstore_device`` with +A device driver uses ``register_pstore_device`` with ``struct pstore_device_info`` to register to pstore/blk. .. 
kernel-doc:: fs/pstore/blk.c diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index fcd5563dde06..882d0bc0cfd7 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -90,7 +90,6 @@ MODULE_PARM_DESC(blkdev, "block device for pstore storage"); static DEFINE_MUTEX(pstore_blk_lock); static struct block_device *psblk_bdev; static struct pstore_zone_info *pstore_zone_info; -static pstore_blk_panic_write_op blkdev_panic_write; struct bdev_info { dev_t devt; @@ -341,24 +340,11 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, return ret; } -static ssize_t psblk_blk_panic_write(const char *buf, size_t size, - loff_t off) -{ - int ret; - - if (!blkdev_panic_write) - return -EOPNOTSUPP; - - /* size and off must align to SECTOR_SIZE for block device */ - ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT, - size >> SECTOR_SHIFT); - /* try next zone */ - if (ret == -ENOMSG) - return ret; - return ret ? -EIO : size; -} - -static int __register_pstore_blk(struct pstore_blk_info *info) +/* + * This takes its configuration only from the module parameters now. + * See psblk_get_bdev() and blkdev. + */ +static int __register_pstore_blk(void) { char bdev_name[BDEVNAME_SIZE]; struct block_device *bdev; @@ -378,68 +364,34 @@ static int __register_pstore_blk(struct pstore_blk_info *info) } /* only allow driver matching the @blkdev */ - if (!binfo.devt || (!best_effort && - MAJOR(binfo.devt) != info->major)) { - pr_debug("invalid major %u (expect %u)\n", - info->major, MAJOR(binfo.devt)); + if (!binfo.devt) { + pr_debug("no major\n"); ret = -ENODEV; goto err_put_bdev; } /* psblk_bdev must be assigned before register to pstore/blk */ psblk_bdev = bdev; - blkdev_panic_write = info->panic_write; - - /* Copy back block device details. */ - info->devt = binfo.devt; - info->nr_sects = binfo.nr_sects; - info->start_sect = binfo.start_sect; memset(&dev, 0, sizeof(dev)); - dev.total_size = info->nr_sects << SECTOR_SHIFT; - dev.flags = info->flags; + dev.total_size = binfo.nr_sects << SECTOR_SHIFT; dev.read = psblk_generic_blk_read; dev.write = psblk_generic_blk_write; - dev.erase = NULL; - dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL; ret = __register_pstore_device(&dev); if (ret) goto err_put_bdev; bdevname(bdev, bdev_name); - pr_info("attached %s%s\n", bdev_name, - info->panic_write ? "" : " (no dedicated panic_write!)"); + pr_info("attached %s (no dedicated panic_write!)\n", bdev_name); return 0; err_put_bdev: psblk_bdev = NULL; - blkdev_panic_write = NULL; psblk_put_bdev(bdev, holder); return ret; } -/** - * register_pstore_blk() - register block device to pstore/blk - * - * @info: details on the desired block device interface - * - * Return: - * * 0 - OK - * * Others - something error. 
- */ -int register_pstore_blk(struct pstore_blk_info *info) -{ - int ret; - - mutex_lock(&pstore_blk_lock); - ret = __register_pstore_blk(info); - mutex_unlock(&pstore_blk_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(register_pstore_blk); - static void __unregister_pstore_blk(unsigned int major) { struct pstore_device_info dev = { .read = psblk_generic_blk_read }; @@ -449,24 +401,10 @@ static void __unregister_pstore_blk(unsigned int major) if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { __unregister_pstore_device(&dev); psblk_put_bdev(psblk_bdev, holder); - blkdev_panic_write = NULL; psblk_bdev = NULL; } } -/** - * unregister_pstore_blk() - unregister block device from pstore/blk - * - * @major: the major device number of device - */ -void unregister_pstore_blk(unsigned int major) -{ - mutex_lock(&pstore_blk_lock); - __unregister_pstore_blk(major); - mutex_unlock(&pstore_blk_lock); -} -EXPORT_SYMBOL_GPL(unregister_pstore_blk); - /* get information of pstore/blk */ int pstore_blk_get_config(struct pstore_blk_config *info) { @@ -483,12 +421,11 @@ EXPORT_SYMBOL_GPL(pstore_blk_get_config); static int __init pstore_blk_init(void) { - struct pstore_blk_info info = { }; int ret = 0; mutex_lock(&pstore_blk_lock); if (!pstore_zone_info && best_effort && blkdev[0]) - ret = __register_pstore_blk(&info); + ret = __register_pstore_blk(); mutex_unlock(&pstore_blk_lock); return ret; diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h index 61e914522b01..99564f93d774 100644 --- a/include/linux/pstore_blk.h +++ b/include/linux/pstore_blk.h @@ -7,48 +7,6 @@ #include #include -/** - * typedef pstore_blk_panic_write_op - panic write operation to block device - * - * @buf: the data to write - * @start_sect: start sector to block device - * @sects: sectors count on buf - * - * Return: On success, zero should be returned. Others excluding -ENOMSG - * mean error. -ENOMSG means to try next zone. - * - * Panic write to block device must be aligned to SECTOR_SIZE. - */ -typedef int (*pstore_blk_panic_write_op)(const char *buf, sector_t start_sect, - sector_t sects); - -/** - * struct pstore_blk_info - pstore/blk registration details - * - * @major: Which major device number to support with pstore/blk - * @flags: The supported PSTORE_FLAGS_* from linux/pstore.h. - * @panic_write:The write operation only used for the panic case. - * This can be NULL, but is recommended to avoid losing - * crash data if the kernel's IO path or work queues are - * broken during a panic. - * @devt: The dev_t that pstore/blk has attached to. - * @nr_sects: Number of sectors on @devt. - * @start_sect: Starting sector on @devt. - */ -struct pstore_blk_info { - unsigned int major; - unsigned int flags; - pstore_blk_panic_write_op panic_write; - - /* Filled in by pstore/blk after registration. */ - dev_t devt; - sector_t nr_sects; - sector_t start_sect; -}; - -int register_pstore_blk(struct pstore_blk_info *info); -void unregister_pstore_blk(unsigned int major); - /** * struct pstore_device_info - back-end pstore/blk driver structure. * -- cgit From 60b498852bf219c0bf2b0864c69972840978ca43 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:21:18 +0100 Subject: fs: remove get_super_thawed and get_super_exclusive_thawed Just open code the wait in the only caller of both functions. 
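Condensed from the quotactl_block() hunk below, the open-coded wait is a plain retry loop built on the now-exported __get_super()/put_super(): take the superblock, and if it is still frozen, drop the matching flavor of s_umount, sleep until thaw, and look it up again.

    retry:
            sb = __get_super(bdev, excl);
            if (thawed && sb && sb->s_writers.frozen != SB_UNFROZEN) {
                    if (excl)
                            up_write(&sb->s_umount);
                    else
                            up_read(&sb->s_umount);
                    wait_event(sb->s_writers.wait_unfrozen,
                               sb->s_writers.frozen == SB_UNFROZEN);
                    put_super(sb);
                    goto retry;
            }

This is exactly what the old __get_super_thawed() did internally; it simply lives at its single call site now.
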
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/internal.h | 2 ++ fs/quota/quota.c | 31 ++++++++++++++++++++++++------- fs/super.c | 51 ++------------------------------------------------- include/linux/fs.h | 4 +--- 4 files changed, 29 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/fs/internal.h b/fs/internal.h index a7cd0f64faa4..47be21dfeebe 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -114,7 +114,9 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); */ extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); +struct super_block *__get_super(struct block_device *bdev, bool excl); extern struct super_block *user_get_super(dev_t); +void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); /* diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 9af95c7a0bbe..f3d32b0d9008 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -20,6 +20,7 @@ #include #include #include "compat.h" +#include "../internal.h" static int check_quotactl_permission(struct super_block *sb, int type, int cmd, qid_t id) @@ -868,6 +869,7 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) struct block_device *bdev; struct super_block *sb; struct filename *tmp = getname(special); + bool excl = false, thawed = false; if (IS_ERR(tmp)) return ERR_CAST(tmp); @@ -875,17 +877,32 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) putname(tmp); if (IS_ERR(bdev)) return ERR_CAST(bdev); - if (quotactl_cmd_onoff(cmd)) - sb = get_super_exclusive_thawed(bdev); - else if (quotactl_cmd_write(cmd)) - sb = get_super_thawed(bdev); - else - sb = get_super(bdev); + + if (quotactl_cmd_onoff(cmd)) { + excl = true; + thawed = true; + } else if (quotactl_cmd_write(cmd)) { + thawed = true; + } + +retry: + sb = __get_super(bdev, excl); + if (thawed && sb && sb->s_writers.frozen != SB_UNFROZEN) { + if (excl) + up_write(&sb->s_umount); + else + up_read(&sb->s_umount); + wait_event(sb->s_writers.wait_unfrozen, + sb->s_writers.frozen == SB_UNFROZEN); + put_super(sb); + goto retry; + } + bdput(bdev); if (!sb) return ERR_PTR(-ENODEV); - return sb; + #else return ERR_PTR(-ENODEV); #endif diff --git a/fs/super.c b/fs/super.c index 98bb0629ee10..343e5c1e538d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -307,7 +307,7 @@ static void __put_super(struct super_block *s) * Drops a temporary reference, frees superblock if there's no * references left. 
*/ -static void put_super(struct super_block *sb) +void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); @@ -740,7 +740,7 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -static struct super_block *__get_super(struct block_device *bdev, bool excl) +struct super_block *__get_super(struct block_device *bdev, bool excl) { struct super_block *sb; @@ -789,53 +789,6 @@ struct super_block *get_super(struct block_device *bdev) } EXPORT_SYMBOL(get_super); -static struct super_block *__get_super_thawed(struct block_device *bdev, - bool excl) -{ - while (1) { - struct super_block *s = __get_super(bdev, excl); - if (!s || s->s_writers.frozen == SB_UNFROZEN) - return s; - if (!excl) - up_read(&s->s_umount); - else - up_write(&s->s_umount); - wait_event(s->s_writers.wait_unfrozen, - s->s_writers.frozen == SB_UNFROZEN); - put_super(s); - } -} - -/** - * get_super_thawed - get thawed superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device. The superblock is returned once it is thawed - * (or immediately if it was not frozen). %NULL is returned if no match - * is found. - */ -struct super_block *get_super_thawed(struct block_device *bdev) -{ - return __get_super_thawed(bdev, false); -} -EXPORT_SYMBOL(get_super_thawed); - -/** - * get_super_exclusive_thawed - get thawed superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device. The superblock is returned once it is thawed - * (or immediately if it was not frozen) and s_umount semaphore is held - * in exclusive mode. %NULL is returned if no match is found. - */ -struct super_block *get_super_exclusive_thawed(struct block_device *bdev) -{ - return __get_super_thawed(bdev, true); -} -EXPORT_SYMBOL(get_super_exclusive_thawed); - /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for diff --git a/include/linux/fs.h b/include/linux/fs.h index 8667d0cdc71e..a61df0dd4f19 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1409,7 +1409,7 @@ enum { struct sb_writers { int frozen; /* Is sb frozen? */ - wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ + wait_queue_head_t wait_unfrozen; /* wait for thaw */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; @@ -3132,8 +3132,6 @@ extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern struct super_block *get_super(struct block_device *); -extern struct super_block *get_super_thawed(struct block_device *); -extern struct super_block *get_super_exclusive_thawed(struct block_device *bdev); extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); -- cgit From 040f04bd2e825f1d80b14a0e0ac3d830339eb779 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 11:54:06 +0100 Subject: fs: simplify freeze_bdev/thaw_bdev Store the frozen superblock in struct block_device to avoid the awkward interface that can return a sb only used a cookie, an ERR_PTR or NULL. 
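The caller-visible win is that freeze and thaw now pair purely on error codes, with the frozen superblock stashed in bdev->bd_fsfreeze_sb behind the scenes. A minimal sketch of the resulting pattern; demo_quiesce() is a hypothetical caller and bdev is assumed open and valid:

    #include <linux/blkdev.h>

    static int demo_quiesce(struct block_device *bdev)
    {
            int err = freeze_bdev(bdev);

            if (err)
                    return err;
            /* ... operate on the frozen device ... */
            return thaw_bdev(bdev);
    }

Compare the dm and f2fs hunks below, which shrink to exactly this shape.
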
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- drivers/md/dm-core.h | 5 ----- drivers/md/dm.c | 20 ++++++-------------- fs/block_dev.c | 37 +++++++++++++++---------------------- fs/buffer.c | 2 +- fs/ext4/ioctl.c | 2 +- fs/f2fs/file.c | 14 +++++--------- fs/xfs/xfs_fsops.c | 7 ++----- include/linux/blk_types.h | 1 + include/linux/blkdev.h | 4 ++-- 9 files changed, 33 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index d522093cb39d..aace147effca 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -96,11 +96,6 @@ struct mapped_device { */ struct workqueue_struct *wq; - /* - * freeze/thaw support require holding onto a super block - */ - struct super_block *frozen_sb; - /* forced geometry settings */ struct hd_geometry geometry; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 54739f1b579b..50541d336c71 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2392,27 +2392,19 @@ static int lock_fs(struct mapped_device *md) { int r; - WARN_ON(md->frozen_sb); + WARN_ON(test_bit(DMF_FROZEN, &md->flags)); - md->frozen_sb = freeze_bdev(md->bdev); - if (IS_ERR(md->frozen_sb)) { - r = PTR_ERR(md->frozen_sb); - md->frozen_sb = NULL; - return r; - } - - set_bit(DMF_FROZEN, &md->flags); - - return 0; + r = freeze_bdev(md->bdev); + if (!r) + set_bit(DMF_FROZEN, &md->flags); + return r; } static void unlock_fs(struct mapped_device *md) { if (!test_bit(DMF_FROZEN, &md->flags)) return; - - thaw_bdev(md->bdev, md->frozen_sb); - md->frozen_sb = NULL; + thaw_bdev(md->bdev); clear_bit(DMF_FROZEN, &md->flags); } diff --git a/fs/block_dev.c b/fs/block_dev.c index d8664f5c1ff6..33c29106c989 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -548,55 +548,47 @@ EXPORT_SYMBOL(fsync_bdev); * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze * actually. */ -struct super_block *freeze_bdev(struct block_device *bdev) +int freeze_bdev(struct block_device *bdev) { struct super_block *sb; int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) { - /* - * We don't even need to grab a reference - the first call - * to freeze_bdev grab an active reference and only the last - * thaw_bdev drops it. - */ - sb = get_super(bdev); - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } + if (++bdev->bd_fsfreeze_count > 1) + goto done; sb = get_active_super(bdev); if (!sb) - goto out; + goto sync; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb); else error = freeze_super(sb); + deactivate_super(sb); + if (error) { - deactivate_super(sb); bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); + goto done; } - deactivate_super(sb); - out: + bdev->bd_fsfreeze_sb = sb; + +sync: sync_blockdev(bdev); +done: mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; /* thaw_bdev releases s->s_umount */ + return error; } EXPORT_SYMBOL(freeze_bdev); /** * thaw_bdev -- unlock filesystem * @bdev: blockdevice to unlock - * @sb: associated superblock * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 
*/ -int thaw_bdev(struct block_device *bdev, struct super_block *sb) +int thaw_bdev(struct block_device *bdev) { + struct super_block *sb; int error = -EINVAL; mutex_lock(&bdev->bd_fsfreeze_mutex); @@ -607,6 +599,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) if (--bdev->bd_fsfreeze_count > 0) goto out; + sb = bdev->bd_fsfreeze_sb; if (!sb) goto out; diff --git a/fs/buffer.c b/fs/buffer.c index 23f645657488..a7595ada9400 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -523,7 +523,7 @@ repeat: void emergency_thaw_bdev(struct super_block *sb) { - while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f0381876a7e5..524e13432447 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -624,7 +624,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) case EXT4_GOING_FLAGS_DEFAULT: freeze_bdev(sb->s_bdev); set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - thaw_bdev(sb->s_bdev, sb); + thaw_bdev(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ee861c6d9ff0..a9fc482a0e60 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2230,16 +2230,12 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) switch (in) { case F2FS_GOING_DOWN_FULLSYNC: - sb = freeze_bdev(sb->s_bdev); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); + ret = freeze_bdev(sb->s_bdev); + if (ret) goto out; - } - if (sb) { - f2fs_stop_checkpoint(sbi, false); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); - thaw_bdev(sb->s_bdev, sb); - } + f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + thaw_bdev(sb->s_bdev); break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index ef1d5bb88b93..b7c5783a031c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -433,13 +433,10 @@ xfs_fs_goingdown( { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { - struct super_block *sb = freeze_bdev(mp->m_super->s_bdev); - - if (sb && !IS_ERR(sb)) { + if (!freeze_bdev(mp->m_super->s_bdev)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); - thaw_bdev(sb->s_bdev, sb); + thaw_bdev(mp->m_super->s_bdev); } - break; } case XFS_FSOP_GOING_FLAGS_LOGFLUSH: diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d9b69bbde5cc..ebfb4e7c1fd1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -46,6 +46,7 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; + struct super_block *bd_fsfreeze_sb; } __randomize_layout; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 05b346a68c2e..12810a19edeb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -2020,7 +2020,7 @@ static inline int sync_blockdev(struct block_device *bdev) #endif int fsync_bdev(struct block_device *bdev); -struct super_block *freeze_bdev(struct block_device *bdev); -int thaw_bdev(struct block_device *bdev, struct super_block *sb); +int freeze_bdev(struct block_device *bdev); +int thaw_bdev(struct block_device *bdev); #endif /* _LINUX_BLKDEV_H */ -- cgit From b601d148a16ea16dfbaf3600be35ee175847a09b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Nov 2020 16:21:52 +0100 Subject: block: remove a duplicate __disk_get_part prototype Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo 
Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 46553d6d6025..22f5b9fd96f8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -250,7 +250,6 @@ static inline dev_t part_devt(struct hd_struct *part) return part_to_dev(part)->devt; } -extern struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); static inline void disk_put_part(struct hd_struct *part) -- cgit From 8d65269fe8065fee889bca5b204d711b0695a8f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Nov 2020 08:18:55 +0100 Subject: block: add a bdev_kobj helper Add a little helper to find the kobject for a struct block_device. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Acked-by: Coly Li [bcache] Acked-by: David Sterba [btrfs] Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 7 ++----- drivers/md/md.c | 4 +--- fs/block_dev.c | 6 +++--- fs/btrfs/sysfs.c | 15 +++------------ include/linux/blk_types.h | 3 +++ 5 files changed, 12 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 46a00134a36a..a6a5e21e4fd1 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1447,8 +1447,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto err; err = "error creating kobject"; - if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, - "bcache")) + if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache")) goto err; if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; @@ -2342,9 +2341,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto err; } - if (kobject_add(&ca->kobj, - &part_to_dev(bdev->bd_part)->kobj, - "bcache")) { + if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) { err = "error calling kobject_add"; ret = -ENOMEM; goto out; diff --git a/drivers/md/md.c b/drivers/md/md.c index b2edf5e0f965..7ce6047c856e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2414,7 +2414,6 @@ EXPORT_SYMBOL(md_integrity_add_rdev); static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; - struct kobject *ko; int err; /* prevent duplicates */ @@ -2477,9 +2476,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; - ko = &part_to_dev(rdev->bdev->bd_part)->kobj; /* failure here is OK */ - err = sysfs_create_link(&rdev->kobj, ko, "block"); + err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); rdev->sysfs_unack_badblocks = sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 33c29106c989..c5755150c6be 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1242,7 +1242,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) holder->disk = disk; holder->refcnt = 1; - ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + ret = add_symlink(disk->slave_dir, 
bdev_kobj(bdev)); if (ret) goto out_free; @@ -1259,7 +1259,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) goto out_unlock; out_del: - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(disk->slave_dir, bdev_kobj(bdev)); out_free: kfree(holder); out_unlock: @@ -1287,7 +1287,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(disk->slave_dir, bdev_kobj(bdev)); del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); kobject_put(bdev->bd_part->holder_dir); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 279d9262b676..24b6c6dc6900 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1232,8 +1232,6 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, void btrfs_sysfs_remove_device(struct btrfs_device *device) { - struct hd_struct *disk; - struct kobject *disk_kobj; struct kobject *devices_kobj; /* @@ -1243,11 +1241,8 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device) devices_kobj = device->fs_info->fs_devices->devices_kobj; ASSERT(devices_kobj); - if (device->bdev) { - disk = device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(devices_kobj, disk_kobj->name); - } + if (device->bdev) + sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name); if (device->devid_kobj.state_initialized) { kobject_del(&device->devid_kobj); @@ -1353,11 +1348,7 @@ int btrfs_sysfs_add_device(struct btrfs_device *device) nofs_flag = memalloc_nofs_save(); if (device->bdev) { - struct hd_struct *disk; - struct kobject *disk_kobj; - - disk = device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + struct kobject *disk_kobj = bdev_kobj(device->bdev); ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); if (ret) { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ebfb4e7c1fd1..9698f459cc65 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -49,6 +49,9 @@ struct block_device { struct super_block *bd_fsfreeze_sb; } __randomize_layout; +#define bdev_kobj(_bdev) \ + (&part_to_dev((_bdev)->bd_part)->kobj) + /* * Block error status values. See block/blk-core:blk_errors for the details. * Alpha cannot write a byte atomically, so we need to use 32-bit value. -- cgit From c2637e80a09e0d6c698d2771d7230f59c2138122 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 14 Nov 2020 19:19:57 +0100 Subject: init: refactor name_to_dev_t Split each case into a self-contained helper, and move the block dependent code entirely under the pre-existing #ifdef CONFIG_BLOCK. This allows to remove the blk_lookup_devt stub in genhd.h. 
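For reference, these are the string forms the refactored name_to_dev_t() dispatches on after the split. The call sites are illustrative only (wrapping function and device values are hypothetical), and the PARTUUID=/PARTLABEL=//dev/ name forms depend on CONFIG_BLOCK; anything else falls through to the major:minor/hex parser:

    static void demo_lookups(void)
    {
            dev_t devt;

            devt = name_to_dev_t("/dev/nfs");       /* Root_NFS */
            devt = name_to_dev_t("PARTUUID=...");   /* devt_from_partuuid() */
            devt = name_to_dev_t("PARTLABEL=root"); /* devt_from_partlabel() */
            devt = name_to_dev_t("/dev/sda2");      /* devt_from_devname() */
            devt = name_to_dev_t("8:2");            /* devt_from_devnum() */
    }
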
Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 7 +- init/do_mounts.c | 183 +++++++++++++++++++++++++------------------------- 2 files changed, 91 insertions(+), 99 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 22f5b9fd96f8..ca5e356084c3 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -388,18 +388,13 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, } #endif /* CONFIG_SYSFS */ +dev_t blk_lookup_devt(const char *name, int partno); #ifdef CONFIG_BLOCK void printk_all_partitions(void); -dev_t blk_lookup_devt(const char *name, int partno); #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } -static inline dev_t blk_lookup_devt(const char *name, int partno) -{ - dev_t devt = MKDEV(0, 0); - return devt; -} #endif /* CONFIG_BLOCK */ #endif /* _LINUX_GENHD_H */ diff --git a/init/do_mounts.c b/init/do_mounts.c index b5f9604d0c98..aef2f24461c7 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -90,7 +90,6 @@ no_match: return 0; } - /** * devt_from_partuuid - looks up the dev_t of a partition by its UUID * @uuid_str: char array containing ascii UUID @@ -186,7 +185,83 @@ static int match_dev_by_label(struct device *dev, const void *data) return 0; } -#endif + +static dev_t devt_from_partlabel(const char *label) +{ + struct device *dev; + dev_t devt = 0; + + dev = class_find_device(&block_class, NULL, label, &match_dev_by_label); + if (dev) { + devt = dev->devt; + put_device(dev); + } + + return devt; +} + +static dev_t devt_from_devname(const char *name) +{ + dev_t devt = 0; + int part; + char s[32]; + char *p; + + if (strlen(name) > 31) + return 0; + strcpy(s, name); + for (p = s; *p; p++) { + if (*p == '/') + *p = '!'; + } + + devt = blk_lookup_devt(s, 0); + if (devt) + return devt; + + /* + * Try non-existent, but valid partition, which may only exist after + * opening the device, like partitioned md devices. + */ + while (p > s && isdigit(p[-1])) + p--; + if (p == s || !*p || *p == '0') + return 0; + + /* try disk name without */ + part = simple_strtoul(p, NULL, 10); + *p = '\0'; + devt = blk_lookup_devt(s, part); + if (devt) + return devt; + + /* try disk name without p */ + if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') + return 0; + p[-1] = '\0'; + return blk_lookup_devt(s, part); +} +#endif /* CONFIG_BLOCK */ + +static dev_t devt_from_devnum(const char *name) +{ + unsigned maj, min, offset; + dev_t devt = 0; + char *p, dummy; + + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { + devt = MKDEV(maj, min); + if (maj != MAJOR(devt) || min != MINOR(devt)) + return 0; + } else { + devt = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + return 0; + } + + return devt; +} /* * Convert a name into device number. We accept the following variants: @@ -218,101 +293,23 @@ static int match_dev_by_label(struct device *dev, const void *data) * name contains slashes, the device name has them replaced with * bangs. 
*/ - dev_t name_to_dev_t(const char *name) { - char s[32]; - char *p; - dev_t res = 0; - int part; - + if (strcmp(name, "/dev/nfs") == 0) + return Root_NFS; + if (strcmp(name, "/dev/cifs") == 0) + return Root_CIFS; + if (strcmp(name, "/dev/ram") == 0) + return Root_RAM0; #ifdef CONFIG_BLOCK - if (strncmp(name, "PARTUUID=", 9) == 0) { - name += 9; - res = devt_from_partuuid(name); - if (!res) - goto fail; - goto done; - } else if (strncmp(name, "PARTLABEL=", 10) == 0) { - struct device *dev; - - dev = class_find_device(&block_class, NULL, name + 10, - &match_dev_by_label); - if (!dev) - goto fail; - - res = dev->devt; - put_device(dev); - goto done; - } + if (strncmp(name, "PARTUUID=", 9) == 0) + return devt_from_partuuid(name + 9); + if (strncmp(name, "PARTLABEL=", 10) == 0) + return devt_from_partlabel(name + 10); + if (strncmp(name, "/dev/", 5) == 0) + return devt_from_devname(name + 5); #endif - - if (strncmp(name, "/dev/", 5) != 0) { - unsigned maj, min, offset; - char dummy; - - if ((sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2) || - (sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3)) { - res = MKDEV(maj, min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto fail; - } else { - res = new_decode_dev(simple_strtoul(name, &p, 16)); - if (*p) - goto fail; - } - goto done; - } - - name += 5; - res = Root_NFS; - if (strcmp(name, "nfs") == 0) - goto done; - res = Root_CIFS; - if (strcmp(name, "cifs") == 0) - goto done; - res = Root_RAM0; - if (strcmp(name, "ram") == 0) - goto done; - - if (strlen(name) > 31) - goto fail; - strcpy(s, name); - for (p = s; *p; p++) - if (*p == '/') - *p = '!'; - res = blk_lookup_devt(s, 0); - if (res) - goto done; - - /* - * try non-existent, but valid partition, which may only exist - * after revalidating the disk, like partitioned md devices - */ - while (p > s && isdigit(p[-1])) - p--; - if (p == s || !*p || *p == '0') - goto fail; - - /* try disk name without */ - part = simple_strtoul(p, NULL, 10); - *p = '\0'; - res = blk_lookup_devt(s, part); - if (res) - goto done; - - /* try disk name without p */ - if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') - goto fail; - p[-1] = '\0'; - res = blk_lookup_devt(s, part); - if (res) - goto done; - -fail: - return 0; -done: - return res; + return devt_from_devnum(name); } EXPORT_SYMBOL_GPL(name_to_dev_t); -- cgit From 4e7b5671c6a883d94b5428e1a9c141bbd56cb2a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 13:38:40 +0100 Subject: block: remove i_bdev Switch the block device lookup interfaces to directly work with a dev_t so that struct block_device references are only acquired by the blkdev_get variants (and the blk-cgroup special case). This means that we now don't need an extra reference in the inode and can generally simplify handling of struct block_device to keep the lookups contained in the core block layer code. 
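After the conversion, a path lookup yields only a dev_t, and every struct block_device reference is acquired through blkdev_get_by_dev(). A minimal sketch of the resulting caller pattern; demo_open_by_path() is hypothetical and mirrors the reworked blkdev_get_by_path() below, minus its read-only check:

    #include <linux/blkdev.h>
    #include <linux/err.h>

    static struct block_device *demo_open_by_path(const char *path,
                                                  fmode_t mode, void *holder)
    {
            dev_t dev;
            int err;

            err = lookup_bdev(path, &dev); /* no reference taken */
            if (err)
                    return ERR_PTR(err);
            return blkdev_get_by_dev(dev, mode, holder);
    }
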
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Acked-by: Coly Li [bcache] Signed-off-by: Jens Axboe --- block/ioctl.c | 3 +- drivers/block/loop.c | 8 +- drivers/md/bcache/super.c | 20 +-- drivers/md/dm-table.c | 9 +- drivers/mtd/mtdsuper.c | 17 +-- drivers/target/target_core_file.c | 6 +- drivers/usb/gadget/function/storage_common.c | 8 +- fs/block_dev.c | 196 +++++++-------------------- fs/btrfs/volumes.c | 13 +- fs/inode.c | 3 - fs/internal.h | 7 +- fs/io_uring.c | 10 +- fs/pipe.c | 5 +- fs/quota/quota.c | 19 ++- fs/statfs.c | 2 +- fs/super.c | 44 +++--- include/linux/blkdev.h | 2 +- include/linux/fs.h | 1 - 18 files changed, 121 insertions(+), 252 deletions(-) (limited to 'include/linux') diff --git a/block/ioctl.c b/block/ioctl.c index 0c09bb7a6ff3..a6d8171221c7 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -590,8 +590,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret; void __user *argp = compat_ptr(arg); - struct inode *inode = file->f_mapping->host; - struct block_device *bdev = inode->i_bdev; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; loff_t size; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index b42c728620c9..26c7aafba7c5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -675,10 +675,10 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) while (is_loop_device(f)) { struct loop_device *l; - if (f->f_mapping->host->i_bdev == bdev) + if (f->f_mapping->host->i_rdev == bdev->bd_dev) return -EBADF; - l = f->f_mapping->host->i_bdev->bd_disk->private_data; + l = I_BDEV(f->f_mapping->host)->bd_disk->private_data; if (l->lo_state != Lo_bound) { return -EINVAL; } @@ -885,9 +885,7 @@ static void loop_config_discard(struct loop_device *lo) * file-backed loop devices: discarded regions read back as zero. 
*/ if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) { - struct request_queue *backingq; - - backingq = bdev_get_queue(inode->i_bdev); + struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); max_discard_sectors = backingq->limits.max_write_zeroes_sectors; granularity = backingq->limits.discard_granularity ?: diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a6a5e21e4fd1..c55d3c58a7ef 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2380,38 +2380,38 @@ kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); -static bool bch_is_open_backing(struct block_device *bdev) +static bool bch_is_open_backing(dev_t dev) { struct cache_set *c, *tc; struct cached_dev *dc, *t; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) list_for_each_entry_safe(dc, t, &c->cached_devs, list) - if (dc->bdev == bdev) + if (dc->bdev->bd_dev == dev) return true; list_for_each_entry_safe(dc, t, &uncached_devices, list) - if (dc->bdev == bdev) + if (dc->bdev->bd_dev == dev) return true; return false; } -static bool bch_is_open_cache(struct block_device *bdev) +static bool bch_is_open_cache(dev_t dev) { struct cache_set *c, *tc; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { struct cache *ca = c->cache; - if (ca->bdev == bdev) + if (ca->bdev->bd_dev == dev) return true; } return false; } -static bool bch_is_open(struct block_device *bdev) +static bool bch_is_open(dev_t dev) { - return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); + return bch_is_open_cache(dev) || bch_is_open_backing(dev); } struct async_reg_args { @@ -2535,9 +2535,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, sb); if (IS_ERR(bdev)) { if (bdev == ERR_PTR(-EBUSY)) { - bdev = lookup_bdev(strim(path)); + dev_t dev; + mutex_lock(&bch_register_lock); - if (!IS_ERR(bdev) && bch_is_open(bdev)) + if (lookup_bdev(strim(path), &dev) == 0 && + bch_is_open(dev)) err = "device already registered"; else err = "device busy"; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ce543b761be7..dea677721710 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -348,16 +348,9 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, dev_t dm_get_dev_t(const char *path) { dev_t dev; - struct block_device *bdev; - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) + if (lookup_bdev(path, &dev)) dev = name_to_dev_t(path); - else { - dev = bdev->bd_dev; - bdput(bdev); - } - return dev; } EXPORT_SYMBOL_GPL(dm_get_dev_t); diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index c3e2098372f2..38b6aa849c63 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -120,8 +120,8 @@ int get_tree_mtd(struct fs_context *fc, struct fs_context *fc)) { #ifdef CONFIG_BLOCK - struct block_device *bdev; - int ret, major; + dev_t dev; + int ret; #endif int mtdnr; @@ -169,20 +169,15 @@ int get_tree_mtd(struct fs_context *fc, /* try the old way - the hack where we allowed users to mount * /dev/mtdblock$(n) but didn't actually _use_ the blockdev */ - bdev = lookup_bdev(fc->source); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); + ret = lookup_bdev(fc->source, &dev); + if (ret) { errorf(fc, "MTD: Couldn't look up '%s': %d", fc->source, ret); return ret; } pr_debug("MTDSB: lookup_bdev() returned 0\n"); - major = MAJOR(bdev->bd_dev); - mtdnr = MINOR(bdev->bd_dev); - bdput(bdev); - - if (major == MTD_BLOCK_MAJOR) 
- return mtd_get_sb_by_nr(fc, mtdnr, fill_super); + if (MAJOR(dev) == MTD_BLOCK_MAJOR) + return mtd_get_sb_by_nr(fc, MINOR(dev), fill_super); #endif /* CONFIG_BLOCK */ diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 7143d03f0e02..b0cb5b95e892 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -133,10 +133,10 @@ static int fd_configure_device(struct se_device *dev) */ inode = file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { - struct request_queue *q = bdev_get_queue(inode->i_bdev); + struct request_queue *q = bdev_get_queue(I_BDEV(inode)); unsigned long long dev_size; - fd_dev->fd_block_size = bdev_logical_block_size(inode->i_bdev); + fd_dev->fd_block_size = bdev_logical_block_size(I_BDEV(inode)); /* * Determine the number of bytes from i_size_read() minus * one (1) logical sector from underlying struct block_device @@ -559,7 +559,7 @@ fd_execute_unmap(struct se_cmd *cmd, sector_t lba, sector_t nolb) if (S_ISBLK(inode->i_mode)) { /* The backend is block device, use discard */ - struct block_device *bdev = inode->i_bdev; + struct block_device *bdev = I_BDEV(inode); struct se_device *dev = cmd->se_dev; ret = blkdev_issue_discard(bdev, diff --git a/drivers/usb/gadget/function/storage_common.c b/drivers/usb/gadget/function/storage_common.c index f7e6c42558eb..b859a158a414 100644 --- a/drivers/usb/gadget/function/storage_common.c +++ b/drivers/usb/gadget/function/storage_common.c @@ -204,7 +204,7 @@ int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (!(filp->f_mode & FMODE_WRITE)) ro = 1; - inode = file_inode(filp); + inode = filp->f_mapping->host; if ((!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { LINFO(curlun, "invalid file type: %s\n", filename); goto out; @@ -221,7 +221,7 @@ int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (!(filp->f_mode & FMODE_CAN_WRITE)) ro = 1; - size = i_size_read(inode->i_mapping->host); + size = i_size_read(inode); if (size < 0) { LINFO(curlun, "unable to find file size: %s\n", filename); rc = (int) size; @@ -231,8 +231,8 @@ int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (curlun->cdrom) { blksize = 2048; blkbits = 11; - } else if (inode->i_bdev) { - blksize = bdev_logical_block_size(inode->i_bdev); + } else if (S_ISBLK(inode->i_mode)) { + blksize = bdev_logical_block_size(I_BDEV(inode)); blkbits = blksize_bits(blksize); } else { blksize = 512; diff --git a/fs/block_dev.c b/fs/block_dev.c index 2b8c0586314f..6d6e4d50834c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -883,7 +883,6 @@ static struct block_device *bdget(dev_t dev) bdev->bd_dev = dev; inode->i_mode = S_IFBLK; inode->i_rdev = dev; - inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); unlock_new_inode(inode); @@ -928,67 +927,8 @@ void bdput(struct block_device *bdev) { iput(bdev->bd_inode); } - EXPORT_SYMBOL(bdput); -static struct block_device *bd_acquire(struct inode *inode) -{ - struct block_device *bdev; - - spin_lock(&bdev_lock); - bdev = inode->i_bdev; - if (bdev && !inode_unhashed(bdev->bd_inode)) { - bdgrab(bdev); - spin_unlock(&bdev_lock); - return bdev; - } - spin_unlock(&bdev_lock); - - /* - * i_bdev references block device inode that was already shut down - * (corresponding device got removed). Remove the reference and look - * up block device inode again just in case new device got - * reestablished under the same device number. 
- */ - if (bdev) - bd_forget(inode); - - bdev = bdget(inode->i_rdev); - if (bdev) { - spin_lock(&bdev_lock); - if (!inode->i_bdev) { - /* - * We take an additional reference to bd_inode, - * and it's released in clear_inode() of inode. - * So, we can access it via ->i_mapping always - * without igrab(). - */ - bdgrab(bdev); - inode->i_bdev = bdev; - inode->i_mapping = bdev->bd_inode->i_mapping; - } - spin_unlock(&bdev_lock); - } - return bdev; -} - -/* Call when you free inode */ - -void bd_forget(struct inode *inode) -{ - struct block_device *bdev = NULL; - - spin_lock(&bdev_lock); - if (!sb_is_blkdev_sb(inode->i_sb)) - bdev = inode->i_bdev; - inode->i_bdev = NULL; - inode->i_mapping = &inode->i_data; - spin_unlock(&bdev_lock); - - if (bdev) - bdput(bdev); -} - /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest @@ -1497,38 +1437,45 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, } /** - * blkdev_get - open a block device - * @bdev: block_device to open + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open * @mode: FMODE_* mask * @holder: exclusive holder identifier * - * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is - * open with exclusive access. Specifying %FMODE_EXCL with %NULL - * @holder is invalid. Exclusive opens may nest for the same @holder. + * Open the block device described by device number @dev. If @mode includes + * %FMODE_EXCL, the block device is opened with exclusive access. Specifying + * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for + * the same @holder. * - * On success, the reference count of @bdev is unchanged. On failure, - * @bdev is put. + * Use this interface ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a device + * number. Everything else should use blkdev_get_by_path(). * * CONTEXT: * Might sleep. * * RETURNS: - * 0 on success, -errno on failure. + * Reference to the block_device on success, ERR_PTR(-errno) on failure. */ -static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) { struct block_device *claiming; bool unblock_events = true; + struct block_device *bdev; struct gendisk *disk; int partno; int ret; ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, - imajor(bdev->bd_inode), iminor(bdev->bd_inode), + MAJOR(dev), MINOR(dev), ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) | ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) - goto bdput; + return ERR_PTR(ret); + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); /* * If we lost a race with 'disk' being deleted, try again. See md.c. @@ -1589,10 +1536,13 @@ put_disk: if (ret == -ERESTARTSYS) goto retry; bdput: - if (ret) + if (ret) { bdput(bdev); - return ret; + return ERR_PTR(ret); + } + return bdev; } +EXPORT_SYMBOL(blkdev_get_by_dev); /** * blkdev_get_by_path - open a block device by name @@ -1600,32 +1550,30 @@ bdput: * @mode: FMODE_* mask * @holder: exclusive holder identifier * - * Open the blockdevice described by the device file at @path. @mode - * and @holder are identical to blkdev_get(). - * - * On success, the returned block_device has reference count of one. + * Open the block device described by the device file at @path. If @mode + * includes %FMODE_EXCL, the block device is opened with exclusive access. 
+ * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may + * nest for the same @holder. * * CONTEXT: * Might sleep. * * RETURNS: - * Pointer to block_device on success, ERR_PTR(-errno) on failure. + * Reference to the block_device on success, ERR_PTR(-errno) on failure. */ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder) { struct block_device *bdev; - int err; - - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return bdev; + dev_t dev; + int error; - err = blkdev_get(bdev, mode, holder); - if (err) - return ERR_PTR(err); + error = lookup_bdev(path, &dev); + if (error) + return ERR_PTR(error); - if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { + bdev = blkdev_get_by_dev(dev, mode, holder); + if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { blkdev_put(bdev, mode); return ERR_PTR(-EACCES); } @@ -1634,45 +1582,6 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, } EXPORT_SYMBOL(blkdev_get_by_path); -/** - * blkdev_get_by_dev - open a block device by device number - * @dev: device number of block device to open - * @mode: FMODE_* mask - * @holder: exclusive holder identifier - * - * Open the blockdevice described by device number @dev. @mode and - * @holder are identical to blkdev_get(). - * - * Use it ONLY if you really do not have anything better - i.e. when - * you are behind a truly sucky interface and all you are given is a - * device number. _Never_ to be used for internal purposes. If you - * ever need it - reconsider your API. - * - * On success, the returned block_device has reference count of one. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Pointer to block_device on success, ERR_PTR(-errno) on failure. - */ -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) -{ - struct block_device *bdev; - int err; - - bdev = bdget(dev); - if (!bdev) - return ERR_PTR(-ENOMEM); - - err = blkdev_get(bdev, mode, holder); - if (err) - return ERR_PTR(err); - - return bdev; -} -EXPORT_SYMBOL(blkdev_get_by_dev); - static int blkdev_open(struct inode * inode, struct file * filp) { struct block_device *bdev; @@ -1694,14 +1603,12 @@ static int blkdev_open(struct inode * inode, struct file * filp) if ((filp->f_flags & O_ACCMODE) == 3) filp->f_mode |= FMODE_WRITE_IOCTL; - bdev = bd_acquire(inode); - if (bdev == NULL) - return -ENOMEM; - + bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - - return blkdev_get(bdev, filp->f_mode, filp); + return 0; } static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) @@ -2010,37 +1917,32 @@ const struct file_operations def_blk_fops = { * namespace if possible and return it. Return ERR_PTR(error) * otherwise. 
*/ -struct block_device *lookup_bdev(const char *pathname) +int lookup_bdev(const char *pathname, dev_t *dev) { - struct block_device *bdev; struct inode *inode; struct path path; int error; if (!pathname || !*pathname) - return ERR_PTR(-EINVAL); + return -EINVAL; error = kern_path(pathname, LOOKUP_FOLLOW, &path); if (error) - return ERR_PTR(error); + return error; inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) - goto fail; + goto out_path_put; error = -EACCES; if (!may_open_dev(&path)) - goto fail; - error = -ENOMEM; - bdev = bd_acquire(inode); - if (!bdev) - goto fail; -out: + goto out_path_put; + + *dev = inode->i_rdev; + error = 0; +out_path_put: path_put(&path); - return bdev; -fail: - bdev = ERR_PTR(error); - goto out; + return error; } EXPORT_SYMBOL(lookup_bdev); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a6406b3b8c2b..fbc4b58228f7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -929,16 +929,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, * make sure it's the same device if the device is mounted */ if (device->bdev) { - struct block_device *path_bdev; + int error; + dev_t path_dev; - path_bdev = lookup_bdev(path); - if (IS_ERR(path_bdev)) { + error = lookup_bdev(path, &path_dev); + if (error) { mutex_unlock(&fs_devices->device_list_mutex); - return ERR_CAST(path_bdev); + return ERR_PTR(error); } - if (device->bdev != path_bdev) { - bdput(path_bdev); + if (device->bdev->bd_dev != path_dev) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(device->fs_info, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", @@ -947,7 +947,6 @@ static noinline struct btrfs_device *device_list_add(const char *path, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, rcu_str_deref(device->name), diff --git a/fs/inode.c b/fs/inode.c index 9d78c37b00b8..cb008acf0efd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -155,7 +155,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_bytes = 0; inode->i_generation = 0; inode->i_pipe = NULL; - inode->i_bdev = NULL; inode->i_cdev = NULL; inode->i_link = NULL; inode->i_dir_seq = 0; @@ -580,8 +579,6 @@ static void evict(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } - if (S_ISBLK(inode->i_mode) && inode->i_bdev) - bd_forget(inode); if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); diff --git a/fs/internal.h b/fs/internal.h index 47be21dfeebe..53f890446e75 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -25,7 +25,6 @@ extern void __init bdev_cache_init(void); extern int __sync_blockdev(struct block_device *bdev, int wait); void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); -void bd_forget(struct inode *inode); #else static inline void bdev_cache_init(void) { @@ -43,9 +42,6 @@ static inline int emergency_thaw_bdev(struct super_block *sb) { return 0; } -static inline void bd_forget(struct inode *inode) -{ -} #endif /* CONFIG_BLOCK */ /* @@ -114,8 +110,7 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); */ extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); -struct super_block *__get_super(struct block_device *bdev, bool excl); -extern struct super_block *user_get_super(dev_t); +struct super_block 
*user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); diff --git a/fs/io_uring.c b/fs/io_uring.c index 4ead291b2976..8f13c0417f94 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2716,11 +2716,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) static bool io_bdev_nowait(struct block_device *bdev) { -#ifdef CONFIG_BLOCK return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); -#else - return true; -#endif } /* @@ -2733,14 +2729,16 @@ static bool io_file_supports_async(struct file *file, int rw) umode_t mode = file_inode(file)->i_mode; if (S_ISBLK(mode)) { - if (io_bdev_nowait(file->f_inode->i_bdev)) + if (IS_ENABLED(CONFIG_BLOCK) && + io_bdev_nowait(I_BDEV(file->f_mapping->host))) return true; return false; } if (S_ISCHR(mode) || S_ISSOCK(mode)) return true; if (S_ISREG(mode)) { - if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) && + if (IS_ENABLED(CONFIG_BLOCK) && + io_bdev_nowait(file->f_inode->i_sb->s_bdev) && file->f_op != &io_uring_fops) return true; return false; diff --git a/fs/pipe.c b/fs/pipe.c index 0ac197658a2d..c5989cfd564d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1342,9 +1342,8 @@ out_revert_acct: } /* - * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same - * location, so checking ->i_pipe is not enough to verify that this is a - * pipe. + * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is + * not enough to verify that this is a pipe. */ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { diff --git a/fs/quota/quota.c b/fs/quota/quota.c index f3d32b0d9008..6d16b2be5ac4 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -866,17 +866,18 @@ static bool quotactl_cmd_onoff(int cmd) static struct super_block *quotactl_block(const char __user *special, int cmd) { #ifdef CONFIG_BLOCK - struct block_device *bdev; struct super_block *sb; struct filename *tmp = getname(special); bool excl = false, thawed = false; + int error; + dev_t dev; if (IS_ERR(tmp)) return ERR_CAST(tmp); - bdev = lookup_bdev(tmp->name); + error = lookup_bdev(tmp->name, &dev); putname(tmp); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + if (error) + return ERR_PTR(error); if (quotactl_cmd_onoff(cmd)) { excl = true; @@ -886,8 +887,10 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) } retry: - sb = __get_super(bdev, excl); - if (thawed && sb && sb->s_writers.frozen != SB_UNFROZEN) { + sb = user_get_super(dev, excl); + if (!sb) + return ERR_PTR(-ENODEV); + if (thawed && sb->s_writers.frozen != SB_UNFROZEN) { if (excl) up_write(&sb->s_umount); else @@ -897,10 +900,6 @@ retry: put_super(sb); goto retry; } - - bdput(bdev); - if (!sb) - return ERR_PTR(-ENODEV); return sb; #else diff --git a/fs/statfs.c b/fs/statfs.c index 59f33752c131..68cb07788750 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -235,7 +235,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user static int vfs_ustat(dev_t dev, struct kstatfs *sbuf) { - struct super_block *s = user_get_super(dev); + struct super_block *s = user_get_super(dev, false); int err; if (!s) return -EINVAL; diff --git a/fs/super.c b/fs/super.c index 343e5c1e538d..2c6cdea2ab2d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -740,7 +740,14 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -struct super_block *__get_super(struct block_device *bdev, bool excl) +/** + * get_super - get the superblock of a device + * 
@bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. %NULL is returned if no match is found. + */ +struct super_block *get_super(struct block_device *bdev) { struct super_block *sb; @@ -755,17 +762,11 @@ rescan: if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); - if (!excl) - down_read(&sb->s_umount); - else - down_write(&sb->s_umount); + down_read(&sb->s_umount); /* still alive? */ if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; - if (!excl) - up_read(&sb->s_umount); - else - up_write(&sb->s_umount); + up_read(&sb->s_umount); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); @@ -776,19 +777,6 @@ rescan: return NULL; } -/** - * get_super - get the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. %NULL is returned if no match is found. - */ -struct super_block *get_super(struct block_device *bdev) -{ - return __get_super(bdev, false); -} -EXPORT_SYMBOL(get_super); - /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for @@ -820,7 +808,7 @@ restart: return NULL; } -struct super_block *user_get_super(dev_t dev) +struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; @@ -832,11 +820,17 @@ rescan: if (sb->s_dev == dev) { sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); + if (excl) + down_write(&sb->s_umount); + else + down_read(&sb->s_umount); /* still alive? */ if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; - up_read(&sb->s_umount); + if (excl) + up_write(&sb->s_umount); + else + up_read(&sb->s_umount); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12810a19edeb..bdd7339bcda4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1973,7 +1973,7 @@ int bdev_read_only(struct block_device *bdev); int set_blocksize(struct block_device *bdev, int size); const char *bdevname(struct block_device *bdev, char *buffer); -struct block_device *lookup_bdev(const char *); +int lookup_bdev(const char *pathname, dev_t *dev); void blkdev_show(struct seq_file *seqf, off_t offset); diff --git a/include/linux/fs.h b/include/linux/fs.h index a61df0dd4f19..b0b358309657 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -696,7 +696,6 @@ struct inode { struct list_head i_devices; union { struct pipe_inode_info *i_pipe; - struct block_device *i_bdev; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; -- cgit From 22ae8ce8b89241c94ac00c237752c0ffa37ba5ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 09:23:26 +0100 Subject: block: simplify bdev/disk lookup in blkdev_get To simplify block device lookup and a few other upcoming areas, make sure that we always have a struct block_device available for each disk and each partition, and only find existing block devices in bdget. The only downside of this is that each device and partition uses a little more memory. The upside will be that a lot of code can be simplified. With that, all we need to look up the block device is to look up the inode and do a few sanity checks on the gendisk, instead of the separate lookup for the gendisk.
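Roughly, the new path replaces the old gendisk lookup in the bdev_map xarray (and ext_devt_idr) with a single lookup in the block device inode cache, followed by liveness checks on the attached gendisk. A minimal sketch, condensed from the bdget() and blkdev_get_no_open() hunks below; locking against del_gendisk(), module refcounting and the request_module() retry are omitted, and lookup_bdev_sketch is only an illustrative name, not part of the patch:

	static struct block_device *lookup_bdev_sketch(dev_t dev)
	{
		/* one hash lookup in the bdev inode cache, keyed by dev_t */
		struct inode *inode = ilookup(blockdev_superblock, dev);
		struct block_device *bdev;

		if (!inode)
			return NULL;
		bdev = &BDEV_I(inode)->bdev;

		/* sanity-check the gendisk instead of doing a second lookup */
		if ((bdev->bd_disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) !=
		    GENHD_FL_UP) {
			bdput(bdev);	/* bdput() drops the inode reference */
			return NULL;
		}
		return bdev;
	}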
For blk-cgroup, which wants to access a gendisk without opening it, a new blkdev_{get,put}_no_open low-level interface is added to replace the previous get_gendisk use. Note that the change to look up the block device directly instead of the two-step lookup using struct gendisk causes a subtle change in behavior: accessing a non-existing partition on an existing block device can now cause a call to request_module. That call is harmless, and in practice no recent system will access these nodes, as they aren't created by udev and static /dev/ setups are unusual. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 42 ++++----- block/blk-iocost.c | 36 ++++---- block/blk.h | 2 +- block/genhd.c | 210 ++++++--------------------------------- block/partitions/core.c | 29 ++++--- fs/block_dev.c | 177 ++++++++++++++++++++++---------------- include/linux/blk-cgroup.h | 4 +- include/linux/blkdev.h | 6 ++ include/linux/genhd.h | 7 +- 9 files changed, 194 insertions(+), 319 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c68bdf58c9a6..ad02289a4f7f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -556,22 +556,22 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, } /** - * blkg_conf_prep - parse and prepare for per-blkg config update + * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update * @inputp: input string pointer * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update - * from @input and get and return the matching gendisk. *@inputp is + * from @input and get and return the matching bdev. *@inputp is * updated to point past the device node prefix. Returns an ERR_PTR() * value on error. * * Use this function iff blkg_conf_prep() can't be used for some reason.
*/ -struct gendisk *blkcg_conf_get_disk(char **inputp) +struct block_device *blkcg_conf_open_bdev(char **inputp) { char *input = *inputp; unsigned int major, minor; - struct gendisk *disk; - int key_len, part; + struct block_device *bdev; + int key_len; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return ERR_PTR(-EINVAL); @@ -581,16 +581,16 @@ struct gendisk *blkcg_conf_get_disk(char **inputp) return ERR_PTR(-EINVAL); input = skip_spaces(input); - disk = get_gendisk(MKDEV(major, minor), &part); - if (!disk) + bdev = blkdev_get_no_open(MKDEV(major, minor)); + if (!bdev) return ERR_PTR(-ENODEV); - if (part) { - put_disk_and_module(disk); + if (bdev_is_partition(bdev)) { + blkdev_put_no_open(bdev); return ERR_PTR(-ENODEV); } *inputp = input; - return disk; + return bdev; } /** @@ -607,18 +607,18 @@ struct gendisk *blkcg_conf_get_disk(char **inputp) */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&disk->queue->queue_lock) + __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) { - struct gendisk *disk; + struct block_device *bdev; struct request_queue *q; struct blkcg_gq *blkg; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - q = disk->queue; + q = bdev->bd_disk->queue; rcu_read_lock(); spin_lock_irq(&q->queue_lock); @@ -689,7 +689,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: - ctx->disk = disk; + ctx->bdev = bdev; ctx->blkg = blkg; ctx->body = input; return 0; @@ -700,7 +700,7 @@ fail_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); fail: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -723,11 +723,11 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); * with blkg_conf_prep(). 
*/ void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->disk->queue->queue_lock) __releases(rcu) + __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) { - spin_unlock_irq(&ctx->disk->queue->queue_lock); + spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); rcu_read_unlock(); - put_disk_and_module(ctx->disk); + blkdev_put_no_open(ctx->bdev); } EXPORT_SYMBOL_GPL(blkg_conf_finish); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index bbe86d1199dc..8e20fe4bddec 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3120,23 +3120,23 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3231,12 +3231,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } @@ -3287,23 +3287,23 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3356,13 +3356,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } diff --git a/block/blk.h b/block/blk.h index dfab98465db9..c4839abcfa27 100644 --- a/block/blk.h +++ b/block/blk.h @@ -352,7 +352,6 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); int blk_alloc_devt(struct hd_struct *part, dev_t *devt); void blk_free_devt(dev_t devt); -void blk_invalidate_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 @@ -384,6 +383,7 @@ static inline void hd_free_part(struct hd_struct *part) { free_percpu(part->dkstats); kfree(part->info); + bdput(part->bdev); percpu_ref_exit(&part->ref); } diff --git a/block/genhd.c b/block/genhd.c index f46e89226fdf..bf8fa82f135f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,17 +27,11 @@ static struct kobject *block_depr; -static 
DEFINE_XARRAY(bdev_map); -static DEFINE_MUTEX(bdev_map_lock); +DECLARE_RWSEM(bdev_lookup_sem); /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) - -/* For extended devt allocation. ext_devt_lock prevents look up - * results from going away underneath its user. - */ -static DEFINE_SPINLOCK(ext_devt_lock); -static DEFINE_IDR(ext_devt_idr); +static DEFINE_IDA(ext_devt_ida); static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); @@ -580,14 +574,7 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) return 0; } - /* allocate ext devt */ - idr_preload(GFP_KERNEL); - - spin_lock_bh(&ext_devt_lock); - idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); - spin_unlock_bh(&ext_devt_lock); - - idr_preload_end(); + idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); if (idx < 0) return idx == -ENOSPC ? -EBUSY : idx; @@ -606,26 +593,8 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) */ void blk_free_devt(dev_t devt) { - if (devt == MKDEV(0, 0)) - return; - - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } -} - -/* - * We invalidate devt by assigning NULL pointer for devt in idr. - */ -void blk_invalidate_devt(dev_t devt) -{ - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } + if (MAJOR(devt) == BLOCK_EXT_MAJOR) + ida_free(&ext_devt_ida, blk_mangle_minor(MINOR(devt))); } static char *bdevt_str(dev_t devt, char *buf) @@ -640,28 +609,6 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } -static void blk_register_region(struct gendisk *disk) -{ - int i; - - mutex_lock(&bdev_map_lock); - for (i = 0; i < disk->minors; i++) { - if (xa_insert(&bdev_map, disk_devt(disk) + i, disk, GFP_KERNEL)) - WARN_ON_ONCE(1); - } - mutex_unlock(&bdev_map_lock); -} - -static void blk_unregister_region(struct gendisk *disk) -{ - int i; - - mutex_lock(&bdev_map_lock); - for (i = 0; i < disk->minors; i++) - xa_erase(&bdev_map, disk_devt(disk) + i); - mutex_unlock(&bdev_map_lock); -} - static void disk_scan_partitions(struct gendisk *disk) { struct block_device *bdev; @@ -805,7 +752,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - blk_register_region(disk); + bdev_add(disk->part0.bdev, devt); } register_disk(parent, disk, groups); if (register_queue) @@ -847,8 +794,8 @@ static void invalidate_partition(struct gendisk *disk, int partno) __invalidate_device(bdev, true); /* - * Unhash the bdev inode for this device so that it gets evicted as soon - * as last inode reference is dropped. + * Unhash the bdev inode for this device so that it can't be looked + * up any more even if openers still hold references to it. */ remove_inode_hash(bdev->bd_inode); bdput(bdev); @@ -890,7 +837,8 @@ void del_gendisk(struct gendisk *disk) * Block lookups of the disk until all bdevs are unhashed and the * disk is marked as dead (GENHD_FL_UP cleared). 
*/ - down_write(&disk->lookup_sem); + down_write(&bdev_lookup_sem); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); @@ -903,7 +851,7 @@ void del_gendisk(struct gendisk *disk) invalidate_partition(disk, 0); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; - up_write(&disk->lookup_sem); + up_write(&bdev_lookup_sem); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -916,16 +864,6 @@ void del_gendisk(struct gendisk *disk) } blk_unregister_queue(disk); - - if (!(disk->flags & GENHD_FL_HIDDEN)) - blk_unregister_region(disk); - /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. - */ - blk_invalidate_devt(disk_devt(disk)); kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); @@ -964,7 +902,7 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } -static void request_gendisk_module(dev_t devt) +void blk_request_module(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; @@ -984,84 +922,6 @@ static void request_gendisk_module(dev_t devt) request_module("block-major-%d", MAJOR(devt)); } -static bool get_disk_and_module(struct gendisk *disk) -{ - struct module *owner; - - if (!disk->fops) - return false; - owner = disk->fops->owner; - if (owner && !try_module_get(owner)) - return false; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) { - module_put(owner); - return false; - } - return true; - -} - -/** - * get_gendisk - get partitioning information for a given device - * @devt: device to get partitioning information for - * @partno: returned partition index - * - * This function gets the structure containing partitioning - * information for the given device @devt. - * - * Context: can sleep - */ -struct gendisk *get_gendisk(dev_t devt, int *partno) -{ - struct gendisk *disk = NULL; - - might_sleep(); - - if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - mutex_lock(&bdev_map_lock); - disk = xa_load(&bdev_map, devt); - if (!disk) { - mutex_unlock(&bdev_map_lock); - request_gendisk_module(devt); - mutex_lock(&bdev_map_lock); - disk = xa_load(&bdev_map, devt); - } - if (disk && !get_disk_and_module(disk)) - disk = NULL; - if (disk) - *partno = devt - disk_devt(disk); - mutex_unlock(&bdev_map_lock); - } else { - struct hd_struct *part; - - spin_lock_bh(&ext_devt_lock); - part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - if (part && get_disk_and_module(part_to_disk(part))) { - *partno = part->partno; - disk = part_to_disk(part); - } - spin_unlock_bh(&ext_devt_lock); - } - - if (!disk) - return NULL; - - /* - * Synchronize with del_gendisk() to not return disk that is being - * destroyed. - */ - down_read(&disk->lookup_sem); - if (unlikely((disk->flags & GENHD_FL_HIDDEN) || - !(disk->flags & GENHD_FL_UP))) { - up_read(&disk->lookup_sem); - put_disk_and_module(disk); - disk = NULL; - } else { - up_read(&disk->lookup_sem); - } - return disk; -} - /** * bdget_disk - do bdget() by gendisk and partition number * @disk: gendisk of interest @@ -1559,11 +1419,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) * * This function releases all allocated resources of the gendisk. 
* - * The struct gendisk refcount is incremented with get_gendisk() or - * get_disk_and_module(), and its refcount is decremented with - * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this - * function is called. - * * Drivers which used __device_add_disk() have a gendisk with a request_queue * assigned. Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the @@ -1748,16 +1603,17 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; + disk->part0.bdev = bdev_alloc(disk, 0); + if (!disk->part0.bdev) + goto out_free_disk; + disk->part0.dkstats = alloc_percpu(struct disk_stats); if (!disk->part0.dkstats) - goto out_free_disk; + goto out_bdput; - init_rwsem(&disk->lookup_sem); disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) { - free_percpu(disk->part0.dkstats); - goto out_free_disk; - } + if (disk_expand_part_tbl(disk, 0)) + goto out_free_bdstats; ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); @@ -1773,7 +1629,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) */ hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) - goto out_free_part0; + goto out_free_bdstats; disk->minors = minors; rand_initialize_disk(disk); @@ -1782,8 +1638,10 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_free_part0: - hd_free_part(&disk->part0); +out_free_bdstats: + free_percpu(disk->part0.dkstats); +out_bdput: + bdput(disk->part0.bdev); out_free_disk: kfree(disk); return NULL; @@ -1807,26 +1665,6 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); -/** - * put_disk_and_module - decrements the module and gendisk refcount - * @disk: the struct gendisk to decrement the refcount for - * - * This is a counterpart of get_disk_and_module() and thus also of - * get_gendisk(). - * - * Context: Any context, but the last reference must not be dropped from - * atomic context. - */ -void put_disk_and_module(struct gendisk *disk) -{ - if (disk) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); - } -} - static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; diff --git a/block/partitions/core.c b/block/partitions/core.c index a02e22411594..696bd9ff63c6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -340,12 +340,11 @@ void delete_partition(struct hd_struct *part) device_del(part_to_dev(part)); /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. + * Remove the block device from the inode hash, so that it cannot be + * looked up any more even when openers still hold references. 
*/ - blk_invalidate_devt(part_devt(part)); + remove_inode_hash(part->bdev->bd_inode); + percpu_ref_kill(&part->ref); } @@ -368,6 +367,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; + struct block_device *bdev; struct disk_part_tbl *ptbl; const char *dname; int err; @@ -402,11 +402,15 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p) return ERR_PTR(-EBUSY); + err = -ENOMEM; p->dkstats = alloc_percpu(struct disk_stats); - if (!p->dkstats) { - err = -ENOMEM; + if (!p->dkstats) goto out_free; - } + + bdev = bdev_alloc(disk, partno); + if (!bdev) + goto out_free_stats; + p->bdev = bdev; hd_sects_seq_init(p); pdev = part_to_dev(p); @@ -420,10 +424,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, struct partition_meta_info *pinfo; pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) { - err = -ENOMEM; - goto out_free_stats; - } + if (!pinfo) + goto out_bdput; memcpy(pinfo, info, sizeof(*info)); p->info = pinfo; } @@ -470,6 +472,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, } /* everything is up and running, commence */ + bdev_add(bdev, devt); rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk suppresses it */ @@ -479,6 +482,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_free_info: kfree(p->info); +out_bdput: + bdput(bdev); out_free_stats: free_percpu(p->dkstats); out_free: diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d6e4d50834c..b350ed3af83b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -863,31 +863,46 @@ void __init bdev_cache_init(void) blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } -static struct block_device *bdget(dev_t dev) +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) { struct block_device *bdev; struct inode *inode; - inode = iget_locked(blockdev_superblock, dev); + inode = new_inode(blockdev_superblock); if (!inode) return NULL; + inode->i_mode = S_IFBLK; + inode->i_rdev = 0; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + + bdev = I_BDEV(inode); + spin_lock_init(&bdev->bd_size_lock); + bdev->bd_disk = disk; + bdev->bd_partno = partno; + bdev->bd_contains = NULL; + bdev->bd_super = NULL; + bdev->bd_inode = inode; + bdev->bd_part_count = 0; + return bdev; +} - bdev = &BDEV_I(inode)->bdev; +void bdev_add(struct block_device *bdev, dev_t dev) +{ + bdev->bd_dev = dev; + bdev->bd_inode->i_rdev = dev; + bdev->bd_inode->i_ino = dev; + insert_inode_hash(bdev->bd_inode); +} - if (inode->i_state & I_NEW) { - spin_lock_init(&bdev->bd_size_lock); - bdev->bd_contains = NULL; - bdev->bd_super = NULL; - bdev->bd_inode = inode; - bdev->bd_part_count = 0; - bdev->bd_dev = dev; - inode->i_mode = S_IFBLK; - inode->i_rdev = dev; - inode->i_data.a_ops = &def_blk_aops; - mapping_set_gfp_mask(&inode->i_data, GFP_USER); - unlock_new_inode(inode); - } - return bdev; +static struct block_device *bdget(dev_t dev) +{ + struct inode *inode; + + inode = ilookup(blockdev_superblock, dev); + if (!inode) + return NULL; + return &BDEV_I(inode)->bdev; } /** @@ -1004,27 +1019,6 @@ retry: } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ -static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) -{ - struct gendisk *disk = get_gendisk(bdev->bd_dev, partno); - - if (!disk) - return NULL; - /* - * Now 
that we hold gendisk reference we make sure bdev we looked up is - * not stale. If it is, it means device got removed and created before - * we looked up gendisk and we fail open in such case. Associating - * unhashed bdev with newly created gendisk could lead to two bdevs - * (and thus two independent caches) being associated with one device - * which is bad. - */ - if (inode_unhashed(bdev->bd_inode)) { - put_disk_and_module(disk); - return NULL; - } - return disk; -} - static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); @@ -1347,19 +1341,17 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); * mutex_lock(part->bd_mutex) * mutex_lock_nested(whole->bd_mutex, 1) */ -static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, - int partno, fmode_t mode) +static int __blkdev_get(struct block_device *bdev, fmode_t mode) { + struct gendisk *disk = bdev->bd_disk; int ret; if (!bdev->bd_openers) { - bdev->bd_disk = disk; bdev->bd_contains = bdev; - bdev->bd_partno = partno; - if (!partno) { + if (!bdev->bd_partno) { ret = -ENXIO; - bdev->bd_part = disk_get_part(disk, partno); + bdev->bd_part = disk_get_part(disk, 0); if (!bdev->bd_part) goto out_clear; @@ -1388,7 +1380,7 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, struct block_device *whole = bdget_disk(disk, 0); mutex_lock_nested(&whole->bd_mutex, 1); - ret = __blkdev_get(whole, disk, 0, mode); + ret = __blkdev_get(whole, mode); if (ret) { mutex_unlock(&whole->bd_mutex); bdput(whole); @@ -1398,7 +1390,7 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, mutex_unlock(&whole->bd_mutex); bdev->bd_contains = whole; - bdev->bd_part = disk_get_part(disk, partno); + bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { __blkdev_put(whole, mode, 1); @@ -1430,12 +1422,53 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, out_clear: disk_put_part(bdev->bd_part); - bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_contains = NULL; return ret; } +struct block_device *blkdev_get_no_open(dev_t dev) +{ + struct block_device *bdev; + struct gendisk *disk; + + down_read(&bdev_lookup_sem); + bdev = bdget(dev); + if (!bdev) { + up_read(&bdev_lookup_sem); + blk_request_module(dev); + down_read(&bdev_lookup_sem); + + bdev = bdget(dev); + if (!bdev) + goto unlock; + } + + disk = bdev->bd_disk; + if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) + goto bdput; + if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + goto put_disk; + if (!try_module_get(bdev->bd_disk->fops->owner)) + goto put_disk; + up_read(&bdev_lookup_sem); + return bdev; +put_disk: + put_disk(disk); +bdput: + bdput(bdev); +unlock: + up_read(&bdev_lookup_sem); + return NULL; +} + +void blkdev_put_no_open(struct block_device *bdev) +{ + module_put(bdev->bd_disk->fops->owner); + put_disk(bdev->bd_disk); + bdput(bdev); +} + /** * blkdev_get_by_dev - open a block device by device number * @dev: device number of block device to open @@ -1463,7 +1496,6 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) bool unblock_events = true; struct block_device *bdev; struct gendisk *disk; - int partno; int ret; ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, @@ -1473,18 +1505,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (ret) return ERR_PTR(ret); - bdev = bdget(dev); - if (!bdev) - return 
ERR_PTR(-ENOMEM); - /* * If we lost a race with 'disk' being deleted, try again. See md.c. */ retry: - ret = -ENXIO; - disk = bdev_get_gendisk(bdev, &partno); - if (!disk) - goto bdput; + bdev = blkdev_get_no_open(dev); + if (!bdev) + return ERR_PTR(-ENXIO); + disk = bdev->bd_disk; if (mode & FMODE_EXCL) { WARN_ON_ONCE(!holder); @@ -1492,7 +1520,7 @@ retry: ret = -ENOMEM; claiming = bdget_disk(disk, 0); if (!claiming) - goto put_disk; + goto put_blkdev; ret = bd_prepare_to_claim(bdev, claiming, holder); if (ret) goto put_claiming; @@ -1501,12 +1529,10 @@ retry: disk_block_events(disk); mutex_lock(&bdev->bd_mutex); - ret =__blkdev_get(bdev, disk, partno, mode); - if (!(mode & FMODE_EXCL)) { - ; /* nothing to do here */ - } else if (ret) { - bd_abort_claiming(bdev, claiming, holder); - } else { + ret =__blkdev_get(bdev, mode); + if (ret) + goto abort_claiming; + if (mode & FMODE_EXCL) { bd_finish_claiming(bdev, claiming, holder); /* @@ -1526,21 +1552,23 @@ retry: if (unblock_events) disk_unblock_events(disk); + if (mode & FMODE_EXCL) + bdput(claiming); + return bdev; +abort_claiming: + if (mode & FMODE_EXCL) + bd_abort_claiming(bdev, claiming, holder); + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); put_claiming: if (mode & FMODE_EXCL) bdput(claiming); -put_disk: - if (ret) - put_disk_and_module(disk); +put_blkdev: + blkdev_put_no_open(bdev); if (ret == -ERESTARTSYS) goto retry; -bdput: - if (ret) { - bdput(bdev); - return ERR_PTR(ret); - } - return bdev; + return ERR_PTR(ret); } EXPORT_SYMBOL(blkdev_get_by_dev); @@ -1641,7 +1669,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; - bdev->bd_disk = NULL; if (bdev_is_partition(bdev)) victim = bdev->bd_contains; bdev->bd_contains = NULL; @@ -1699,12 +1726,10 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * from userland - e.g. eject(1). 
*/ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - mutex_unlock(&bdev->bd_mutex); __blkdev_put(bdev, mode, 0); - bdput(bdev); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); } EXPORT_SYMBOL(blkdev_put); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index c8fc9792ac77..b9f3c246c3c9 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -197,12 +197,12 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { - struct gendisk *disk; + struct block_device *bdev; struct blkcg_gq *blkg; char *body; }; -struct gendisk *blkcg_conf_get_disk(char **inputp); +struct block_device *blkcg_conf_open_bdev(char **inputp); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bdd7339bcda4..5d48b92f5e43 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1994,6 +1994,12 @@ void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, void *holder); void blkdev_put(struct block_device *bdev, fmode_t mode); +/* just for blk-cgroup, don't use elsewhere */ +struct block_device *blkdev_get_no_open(dev_t dev); +void blkdev_put_no_open(struct block_device *bdev); + +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); +void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); struct block_device *bdget_part(struct hd_struct *part); struct block_device *bdgrab(struct block_device *bdev); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ca5e356084c3..42a51653c730 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -65,6 +65,7 @@ struct hd_struct { struct disk_stats __percpu *dkstats; struct percpu_ref ref; + struct block_device *bdev; struct device __dev; struct kobject *holder_dir; int policy, partno; @@ -193,7 +194,6 @@ struct gendisk { int flags; unsigned long state; #define GD_NEED_PART_SCAN 0 - struct rw_semaphore lookup_sem; struct kobject *slave_dir; struct timer_rand_state *random; @@ -300,7 +300,6 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk) } extern void del_gendisk(struct gendisk *gp); -extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); extern void set_disk_ro(struct gendisk *disk, int flag); @@ -338,7 +337,6 @@ int blk_drop_partitions(struct block_device *bdev); extern struct gendisk *__alloc_disk_node(int minors, int node_id); extern void put_disk(struct gendisk *disk); -extern void put_disk_and_module(struct gendisk *disk); #define alloc_disk_node(minors, node_id) \ ({ \ @@ -388,7 +386,10 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, } #endif /* CONFIG_SYSFS */ +extern struct rw_semaphore bdev_lookup_sem; + dev_t blk_lookup_devt(const char *name, int partno); +void blk_request_module(dev_t devt); #ifdef CONFIG_BLOCK void printk_all_partitions(void); #else /* CONFIG_BLOCK */ -- cgit From a954ea812018a84d350b316c39a2be3edc4b7ca8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 13:29:55 +0100 Subject: block: remove ->bd_contains Now that each hd_struct has a reference to the corresponding block_device, there is no need for the bd_contains pointer. 
Add a bdev_whole() helper to look up the whole device block_device structure instead. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- drivers/scsi/scsicam.c | 2 +- fs/block_dev.c | 22 ++++++++-------------- include/linux/blk_types.h | 4 +++- 4 files changed, 13 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 26c7aafba7c5..c0df88b3300c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1088,7 +1088,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, * here to avoid changing device under exclusive owner. */ if (!(mode & FMODE_EXCL)) { - claimed_bdev = bdev->bd_contains; + claimed_bdev = bdev_whole(bdev); error = bd_prepare_to_claim(bdev, claimed_bdev, loop_configure); if (error) goto out_putf; diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 682cf08ab041..f1553a453616 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -32,7 +32,7 @@ */ unsigned char *scsi_bios_ptable(struct block_device *dev) { - struct address_space *mapping = dev->bd_contains->bd_inode->i_mapping; + struct address_space *mapping = bdev_whole(dev)->bd_inode->i_mapping; unsigned char *res = NULL; struct page *page; diff --git a/fs/block_dev.c b/fs/block_dev.c index b350ed3af83b..94baee369d26 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -119,7 +119,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, * under live filesystem. */ if (!(mode & FMODE_EXCL)) { - claimed_bdev = bdev->bd_contains; + claimed_bdev = bdev_whole(bdev); err = bd_prepare_to_claim(bdev, claimed_bdev, truncate_bdev_range); if (err) @@ -880,7 +880,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) spin_lock_init(&bdev->bd_size_lock); bdev->bd_disk = disk; bdev->bd_partno = partno; - bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_part_count = 0; @@ -1347,9 +1346,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) int ret; if (!bdev->bd_openers) { - bdev->bd_contains = bdev; - - if (!bdev->bd_partno) { + if (!bdev_is_partition(bdev)) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, 0); if (!bdev->bd_part) @@ -1389,7 +1386,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - bdev->bd_contains = whole; bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1405,7 +1401,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (bdev->bd_bdi == &noop_backing_dev_info) bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { - if (bdev->bd_contains == bdev) { + if (!bdev_is_partition(bdev)) { ret = 0; if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); @@ -1423,7 +1419,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) out_clear: disk_put_part(bdev->bd_part); bdev->bd_part = NULL; - bdev->bd_contains = NULL; return ret; } @@ -1670,8 +1665,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; if (bdev_is_partition(bdev)) - victim = bdev->bd_contains; - bdev->bd_contains = NULL; + victim = bdev_whole(bdev); } else { if (!bdev_is_partition(bdev) && disk->fops->release)
disk->fops->release(disk, mode); @@ -1690,6 +1684,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) mutex_lock(&bdev->bd_mutex); if (mode & FMODE_EXCL) { + struct block_device *whole = bdev_whole(bdev); bool bdev_free; /* @@ -1700,13 +1695,12 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) spin_lock(&bdev_lock); WARN_ON_ONCE(--bdev->bd_holders < 0); - WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); + WARN_ON_ONCE(--whole->bd_holders < 0); - /* bd_contains might point to self, check in a separate step */ if ((bdev_free = !bdev->bd_holders)) bdev->bd_holder = NULL; - if (!bdev->bd_contains->bd_holders) - bdev->bd_contains->bd_holder = NULL; + if (!whole->bd_holders) + whole->bd_holder = NULL; spin_unlock(&bdev_lock); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 9698f459cc65..2e0a9bd9688d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -32,7 +32,6 @@ struct block_device { #ifdef CONFIG_SYSFS struct list_head bd_holder_disks; #endif - struct block_device * bd_contains; u8 bd_partno; struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ @@ -49,6 +48,9 @@ struct block_device { struct super_block *bd_fsfreeze_sb; } __randomize_layout; +#define bdev_whole(_bdev) \ + ((_bdev)->bd_disk->part0.bdev) + #define bdev_kobj(_bdev) \ (&part_to_dev((_bdev)->bd_part)->kobj) -- cgit From 37c3fc9abb25cd767ad5b048358336ac89488c16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Nov 2020 21:20:08 +0100 Subject: block: simplify the block device claiming interface Stop passing the whole device as a separate argument given that it can be trivially deduced, and clean up the !holder debug check. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/loop.c | 12 +++++------- fs/block_dev.c | 51 ++++++++++++++++++-------------------------------- include/linux/blkdev.h | 6 ++---- 3 files changed, 25 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c0df88b3300c..d643c67be6ac 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1069,7 +1069,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, struct file *file; struct inode *inode; struct address_space *mapping; - struct block_device *claimed_bdev = NULL; int error; loff_t size; bool partscan; @@ -1088,8 +1087,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, * here to avoid changing device under exclusive owner.
*/ if (!(mode & FMODE_EXCL)) { - claimed_bdev = bdev_whole(bdev); - error = bd_prepare_to_claim(bdev, claimed_bdev, loop_configure); + error = bd_prepare_to_claim(bdev, loop_configure); if (error) goto out_putf; } @@ -1176,15 +1174,15 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, mutex_unlock(&loop_ctl_mutex); if (partscan) loop_reread_partitions(lo, bdev); - if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, loop_configure); + if (!(mode & FMODE_EXCL)) + bd_abort_claiming(bdev, loop_configure); return 0; out_unlock: mutex_unlock(&loop_ctl_mutex); out_bdev: - if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, loop_configure); + if (!(mode & FMODE_EXCL)) + bd_abort_claiming(bdev, loop_configure); out_putf: fput(file); out: diff --git a/fs/block_dev.c b/fs/block_dev.c index 94baee369d26..0569f5ebeb6f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -110,24 +110,20 @@ EXPORT_SYMBOL(invalidate_bdev); int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend) { - struct block_device *claimed_bdev = NULL; - int err; - /* * If we don't hold exclusive handle for the device, upgrade to it * while we discard the buffer cache to avoid discarding buffers * under live filesystem. */ if (!(mode & FMODE_EXCL)) { - claimed_bdev = bdev_whole(bdev); - err = bd_prepare_to_claim(bdev, claimed_bdev, - truncate_bdev_range); + int err = bd_prepare_to_claim(bdev, truncate_bdev_range); if (err) return err; } + truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); - if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range); + if (!(mode & FMODE_EXCL)) + bd_abort_claiming(bdev, truncate_bdev_range); return 0; } EXPORT_SYMBOL(truncate_bdev_range); @@ -978,7 +974,6 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, /** * bd_prepare_to_claim - claim a block device * @bdev: block device of interest - * @whole: the whole device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * * Claim @bdev. This function fails if @bdev is already claimed by another @@ -988,9 +983,12 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. */ -int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, - void *holder) +int bd_prepare_to_claim(struct block_device *bdev, void *holder) { + struct block_device *whole = bdev_whole(bdev); + + if (WARN_ON_ONCE(!holder)) + return -EINVAL; retry: spin_lock(&bdev_lock); /* if someone else claimed, fail */ @@ -1030,15 +1028,15 @@ static void bd_clear_claiming(struct block_device *whole, void *holder) /** * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest - * @whole: whole block device * @holder: holder that has claimed @bdev * * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ -static void bd_finish_claiming(struct block_device *bdev, - struct block_device *whole, void *holder) +static void bd_finish_claiming(struct block_device *bdev, void *holder) { + struct block_device *whole = bdev_whole(bdev); + spin_lock(&bdev_lock); BUG_ON(!bd_may_claim(bdev, whole, holder)); /* @@ -1063,11 +1061,10 @@ static void bd_finish_claiming(struct block_device *bdev, * also used when exclusive open is not actually desired and we just needed * to block other exclusive openers for a while. 
*/ -void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, - void *holder) +void bd_abort_claiming(struct block_device *bdev, void *holder) { spin_lock(&bdev_lock); - bd_clear_claiming(whole, holder); + bd_clear_claiming(bdev_whole(bdev), holder); spin_unlock(&bdev_lock); } EXPORT_SYMBOL(bd_abort_claiming); @@ -1487,7 +1484,6 @@ void blkdev_put_no_open(struct block_device *bdev) */ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) { - struct block_device *claiming; bool unblock_events = true; struct block_device *bdev; struct gendisk *disk; @@ -1510,15 +1506,9 @@ retry: disk = bdev->bd_disk; if (mode & FMODE_EXCL) { - WARN_ON_ONCE(!holder); - - ret = -ENOMEM; - claiming = bdget_disk(disk, 0); - if (!claiming) - goto put_blkdev; - ret = bd_prepare_to_claim(bdev, claiming, holder); + ret = bd_prepare_to_claim(bdev, holder); if (ret) - goto put_claiming; + goto put_blkdev; } disk_block_events(disk); @@ -1528,7 +1518,7 @@ retry: if (ret) goto abort_claiming; if (mode & FMODE_EXCL) { - bd_finish_claiming(bdev, claiming, holder); + bd_finish_claiming(bdev, holder); /* * Block event polling for write claims if requested. Any write @@ -1547,18 +1537,13 @@ retry: if (unblock_events) disk_unblock_events(disk); - if (mode & FMODE_EXCL) - bdput(claiming); return bdev; abort_claiming: if (mode & FMODE_EXCL) - bd_abort_claiming(bdev, claiming, holder); + bd_abort_claiming(bdev, holder); mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); -put_claiming: - if (mode & FMODE_EXCL) - bdput(claiming); put_blkdev: blkdev_put_no_open(bdev); if (ret == -ERESTARTSYS) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5d48b92f5e43..43a25d855e04 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1988,10 +1988,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset); struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder); struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); -int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, - void *holder); -void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, - void *holder); +int bd_prepare_to_claim(struct block_device *bdev, void *holder); +void bd_abort_claiming(struct block_device *bdev, void *holder); void blkdev_put(struct block_device *bdev, fmode_t mode); /* just for blk-cgroup, don't use elsewhere */ -- cgit From c64dc3bd87097e7f08b9437819440f8bfddef995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 16:38:15 +0100 Subject: block: simplify part_to_disk Now that struct hd_struct has a block_device pointer use that to find the disk. 
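In effect the helper goes from walking the driver-model hierarchy to a direct pointer chase. Schematically (an illustrative fragment only; disk and part stand for local variables):

	/* before: derive the disk from the device hierarchy */
	disk = part->partno ? dev_to_disk(part_to_dev(part)->parent)
			    : dev_to_disk(part_to_dev(part));

	/* after: every hd_struct carries its block_device, which knows its disk */
	disk = part->bdev->bd_disk;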
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 42a51653c730..6ba91ee54cb2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -218,13 +218,9 @@ struct gendisk { static inline struct gendisk *part_to_disk(struct hd_struct *part) { - if (likely(part)) { - if (part->partno) - return dev_to_disk(part_to_dev(part)->parent); - else - return dev_to_disk(part_to_dev(part)); - } - return NULL; + if (unlikely(!part)) + return NULL; + return part->bdev->bd_disk; } static inline int disk_max_parts(struct gendisk *disk) -- cgit From a782483cc1f875355690625d8253a232f2581418 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 18:43:37 +0100 Subject: block: remove the nr_sects field in struct hd_struct Now that the hd_struct always has a block device attached to it, there is no need for having two size fields that just get out of sync. Additionally, the field in hd_struct did not use proper serialization, possibly allowing for torn writes. By only using the block_device field, this problem also gets fixed. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Coly Li [bcache] Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- block/bio.c | 4 +-- block/blk-core.c | 2 +- block/blk.h | 53 --------------------------- block/genhd.c | 59 +++++++++++++++++------------- block/partitions/core.c | 17 +++++---- drivers/block/loop.c | 1 - drivers/block/nbd.c | 2 +- drivers/block/xen-blkback/common.h | 4 +-- drivers/md/bcache/super.c | 2 +- drivers/s390/block/dasd_ioctl.c | 4 +-- drivers/target/target_core_pscsi.c | 5 ++- fs/block_dev.c | 73 ++------------------------------------ fs/f2fs/super.c | 2 +- fs/pstore/blk.c | 2 +- include/linux/genhd.h | 29 ++++----------- kernel/trace/blktrace.c | 2 +- 16 files changed, 67 insertions(+), 194 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index fa01bef35bb1..669bb47a3198 100644 --- a/block/bio.c +++ b/block/bio.c @@ -613,8 +613,8 @@ void guard_bio_eod(struct bio *bio) rcu_read_lock(); part = __disk_get_part(bio->bi_disk, bio->bi_partno); if (part) - maxsector = part_nr_sects_read(part); - else + maxsector = bdev_nr_sectors(part->bdev); + else maxsector = get_capacity(bio->bi_disk); rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 2db8bda43b6e..988f45094a38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -755,7 +755,7 @@ static inline int blk_partition_remap(struct bio *bio) goto out; if (bio_sectors(bio)) { - if (bio_check_eod(bio, part_nr_sects_read(p))) + if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) goto out; bio->bi_iter.bi_sector += p->start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), diff --git a/block/blk.h b/block/blk.h index c4839abcfa27..09cee7024fb4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -387,59 +387,6 @@ static inline void hd_free_part(struct hd_struct *part) percpu_ref_exit(&part->ref); } -/* - * Any access of part->nr_sects which is not protected by partition - * bd_mutex or gendisk bdev bd_mutex, should be done using this - * accessor function. - * - * Code written along the lines of i_size_read() and i_size_write().
- * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption - * on. - */ -static inline sector_t part_nr_sects_read(struct hd_struct *part) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - sector_t nr_sects; - unsigned seq; - do { - seq = read_seqcount_begin(&part->nr_sects_seq); - nr_sects = part->nr_sects; - } while (read_seqcount_retry(&part->nr_sects_seq, seq)); - return nr_sects; -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - sector_t nr_sects; - - preempt_disable(); - nr_sects = part->nr_sects; - preempt_enable(); - return nr_sects; -#else - return part->nr_sects; -#endif -} - -/* - * Should be called with mutex lock held (typically bd_mutex) of partition - * to provide mutual exlusion among writers otherwise seqcount might be - * left in wrong state leaving the readers spinning infinitely. - */ -static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - preempt_disable(); - write_seqcount_begin(&part->nr_sects_seq); - part->nr_sects = size; - write_seqcount_end(&part->nr_sects_seq); - preempt_enable(); -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - preempt_disable(); - part->nr_sects = size; - preempt_enable(); -#else - part->nr_sects = size; -#endif -} - int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); diff --git a/block/genhd.c b/block/genhd.c index bf8fa82f135f..c65f485b9db5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -40,6 +40,16 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +void set_capacity(struct gendisk *disk, sector_t sectors) +{ + struct block_device *bdev = disk->part0.bdev; + + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} +EXPORT_SYMBOL(set_capacity); + /* * Set disk capacity and notify if the size is not currently zero and will not * be set to zero. Returns true if a uevent was sent, otherwise false. @@ -47,18 +57,30 @@ static void disk_release_events(struct gendisk *disk); bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); + char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); - revalidate_disk_size(disk, true); - if (capacity != size && capacity != 0 && size != 0) { - char *envp[] = { "RESIZE=1", NULL }; + /* + * Only print a message and send a uevent if the gendisk is user visible + * and alive. This avoids spamming the log and udev when setting the + * initial capacity during probing. + */ + if (size == capacity || + (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + return false; - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); - return true; - } + pr_info("%s: detected capacity change from %lld to %lld\n", + disk->disk_name, size, capacity); - return false; + /* + * Historically we did not send a uevent for changes to/from an empty + * device. 
+ */ + if (!capacity || !size) + return false; + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + return true; } EXPORT_SYMBOL_GPL(set_capacity_and_notify); @@ -247,7 +269,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!part_nr_sects_read(part) && + if (!bdev_nr_sectors(part->bdev) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) @@ -284,7 +306,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { return part->start_sect <= sector && - sector < part->start_sect + part_nr_sects_read(part); + sector < part->start_sect + bdev_nr_sectors(part->bdev); } /** @@ -986,8 +1008,8 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part_nr_sects_read(part) >> 1 - , disk_name(disk, part->partno, name_buf), + bdev_nr_sectors(part->bdev) >> 1, + disk_name(disk, part->partno, name_buf), part->info ? part->info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) @@ -1079,7 +1101,7 @@ static int show_partition(struct seq_file *seqf, void *v) while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part_nr_sects_read(part) >> 1, + bdev_nr_sectors(part->bdev) >> 1, disk_name(sgp, part->partno, buf)); disk_part_iter_exit(&piter); @@ -1161,8 +1183,7 @@ ssize_t part_size_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", - (unsigned long long)part_nr_sects_read(p)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(p->bdev)); } ssize_t part_stat_show(struct device *dev, @@ -1618,16 +1639,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); - /* - * set_capacity() and get_capacity() currently don't use - * seqcounter to read/update the part0->nr_sects. Still init - * the counter as we can read the sectors in IO submission - * patch using seqence counters. - * - * TODO: Ideally set_capacity() and get_capacity() should be - * converted to make use of bd_mutex and sequence counters. 
- */ - hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) goto out_free_bdstats; diff --git a/block/partitions/core.c b/block/partitions/core.c index 696bd9ff63c6..bcfa8215bd5e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -85,6 +85,13 @@ static int (*check_part[])(struct parsed_partitions *) = { NULL }; +static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) +{ + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} + static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; @@ -295,7 +302,7 @@ static void hd_struct_free_work(struct work_struct *work) put_device(disk_to_dev(disk)); part->start_sect = 0; - part->nr_sects = 0; + bdev_set_nr_sectors(part->bdev, 0); part_stat_set_all(part, 0); put_device(part_to_dev(part)); } @@ -412,11 +419,10 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_free_stats; p->bdev = bdev; - hd_sects_seq_init(p); pdev = part_to_dev(p); p->start_sect = start; - p->nr_sects = len; + bdev_set_nr_sectors(bdev, len); p->partno = partno; p->policy = get_disk_ro(disk); @@ -509,7 +515,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { if (part->partno == skip_partno || - start >= part->start_sect + part->nr_sects || + start >= part->start_sect + bdev_nr_sectors(part->bdev) || start + length <= part->start_sect) continue; overlap = true; @@ -600,8 +606,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - part_nr_sects_write(part, length); - bd_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(bdevp, length); ret = 0; out_unlock: diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d643c67be6ac..d2ce1ddc192d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1241,7 +1241,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) set_capacity(lo->lo_disk, 0); loop_sysfs_exit(lo); if (bdev) { - bd_set_nr_sectors(bdev, 0); /* let user-space know about this change */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 45b0423ef2c5..014683968ce1 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1132,7 +1132,7 @@ static void nbd_bdev_reset(struct block_device *bdev) { if (bdev->bd_openers > 1) return; - bd_set_nr_sectors(bdev, 0); + set_capacity(bdev->bd_disk, 0); } static void nbd_parse_flags(struct nbd_device *nbd) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index c6ea5d38c509..0762db247b41 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -358,9 +358,7 @@ struct pending_req { }; -#define vbd_sz(_v) ((_v)->bdev->bd_part ? 
\ - (_v)->bdev->bd_part->nr_sects : \ - get_capacity((_v)->bdev->bd_disk)) +#define vbd_sz(_v) bdev_nr_sectors((_v)->bdev) #define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt)) #define xen_blkif_put(_b) \ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c55d3c58a7ef..04fa40868fbe 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1408,7 +1408,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) q->limits.raid_partial_stripes_expensive; ret = bcache_device_init(&dc->disk, block_size, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset, + bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, dc->bdev, &bcache_cached_ops); if (ret) return ret; diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 3359559517bf..304eba1acf16 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -54,8 +54,6 @@ dasd_ioctl_enable(struct block_device *bdev) return -ENODEV; dasd_enable_device(base); - /* Formatting the dasd device can change the capacity. */ - bd_set_nr_sectors(bdev, get_capacity(base->block->gdp)); dasd_put_device(base); return 0; } @@ -88,7 +86,7 @@ dasd_ioctl_disable(struct block_device *bdev) * Set i_size to zero, since read, write, etc. check against this * value. */ - bd_set_nr_sectors(bdev, 0); + set_capacity(bdev->bd_disk, 0); dasd_put_device(base); return 0; } diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 4e37fa9b409d..7994f27e4527 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1029,9 +1029,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev) { struct pscsi_dev_virt *pdv = PSCSI_DEV(dev); - if (pdv->pdv_bd && pdv->pdv_bd->bd_part) - return pdv->pdv_bd->bd_part->nr_sects; - + if (pdv->pdv_bd) + return bdev_nr_sectors(pdv->pdv_bd); return 0; } diff --git a/fs/block_dev.c b/fs/block_dev.c index a5b6955a841f..31ee5a857f71 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1208,70 +1208,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif -/** - * check_disk_size_change - checks for disk size change and adjusts bdev size. - * @disk: struct gendisk to check - * @bdev: struct bdev to adjust. - * @verbose: if %true log a message about a size change if there is any - * - * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. When shrinking the bdev size, its all caches - * are freed. - */ -static void check_disk_size_change(struct gendisk *disk, - struct block_device *bdev, bool verbose) -{ - loff_t disk_size, bdev_size; - - spin_lock(&bdev->bd_size_lock); - disk_size = (loff_t)get_capacity(disk) << 9; - bdev_size = i_size_read(bdev->bd_inode); - if (disk_size != bdev_size) { - if (verbose) { - printk(KERN_INFO - "%s: detected capacity change from %lld to %lld\n", - disk->disk_name, bdev_size, disk_size); - } - i_size_write(bdev->bd_inode, disk_size); - } - spin_unlock(&bdev->bd_size_lock); -} - -/** - * revalidate_disk_size - checks for disk size change and adjusts bdev size. - * @disk: struct gendisk to check - * @verbose: if %true log a message about a size change if there is any - * - * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. When shrinking the bdev size, its all caches - * are freed. 
- */ -void revalidate_disk_size(struct gendisk *disk, bool verbose) -{ - struct block_device *bdev; - - /* - * Hidden disks don't have associated bdev so there's no point in - * revalidating them. - */ - if (disk->flags & GENHD_FL_HIDDEN) - return; - - bdev = bdget_disk(disk, 0); - if (bdev) { - check_disk_size_change(disk, bdev, verbose); - bdput(bdev); - } -} - -void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) -{ - spin_lock(&bdev->bd_size_lock); - i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); - spin_unlock(&bdev->bd_size_lock); -} -EXPORT_SYMBOL(bd_set_nr_sectors); - static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); int bdev_disk_changed(struct block_device *bdev, bool invalidate) @@ -1305,8 +1241,6 @@ rescan: disk->fops->revalidate_disk(disk); } - check_disk_size_change(disk, bdev, !invalidate); - if (get_capacity(disk)) { ret = blk_add_partitions(disk, bdev); if (ret == -EAGAIN) @@ -1349,10 +1283,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (disk->fops->open) ret = disk->fops->open(bdev, mode); - if (!ret) { - bd_set_nr_sectors(bdev, get_capacity(disk)); + if (!ret) set_init_blocksize(bdev); - } /* * If the device is invalidated, rescan partition @@ -1381,13 +1313,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || - !bdev->bd_part || !bdev->bd_part->nr_sects) { + !bdev->bd_part || !bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); ret = -ENXIO; goto out_clear; } - bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); set_init_blocksize(bdev); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 00eff2f51807..d4e7fab352ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3151,7 +3151,7 @@ static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { struct block_device *bdev = FDEV(devi).bdev; - sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; int ret; diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index fcd5563dde06..777a26f7bbe2 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -245,7 +245,7 @@ static struct block_device *psblk_get_bdev(void *holder, return bdev; } - nr_sects = part_nr_sects_read(bdev->bd_part); + nr_sects = bdev_nr_sectors(bdev); if (!nr_sects) { pr_err("not enough space for '%s'\n", blkdev); blkdev_put(bdev, mode); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6ba91ee54cb2..30d4785b7df8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -52,15 +52,6 @@ struct partition_meta_info { struct hd_struct { sector_t start_sect; - /* - * nr_sects is protected by sequence counter. One might extend a - * partition while IO is happening to it and update of nr_sects - * can be non-atomic on 32bit machines with 64bit sector_t. - */ - sector_t nr_sects; -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - seqcount_t nr_sects_seq; -#endif unsigned long stamp; struct disk_stats __percpu *dkstats; struct percpu_ref ref; @@ -254,13 +245,6 @@ static inline void disk_put_part(struct hd_struct *part) put_device(part_to_dev(part)); } -static inline void hd_sects_seq_init(struct hd_struct *p) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - seqcount_init(&p->nr_sects_seq); -#endif -} - /* * Smarter partition iterator without context limits. 
*/ @@ -318,13 +302,15 @@ static inline sector_t get_start_sect(struct block_device *bdev) { return bdev->bd_part->start_sect; } -static inline sector_t get_capacity(struct gendisk *disk) + +static inline sector_t bdev_nr_sectors(struct block_device *bdev) { - return disk->part0.nr_sects; + return i_size_read(bdev->bd_inode) >> 9; } -static inline void set_capacity(struct gendisk *disk, sector_t size) + +static inline sector_t get_capacity(struct gendisk *disk) { - disk->part0.nr_sects = size; + return bdev_nr_sectors(disk->part0.bdev); } int bdev_disk_changed(struct block_device *bdev, bool invalidate); @@ -358,10 +344,9 @@ int __register_blkdev(unsigned int major, const char *name, __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); -void revalidate_disk_size(struct gendisk *disk, bool verbose); bool bdev_check_media_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); -void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); +void set_capacity(struct gendisk *disk, sector_t size); /* for drivers/char/raw.c: */ int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f1022945e346..7076d588a50d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -465,7 +465,7 @@ static void blk_trace_setup_lba(struct blk_trace *bt, if (part) { bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + part->nr_sects; + bt->end_lba = part->start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; -- cgit From 15e3d2c5cd53298272e59ad9072d3468f9dd3781 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:34:00 +0100 Subject: block: move disk stat accounting to struct block_device Move the dkstats and stamp fields to struct block_device in preparation of killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 4 ++-- block/blk.h | 1 - block/genhd.c | 14 ++++---------- block/partitions/core.c | 9 +-------- fs/block_dev.c | 10 ++++++++++ include/linux/blk_types.h | 2 ++ include/linux/genhd.h | 2 -- include/linux/part_stat.h | 38 +++++++++++++++++++------------------- 9 files changed, 39 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ad02289a4f7f..79aa96240cec 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -830,7 +830,7 @@ static void blkcg_fill_root_iostats(void) for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + cpu_dkstats = per_cpu_ptr(part->bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += diff --git a/block/blk-core.c b/block/blk-core.c index 988f45094a38..d2c9cb24e087 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1264,9 +1264,9 @@ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) { unsigned long stamp; again: - stamp = READ_ONCE(part->stamp); + stamp = READ_ONCE(part->bdev->bd_stamp); if (unlikely(stamp != now)) { - if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) + if (likely(cmpxchg(&part->bdev->bd_stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ?
now - stamp : 1); } if (part->partno) { diff --git a/block/blk.h b/block/blk.h index 09cee7024fb4..3f801f6e86f8 100644 --- a/block/blk.h +++ b/block/blk.h @@ -381,7 +381,6 @@ static inline void hd_struct_put(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { - free_percpu(part->dkstats); kfree(part->info); bdput(part->bdev); percpu_ref_exit(&part->ref); diff --git a/block/genhd.c b/block/genhd.c index c65f485b9db5..2cbda8139556 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -112,7 +112,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bdev->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -891,7 +891,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; + disk->part0.bdev->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -1628,19 +1628,15 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk->part0.bdev) goto out_free_disk; - disk->part0.dkstats = alloc_percpu(struct disk_stats); - if (!disk->part0.dkstats) - goto out_bdput; - disk->node_id = node_id; if (disk_expand_part_tbl(disk, 0)) - goto out_free_bdstats; + goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); if (hd_ref_init(&disk->part0)) - goto out_free_bdstats; + goto out_bdput; disk->minors = minors; rand_initialize_disk(disk); @@ -1649,8 +1645,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_free_bdstats: - free_percpu(disk->part0.dkstats); out_bdput: bdput(disk->part0.bdev); out_free_disk: diff --git a/block/partitions/core.c b/block/partitions/core.c index bcfa8215bd5e..8924e1ea8b2a 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -409,14 +409,9 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p) return ERR_PTR(-EBUSY); - err = -ENOMEM; - p->dkstats = alloc_percpu(struct disk_stats); - if (!p->dkstats) - goto out_free; - bdev = bdev_alloc(disk, partno); if (!bdev) - goto out_free_stats; + goto out_free; p->bdev = bdev; pdev = part_to_dev(p); @@ -490,8 +485,6 @@ out_free_info: kfree(p->info); out_bdput: bdput(bdev); -out_free_stats: - free_percpu(p->dkstats); out_free: kfree(p); return ERR_PTR(err); diff --git a/fs/block_dev.c b/fs/block_dev.c index 31ee5a857f71..0832c7830f3a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -781,6 +782,10 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) static void bdev_free_inode(struct inode *inode) { + struct block_device *bdev = I_BDEV(inode); + + free_percpu(bdev->bd_stats); + kmem_cache_free(bdev_cachep, BDEV_I(inode)); } @@ -875,6 +880,11 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif + bdev->bd_stats = alloc_percpu(struct disk_stats); + if (!bdev->bd_stats) { + iput(inode); + return NULL; + } return bdev; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2e0a9bd9688d..520011b95276 100644 --- 
a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,6 +20,8 @@ typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; struct block_device { + struct disk_stats __percpu *bd_stats; + unsigned long bd_stamp; dev_t bd_dev; int bd_openers; struct inode * bd_inode; /* will die */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 30d4785b7df8..804ac45fbfbc 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -52,8 +52,6 @@ struct partition_meta_info { struct hd_struct { sector_t start_sect; - unsigned long stamp; - struct disk_stats __percpu *dkstats; struct percpu_ref ref; struct block_device *bdev; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 24125778ef3e..87ad60106e1d 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -25,17 +25,17 @@ struct disk_stats { #define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ - (per_cpu_ptr((part)->dkstats, (cpu))->field) + (per_cpu_ptr((part)->bdev->bd_stats, (cpu))->field) #define part_stat_get(part, field) \ part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ - typeof((part)->dkstats->field) res = 0; \ + typeof((part)->bdev->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ - res += per_cpu_ptr((part)->dkstats, _cpu)->field; \ + res += per_cpu_ptr((part)->bdev->bd_stats, _cpu)->field; \ res; \ }) @@ -44,7 +44,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) int i; for_each_possible_cpu(i) - memset(per_cpu_ptr(part->dkstats, i), value, + memset(per_cpu_ptr(part->bdev->bd_stats, i), value, sizeof(struct disk_stats)); } @@ -54,7 +54,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ - __this_cpu_add((part)->dkstats->field, addnd) + __this_cpu_add((part)->bdev->bd_stats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ @@ -63,20 +63,20 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) field, addnd); \ } while (0) -#define part_stat_dec(gendiskp, field) \ - part_stat_add(gendiskp, field, -1) -#define part_stat_inc(gendiskp, field) \ - part_stat_add(gendiskp, field, 1) -#define part_stat_sub(gendiskp, field, subnd) \ - part_stat_add(gendiskp, field, -subnd) +#define part_stat_dec(part, field) \ + part_stat_add(part, field, -1) +#define part_stat_inc(part, field) \ + part_stat_add(part, field, 1) +#define part_stat_sub(part, field, subnd) \ + part_stat_add(part, field, -subnd) -#define part_stat_local_dec(gendiskp, field) \ - local_dec(&(part_stat_get(gendiskp, field))) -#define part_stat_local_inc(gendiskp, field) \ - local_inc(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read(gendiskp, field) \ - local_read(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read_cpu(gendiskp, field, cpu) \ - local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) +#define part_stat_local_dec(part, field) \ + local_dec(&(part_stat_get(part, field))) +#define part_stat_local_inc(part, field) \ + local_inc(&(part_stat_get(part, field))) +#define part_stat_local_read(part, field) \ + local_read(&(part_stat_get(part, field))) +#define part_stat_local_read_cpu(part, field, cpu) \ + local_read(&(part_stat_get_cpu(part, field, cpu))) #endif /* _LINUX_PART_STAT_H */ -- cgit From 29ff57c61094e7bbd921ab10b5a99dce9a0132e0 Mon Sep 17 00:00:00 
2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:34:24 +0100 Subject: block: move the start_sect field to struct block_device Move the start_sect field to struct block_device in preparation of killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- block/blk-lib.c | 2 +- block/genhd.c | 4 ++-- block/partitions/core.c | 17 +++++++++-------- include/linux/blk_types.h | 1 + include/linux/blkdev.h | 4 ++-- include/linux/genhd.h | 3 +-- kernel/trace/blktrace.c | 11 +++-------- 8 files changed, 22 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index d2c9cb24e087..9a3793d5ce38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -757,9 +757,10 @@ static inline int blk_partition_remap(struct bio *bio) if (bio_sectors(bio)) { if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) goto out; - bio->bi_iter.bi_sector += p->start_sect; + bio->bi_iter.bi_sector += p->bdev->bd_start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); + bio->bi_iter.bi_sector - + p->bdev->bd_start_sect); } bio->bi_partno = 0; ret = 0; diff --git a/block/blk-lib.c b/block/blk-lib.c index e90614fd8d6a..752f9c722062 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -65,7 +65,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, /* In case the discard request is in a partition */ if (bdev_is_partition(bdev)) - part_offset = bdev->bd_part->start_sect; + part_offset = bdev->bd_start_sect; while (nr_sects) { sector_t granularity_aligned_lba, req_sects; diff --git a/block/genhd.c b/block/genhd.c index 2cbda8139556..5efb2df1f079 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -305,8 +305,8 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { - return part->start_sect <= sector && - sector < part->start_sect + bdev_nr_sectors(part->bdev); + return part->bdev->bd_start_sect <= sector && + sector < part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev); } /** diff --git a/block/partitions/core.c b/block/partitions/core.c index 8924e1ea8b2a..460a745812c6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -192,7 +192,7 @@ static ssize_t part_start_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); + return sprintf(buf, "%llu\n", p->bdev->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, @@ -209,7 +209,7 @@ static ssize_t part_alignment_offset_show(struct device *dev, return sprintf(buf, "%u\n", queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->start_sect)); + p->bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, @@ -219,7 +219,7 @@ static ssize_t part_discard_alignment_show(struct device *dev, return sprintf(buf, "%u\n", queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->start_sect)); + p->bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -301,7 +301,7 @@ static void hd_struct_free_work(struct work_struct *work) */ put_device(disk_to_dev(disk)); - part->start_sect = 0; + part->bdev->bd_start_sect = 0; bdev_set_nr_sectors(part->bdev, 0); part_stat_set_all(part, 0); put_device(part_to_dev(part)); @@ -416,7 +416,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int 
partno, pdev = part_to_dev(p); - p->start_sect = start; + bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); p->partno = partno; p->policy = get_disk_ro(disk); @@ -508,8 +508,9 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { if (part->partno == skip_partno || - start >= part->start_sect + bdev_nr_sectors(part->bdev) || - start + length <= part->start_sect) + start >= part->bdev->bd_start_sect + + bdev_nr_sectors(part->bdev) || + start + length <= part->bdev->bd_start_sect) continue; overlap = true; break; @@ -592,7 +593,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, mutex_lock_nested(&bdev->bd_mutex, 1); ret = -EINVAL; - if (start != part->start_sect) + if (start != part->bdev->bd_start_sect) goto out_unlock; ret = -EBUSY; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 520011b95276..a690008f60cd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,6 +20,7 @@ typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; struct block_device { + sector_t bd_start_sect; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; dev_t bd_dev; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 43a25d855e04..619adea57098 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1488,7 +1488,7 @@ static inline int bdev_alignment_offset(struct block_device *bdev) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, - bdev->bd_part->start_sect); + bdev->bd_start_sect); return q->limits.alignment_offset; } @@ -1529,7 +1529,7 @@ static inline int bdev_discard_alignment(struct block_device *bdev) if (bdev_is_partition(bdev)) return queue_limit_discard_alignment(&q->limits, - bdev->bd_part->start_sect); + bdev->bd_start_sect); return q->limits.discard_alignment; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 804ac45fbfbc..50d27f5d38e2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -51,7 +51,6 @@ struct partition_meta_info { }; struct hd_struct { - sector_t start_sect; struct percpu_ref ref; struct block_device *bdev; @@ -298,7 +297,7 @@ extern void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) { - return bdev->bd_part->start_sect; + return bdev->bd_start_sect; } static inline sector_t bdev_nr_sectors(struct block_device *bdev) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7076d588a50d..8a723a91ec5a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -458,14 +458,9 @@ static struct rchan_callbacks blk_relay_callbacks = { static void blk_trace_setup_lba(struct blk_trace *bt, struct block_device *bdev) { - struct hd_struct *part = NULL; - - if (bdev) - part = bdev->bd_part; - - if (part) { - bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + bdev_nr_sectors(bdev); + if (bdev) { + bt->start_lba = bdev->bd_start_sect; + bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; -- cgit From 231926dbf0f084211e4ec4f4c006f0bf1f47809a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 12:01:45 +0100 Subject: block: move the partition_meta_info to struct block_device Move the partition_meta_info to struct block_device in preparation for killing struct hd_struct. 
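The add_partition() hunk below shows the shape of the change well: an open-coded node-local allocation plus memcpy() collapses into a single kmemdup() onto the new field. A sketch of the before/after pattern (error labels abbreviated; note that plain kmemdup() allocates on the local node, so the node-aware placement of the old kzalloc_node() is traded for simplicity, which hardly matters for a small, rarely-read structure):

/* before: a separate allocation hanging off the hd_struct */
pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id);
if (!pinfo)
	goto out_bdput;
memcpy(pinfo, info, sizeof(*info));
p->info = pinfo;

/* after: duplicate straight into the block_device */
err = -ENOMEM;
bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL);
if (!bdev->bd_meta_info)
	goto out_bdput;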
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk.h | 1 - block/genhd.c | 3 ++- block/partitions/core.c | 18 +++++++----------- fs/block_dev.c | 1 + include/linux/blk_types.h | 2 ++ include/linux/genhd.h | 1 - init/do_mounts.c | 7 ++++--- 7 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/block/blk.h b/block/blk.h index 3f801f6e86f8..0bd4b58bcbaf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -381,7 +381,6 @@ static inline void hd_struct_put(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { - kfree(part->info); bdput(part->bdev); percpu_ref_exit(&part->ref); } diff --git a/block/genhd.c b/block/genhd.c index 5efb2df1f079..4273e89f07e8 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1010,7 +1010,8 @@ void __init printk_all_partitions(void) bdevt_str(part_devt(part), devt_buf), bdev_nr_sectors(part->bdev) >> 1, disk_name(disk, part->partno, name_buf), - part->info ? part->info->uuid : ""); + part->bdev->bd_meta_info ? + part->bdev->bd_meta_info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) printk(" driver: %s\n", diff --git a/block/partitions/core.c b/block/partitions/core.c index 460a745812c6..07df9ff55462 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -275,8 +275,9 @@ static int part_uevent(struct device *dev, struct kobj_uevent_env *env) struct hd_struct *part = dev_to_part(dev); add_uevent_var(env, "PARTN=%u", part->partno); - if (part->info && part->info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", part->info->volname); + if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", + part->bdev->bd_meta_info->volname); return 0; } @@ -422,13 +423,10 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, p->policy = get_disk_ro(disk); if (info) { - struct partition_meta_info *pinfo; - - pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) + err = -ENOMEM; + bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) goto out_bdput; - memcpy(pinfo, info, sizeof(*info)); - p->info = pinfo; } dname = dev_name(ddev); @@ -444,7 +442,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, err = blk_alloc_devt(p, &devt); if (err) - goto out_free_info; + goto out_bdput; pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ @@ -481,8 +479,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, kobject_uevent(&pdev->kobj, KOBJ_ADD); return p; -out_free_info: - kfree(p->info); out_bdput: bdput(bdev); out_free: diff --git a/fs/block_dev.c b/fs/block_dev.c index 0832c7830f3a..0770f654b09c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -785,6 +785,7 @@ static void bdev_free_inode(struct inode *inode) struct block_device *bdev = I_BDEV(inode); free_percpu(bdev->bd_stats); + kfree(bdev->bd_meta_info); kmem_cache_free(bdev_cachep, BDEV_I(inode)); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a690008f60cd..2f8ede04e5a9 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -49,6 +49,8 @@ struct block_device { /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; struct super_block *bd_fsfreeze_sb; + + struct partition_meta_info *bd_meta_info; } __randomize_layout; #define bdev_whole(_bdev) \ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 
50d27f5d38e2..30d7076155b4 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -57,7 +57,6 @@ struct hd_struct { struct device __dev; struct kobject *holder_dir; int policy, partno; - struct partition_meta_info *info; #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; #endif diff --git a/init/do_mounts.c b/init/do_mounts.c index 5879edf083b3..368ccb718501 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -79,8 +79,8 @@ static int match_dev_by_uuid(struct device *dev, const void *data) const struct uuidcmp *cmp = data; struct hd_struct *part = dev_to_part(dev); - if (!part->info || - strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) + if (!part->bdev->bd_meta_info || + strncasecmp(cmp->uuid, part->bdev->bd_meta_info->uuid, cmp->len)) return 0; return 1; } @@ -169,7 +169,8 @@ static int match_dev_by_label(struct device *dev, const void *data) const char *label = data; struct hd_struct *part = dev_to_part(dev); - if (!part->info || strcmp(label, part->info->volname)) + if (!part->bdev->bd_meta_info || + strcmp(label, part->bdev->bd_meta_info->volname)) return 0; return 1; } -- cgit From 1bdd5ae0251d678488dffcf455d4633c2beef1bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 19:00:13 +0100 Subject: block: move holder_dir to struct block_device Move the holder_dir field to struct block_device in preparation for killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 5 +++-- block/partitions/core.c | 8 ++++---- fs/block_dev.c | 11 +++++------ include/linux/blk_types.h | 1 + include/linux/genhd.h | 1 - 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 4273e89f07e8..0bd7026cee62 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -681,7 +681,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->part0.bdev->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (disk->flags & GENHD_FL_HIDDEN) { @@ -887,7 +888,7 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); - kobject_put(disk->part0.holder_dir); + kobject_put(disk->part0.bdev->bd_holder_dir); kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); diff --git a/block/partitions/core.c b/block/partitions/core.c index 07df9ff55462..c068471fa654 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -344,7 +344,7 @@ void delete_partition(struct hd_struct *part) */ get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[part->partno], NULL); - kobject_put(part->holder_dir); + kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); /* @@ -452,8 +452,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_put; err = -ENOMEM; - p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); - if (!p->holder_dir) + bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); @@ -487,7 +487,7 @@ out_free: out_remove_file: device_remove_file(pdev, &dev_attr_whole_disk); out_del: - kobject_put(p->holder_dir); + kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); diff --git a/fs/block_dev.c b/fs/block_dev.c index
0770f654b09c..381c22426f43 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1142,7 +1142,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) goto out_unlock; holder = bd_find_holder_disk(bdev, disk); @@ -1165,14 +1165,14 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) if (ret) goto out_free; - ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; /* * bdev could be deleted beneath us which would implicitly destroy * the holder directory. Hold on to it. */ - kobject_get(bdev->bd_part->holder_dir); + kobject_get(bdev->bd_holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; @@ -1207,9 +1207,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_part->holder_dir, - &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_part->holder_dir); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_holder_dir); list_del_init(&holder->list); kfree(holder); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2f8ede04e5a9..c0591e52d7d7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -35,6 +35,7 @@ struct block_device { #ifdef CONFIG_SYSFS struct list_head bd_holder_disks; #endif + struct kobject *bd_holder_dir; u8 bd_partno; struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 30d7076155b4..b4a5c05593b9 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -55,7 +55,6 @@ struct hd_struct { struct block_device *bdev; struct device __dev; - struct kobject *holder_dir; int policy, partno; #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; -- cgit From b309e9936347232c724eaa13f70533128b4864e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 16:28:47 +0100 Subject: block: move make_it_fail to struct block_device Move the make_it_fail flag to struct block_device and turn it into a bool in preparation of killing struct hd_struct.
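The type change is what lets the sysfs store helper drop its manual normalisation: assigning an int to a C bool already collapses any nonzero value to true. A sketch of the resulting fault-injection path, as seen in the hunks below (should_fail() is the generic helper from linux/fault-inject.h):

static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
{
	/* per-bdev opt-in flag first, then the global fault-injection
	 * throttling logic */
	return part->bdev->bd_make_it_fail &&
	       should_fail(&fail_make_request, bytes);
}

/* the sysfs store side shrinks to a plain assignment:
 * "p->bdev->bd_make_it_fail = i;" makes the old
 * "(i == 0) ? 0 : 1" dance implicit in the bool conversion. */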
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 3 ++- block/genhd.c | 4 ++-- include/linux/blk_types.h | 3 +++ include/linux/genhd.h | 3 --- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 9a3793d5ce38..9121390be97a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -668,7 +668,8 @@ __setup("fail_make_request=", setup_fail_make_request); static bool should_fail_request(struct hd_struct *part, unsigned int bytes) { - return part->make_it_fail && should_fail(&fail_make_request, bytes); + return part->bdev->bd_make_it_fail && + should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) diff --git a/block/genhd.c b/block/genhd.c index 0bd7026cee62..f9c957739d4b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1292,7 +1292,7 @@ ssize_t part_fail_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->make_it_fail); + return sprintf(buf, "%d\n", p->bdev->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, @@ -1303,7 +1303,7 @@ ssize_t part_fail_store(struct device *dev, int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 0 : 1; + p->bdev->bd_make_it_fail = i; return count; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c0591e52d7d7..b237f1e40814 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -52,6 +52,9 @@ struct block_device { struct super_block *bd_fsfreeze_sb; struct partition_meta_info *bd_meta_info; +#ifdef CONFIG_FAIL_MAKE_REQUEST + bool bd_make_it_fail; +#endif } __randomize_layout; #define bdev_whole(_bdev) \ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b4a5c05593b9..349cf6403ccd 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -56,9 +56,6 @@ struct hd_struct { struct block_device *bdev; struct device __dev; int policy, partno; -#ifdef CONFIG_FAIL_MAKE_REQUEST - int make_it_fail; -#endif struct rcu_work rcu_work; }; -- cgit From 83950d359010a493462d58c712b1124c877d1b3b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 16:36:02 +0100 Subject: block: move the policy field to struct block_device Move the policy field to struct block_device and rename it to the more descriptive bd_read_only. Also turn the field into a bool as it is used as such. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/genhd.c | 8 ++++---- block/ioctl.c | 2 +- block/partitions/core.c | 4 ++-- include/linux/blk_types.h | 1 + include/linux/genhd.h | 4 ++-- 6 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 9121390be97a..d64ffcb6f9ae 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -696,7 +696,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) { const int op = bio_op(bio); - if (part->policy && op_is_write(op)) { + if (part->bdev->bd_read_only && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) diff --git a/block/genhd.c b/block/genhd.c index f9c957739d4b..2db1204920a9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1687,14 +1687,14 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - if (disk->part0.policy != flag) { + if (disk->part0.bdev->bd_read_only != flag) { set_disk_ro_uevent(disk, flag); - disk->part0.policy = flag; + disk->part0.bdev->bd_read_only = flag; } disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - part->policy = flag; + part->bdev->bd_read_only = flag; disk_part_iter_exit(&piter); } @@ -1704,7 +1704,7 @@ int bdev_read_only(struct block_device *bdev) { if (!bdev) return 0; - return bdev->bd_part->policy; + return bdev->bd_read_only; } EXPORT_SYMBOL(bdev_read_only); diff --git a/block/ioctl.c b/block/ioctl.c index a6d8171221c7..d61d652078f4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -345,7 +345,7 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (ret) return ret; } - bdev->bd_part->policy = n; + bdev->bd_read_only = n; return 0; } diff --git a/block/partitions/core.c b/block/partitions/core.c index c068471fa654..060c1be13cd8 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -199,7 +199,7 @@ static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->policy ? 
1 : 0); + return sprintf(buf, "%d\n", p->bdev->bd_read_only); } static ssize_t part_alignment_offset_show(struct device *dev, @@ -420,7 +420,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); p->partno = partno; - p->policy = get_disk_ro(disk); + bdev->bd_read_only = get_disk_ro(disk); if (info) { err = -ENOMEM; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b237f1e40814..758cf71c9aa2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -23,6 +23,7 @@ struct block_device { sector_t bd_start_sect; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; + bool bd_read_only; /* read-only policy */ dev_t bd_dev; int bd_openers; struct inode * bd_inode; /* will die */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 349cf6403ccd..dcbf9ef7610e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -55,7 +55,7 @@ struct hd_struct { struct block_device *bdev; struct device __dev; - int policy, partno; + int partno; struct rcu_work rcu_work; }; @@ -278,7 +278,7 @@ extern void set_disk_ro(struct gendisk *disk, int flag); static inline int get_disk_ro(struct gendisk *disk) { - return disk->part0.policy; + return disk->part0.bdev->bd_read_only; } extern void disk_block_events(struct gendisk *disk); -- cgit From cb8432d650fe3be58bb962bc8e602dc405510327 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 18:47:17 +0100 Subject: block: allocate struct hd_struct as part of struct bdev_inode Allocate hd_struct together with struct block_device to pre-load the lifetime rule changes in preparation of merging the two structures. Note that part0 was previously embedded into struct gendisk, but is a separate allocation now, and already points to the block_device instead of the hd_struct. The lifetime of struct gendisk is still controlled by the struct device embedded in the part0 hd_struct. 
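Concretely, gendisk's part0 member turns from an embedded hd_struct into a pointer to the separately allocated block_device, so the change for callers is mechanical, as the hunks below show. A sketch of the pattern:

/* before: part0 embedded in the gendisk */
part_stat_read_accum(&disk->part0, sectors);

/* after: part0 is the block_device; chase bd_part wherever an
 * hd_struct is still expected */
part_stat_read_accum(disk->part0->bd_part, sectors);

/* bdev_alloc() now hands back the block_device together with its
 * hd_struct, so disk allocation sets up both objects at once */
disk->part0 = bdev_alloc(disk, 0);
if (!disk->part0)
	goto out_free_disk;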
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 16 ++++----- block/blk-flush.c | 2 +- block/blk-merge.c | 2 -- block/blk.h | 21 ------------ block/genhd.c | 50 +++++++++++----------------- block/partitions/core.c | 67 ++++---------------------------------- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_worker.c | 3 +- drivers/block/zram/zram_drv.c | 2 +- drivers/md/dm.c | 4 +-- drivers/md/md.c | 2 +- fs/block_dev.c | 39 +++++++--------------- include/linux/blk_types.h | 2 +- include/linux/genhd.h | 14 ++++---- include/linux/part_stat.h | 4 +-- 15 files changed, 61 insertions(+), 169 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index d64ffcb6f9ae..9ea70275fc1c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -714,7 +714,8 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_request(bio->bi_disk->part0->bd_part, + bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -831,7 +832,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + if (unlikely(bio_check_ro(bio, bio->bi_disk->part0->bd_part))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; @@ -1203,7 +1204,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return ret; if (rq->rq_disk && - should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) + should_fail_request(rq->rq_disk->part0->bd_part, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -1272,7 +1273,7 @@ again: __part_stat_add(part, io_ticks, end ? 
now - stamp : 1); } if (part->partno) { - part = &part_to_disk(part)->part0; + part = part_to_disk(part)->part0->bd_part; goto again; } } @@ -1309,8 +1310,6 @@ void blk_account_io_done(struct request *req, u64 now) part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); - - hd_struct_put(part); } } @@ -1354,7 +1353,7 @@ EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(&disk->part0, sectors, op); + return __part_start_io_acct(disk->part0->bd_part, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); @@ -1376,14 +1375,13 @@ void part_end_io_acct(struct hd_struct *part, struct bio *bio, unsigned long start_time) { __part_end_io_acct(part, bio_op(bio), start_time); - hd_struct_put(part); } EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { - __part_end_io_acct(&disk->part0, op, start_time); + __part_end_io_acct(disk->part0->bd_part, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); diff --git a/block/blk-flush.c b/block/blk-flush.c index e32958f0b687..fcd0a60574df 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -139,7 +139,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct hd_struct *part = &rq->rq_disk->part0; + struct hd_struct *part = rq->rq_disk->part0->bd_part; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); diff --git a/block/blk-merge.c b/block/blk-merge.c index bcf5e4580603..cb351ab9b77d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -683,8 +683,6 @@ static void blk_account_io_merge_request(struct request *req) part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); part_stat_unlock(); - - hd_struct_put(req->part); } } diff --git a/block/blk.h b/block/blk.h index 0bd4b58bcbaf..32ac41f7557f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -363,27 +363,6 @@ int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int disk_expand_part_tbl(struct gendisk *disk, int target); -int hd_ref_init(struct hd_struct *part); - -/* no need to get/put refcount of part0 */ -static inline int hd_struct_try_get(struct hd_struct *part) -{ - if (part->partno) - return percpu_ref_tryget_live(&part->ref); - return 1; -} - -static inline void hd_struct_put(struct hd_struct *part) -{ - if (part->partno) - percpu_ref_put(&part->ref); -} - -static inline void hd_free_part(struct hd_struct *part) -{ - bdput(part->bdev); - percpu_ref_exit(&part->ref); -} int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/block/genhd.c b/block/genhd.c index 2db1204920a9..c35b03dac5e5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -42,7 +42,7 @@ static void disk_release_events(struct gendisk *disk); void set_capacity(struct gendisk *disk, sector_t sectors) { - struct block_device *bdev = disk->part0.bdev; + struct block_device *bdev = disk->part0; spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); @@ -318,9 +318,7 @@ static inline int sector_in_part(struct hd_struct *part, sector_t sector) * primarily used for stats accounting. * * CONTEXT: - * RCU read locked. 
The returned partition pointer is always valid - * because its refcount is grabbed except for part0, which lifetime - * is same with the disk. + * RCU read locked. * * RETURNS: * Found partition on success, part0 is returned if no partition matches @@ -336,26 +334,19 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) ptbl = rcu_dereference(disk->part_tbl); part = rcu_dereference(ptbl->last_lookup); - if (part && sector_in_part(part, sector) && hd_struct_try_get(part)) + if (part && sector_in_part(part, sector)) goto out_unlock; for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); if (part && sector_in_part(part, sector)) { - /* - * only live partition can be cached for lookup, - * so use-after-free on cached & deleting partition - * can be avoided - */ - if (!hd_struct_try_get(part)) - break; rcu_assign_pointer(ptbl->last_lookup, part); goto out_unlock; } } - part = &disk->part0; + part = disk->part0->bd_part; out_unlock: rcu_read_unlock(); return part; @@ -681,8 +672,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - disk->part0.bdev->bd_holder_dir = - kobject_create_and_add("holders", &ddev->kobj); + disk->part0->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (disk->flags & GENHD_FL_HIDDEN) { @@ -748,7 +739,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(&disk->part0, &devt); + retval = blk_alloc_devt(disk->part0->bd_part, &devt); if (retval) { WARN_ON(1); return; @@ -775,7 +766,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - bdev_add(disk->part0.bdev, devt); + bdev_add(disk->part0, devt); } register_disk(parent, disk, groups); if (register_queue) @@ -888,11 +879,11 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); - kobject_put(disk->part0.bdev->bd_holder_dir); + kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); - part_stat_set_all(&disk->part0, 0); - disk->part0.bdev->bd_stamp = 0; + part_stat_set_all(disk->part0->bd_part, 0); + disk->part0->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -1005,7 +996,7 @@ void __init printk_all_partitions(void) */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == &disk->part0; + bool is_part0 = part == disk->part0->bd_part; printk("%s%s %10llu %s %s", is_part0 ? 
"" : " ", bdevt_str(part_devt(part), devt_buf), @@ -1460,7 +1451,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); - hd_free_part(&disk->part0); + bdput(disk->part0); if (disk->queue) blk_put_queue(disk->queue); kfree(disk); @@ -1626,8 +1617,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; - disk->part0.bdev = bdev_alloc(disk, 0); - if (!disk->part0.bdev) + disk->part0 = bdev_alloc(disk, 0); + if (!disk->part0) goto out_free_disk; disk->node_id = node_id; @@ -1635,10 +1626,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], &disk->part0); - - if (hd_ref_init(&disk->part0)) - goto out_bdput; + rcu_assign_pointer(ptbl->part[0], disk->part0->bd_part); disk->minors = minors; rand_initialize_disk(disk); @@ -1648,7 +1636,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) return disk; out_bdput: - bdput(disk->part0.bdev); + bdput(disk->part0); out_free_disk: kfree(disk); return NULL; @@ -1687,9 +1675,9 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - if (disk->part0.bdev->bd_read_only != flag) { + if (disk->part0->bd_read_only != flag) { set_disk_ro_uevent(disk, flag); - disk->part0.bdev->bd_read_only = flag; + disk->part0->bd_read_only = flag; } disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); diff --git a/block/partitions/core.c b/block/partitions/core.c index 060c1be13cd8..6d1fca193cbd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -265,9 +265,9 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); + blk_free_devt(dev->devt); - hd_free_part(p); - kfree(p); + bdput(p->bdev); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) @@ -288,46 +288,6 @@ struct device_type part_type = { .uevent = part_uevent, }; -static void hd_struct_free_work(struct work_struct *work) -{ - struct hd_struct *part = - container_of(to_rcu_work(work), struct hd_struct, rcu_work); - struct gendisk *disk = part_to_disk(part); - - /* - * Release the disk reference acquired in delete_partition here. - * We can't release it in hd_struct_free because the final put_device - * needs process context and thus can't be run directly from a - * percpu_ref ->release handler. - */ - put_device(disk_to_dev(disk)); - - part->bdev->bd_start_sect = 0; - bdev_set_nr_sectors(part->bdev, 0); - part_stat_set_all(part, 0); - put_device(part_to_dev(part)); -} - -static void hd_struct_free(struct percpu_ref *ref) -{ - struct hd_struct *part = container_of(ref, struct hd_struct, ref); - struct gendisk *disk = part_to_disk(part); - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(ptbl->last_lookup, NULL); - - INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work); - queue_rcu_work(system_wq, &part->rcu_work); -} - -int hd_ref_init(struct hd_struct *part) -{ - if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL)) - return -ENOMEM; - return 0; -} - /* * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. 
@@ -342,8 +302,8 @@ void delete_partition(struct hd_struct *part) * ->part_tbl is referenced in this part's release handler, so * we have to hold the disk device */ - get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[part->partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); @@ -353,7 +313,7 @@ void delete_partition(struct hd_struct *part) */ remove_inode_hash(part->bdev->bd_inode); - percpu_ref_kill(&part->ref); + put_device(part_to_dev(part)); } static ssize_t whole_disk_show(struct device *dev, @@ -406,15 +366,11 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (ptbl->part[partno]) return ERR_PTR(-EBUSY); - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return ERR_PTR(-EBUSY); - bdev = bdev_alloc(disk, partno); if (!bdev) - goto out_free; - p->bdev = bdev; + return ERR_PTR(-ENOMEM); + p = bdev->bd_part; pdev = part_to_dev(p); bdev->bd_start_sect = start; @@ -463,13 +419,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_del; } - err = hd_ref_init(p); - if (err) { - if (flags & ADDPART_FLAG_WHOLEDISK) - goto out_remove_file; - goto out_del; - } - /* everything is up and running, commence */ bdev_add(bdev, devt); rcu_assign_pointer(ptbl->part[partno], p); @@ -481,11 +430,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_bdput: bdput(bdev); -out_free: - kfree(p); return ERR_PTR(err); -out_remove_file: - device_remove_file(pdev, &dev_attr_whole_disk); out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index dc333dbe5232..9e5c2fdfda36 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2802,7 +2802,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) if (c_min_rate == 0) return false; - curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index ba56f3f05312..343f56b86bb7 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1678,7 +1678,8 @@ void drbd_rs_controller_reset(struct drbd_device *device) atomic_set(&device->rs_sect_in, 0); atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; - device->rs_last_events = (int)part_stat_read_accum(&disk->part0, sectors); + device->rs_last_events = + (int)part_stat_read_accum(disk->part0->bd_part, sectors); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context. 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b5f68951c9d2..6d84876a9cd0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1687,7 +1687,7 @@ static void zram_reset_device(struct zram *zram) zram->disksize = 0; set_capacity_and_notify(zram->disk, 0); - part_stat_set_all(&zram->disk->part0, 0); + part_stat_set_all(zram->disk->part0->bd_part, 0); up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 48051db006f3..1b2db4d530ea 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1607,7 +1607,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, * (by eliminating DM's splitting and just using bio_split) */ part_stat_lock(); - __dm_part_stat_sub(&dm_disk(md)->part0, + __dm_part_stat_sub(dm_disk(md)->part0->bd_part, sectors[op_stat_group(bio_op(bio))], ci.sector_count); part_stat_unlock(); @@ -2242,7 +2242,7 @@ EXPORT_SYMBOL_GPL(dm_put); static bool md_in_flight_bios(struct mapped_device *md) { int cpu; - struct hd_struct *part = &dm_disk(md)->part0; + struct hd_struct *part = dm_disk(md)->part0->bd_part; long sum = 0; for_each_possible_cpu(cpu) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 7ce6047c856e..3696c2d77a4d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8441,7 +8441,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; - curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and diff --git a/fs/block_dev.c b/fs/block_dev.c index 381c22426f43..61cf33b6284f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -39,6 +39,7 @@ struct bdev_inode { struct block_device bdev; + struct hd_struct hd; struct inode vfs_inode; }; @@ -886,6 +887,9 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) iput(inode); return NULL; } + bdev->bd_part = &BDEV_I(inode)->hd; + memset(bdev->bd_part, 0, sizeof(*bdev->bd_part)); + bdev->bd_part->bdev = bdev; return bdev; } @@ -1280,15 +1284,10 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); static int __blkdev_get(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; - int ret; + int ret = 0; if (!bdev->bd_openers) { if (!bdev_is_partition(bdev)) { - ret = -ENXIO; - bdev->bd_part = disk_get_part(disk, 0); - if (!bdev->bd_part) - goto out_clear; - ret = 0; if (disk->fops->open) ret = disk->fops->open(bdev, mode); @@ -1307,7 +1306,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) - goto out_clear; + return ret; } else { struct block_device *whole = bdget_disk(disk, 0); @@ -1316,18 +1315,16 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (ret) { mutex_unlock(&whole->bd_mutex); bdput(whole); - goto out_clear; + return ret; } whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || - !bdev->bd_part || !bdev_nr_sectors(bdev)) { + !bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); - ret = -ENXIO; - goto out_clear; + return -ENXIO; } set_init_blocksize(bdev); } @@ -1336,7 +1333,6 @@ static int __blkdev_get(struct block_device 
*bdev, fmode_t mode) bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { if (!bdev_is_partition(bdev)) { - ret = 0; if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ @@ -1349,11 +1345,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) } bdev->bd_openers++; return 0; - - out_clear: - disk_put_part(bdev->bd_part); - bdev->bd_part = NULL; - return ret; } struct block_device *blkdev_get_no_open(dev_t dev) @@ -1580,18 +1571,12 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) sync_blockdev(bdev); kill_bdev(bdev); bdev_write_inode(bdev); - - if (!bdev_is_partition(bdev) && disk->fops->release) - disk->fops->release(disk, mode); - - disk_put_part(bdev->bd_part); - bdev->bd_part = NULL; if (bdev_is_partition(bdev)) victim = bdev_whole(bdev); - } else { - if (!bdev_is_partition(bdev) && disk->fops->release) - disk->fops->release(disk, mode); } + + if (!bdev_is_partition(bdev) && disk->fops->release) + disk->fops->release(disk, mode); mutex_unlock(&bdev->bd_mutex); if (victim) { __blkdev_put(victim, mode, 1); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 758cf71c9aa2..6edea5c16259 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -59,7 +59,7 @@ struct block_device { } __randomize_layout; #define bdev_whole(_bdev) \ - ((_bdev)->bd_disk->part0.bdev) + ((_bdev)->bd_disk->part0) #define bdev_kobj(_bdev) \ (&part_to_dev((_bdev)->bd_part)->kobj) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index dcbf9ef7610e..df7319da013c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -19,11 +19,12 @@ #include #include -#define dev_to_disk(device) container_of((device), struct gendisk, part0.__dev) #define dev_to_part(device) container_of((device), struct hd_struct, __dev) -#define disk_to_dev(disk) (&(disk)->part0.__dev) #define part_to_dev(part) (&((part)->__dev)) +#define dev_to_disk(device) (dev_to_part(device)->bdev->bd_disk) +#define disk_to_dev(disk) (part_to_dev((disk)->part0->bd_part)) + extern const struct device_type disk_type; extern struct device_type part_type; extern struct class block_class; @@ -51,12 +52,9 @@ struct partition_meta_info { }; struct hd_struct { - struct percpu_ref ref; - struct block_device *bdev; struct device __dev; int partno; - struct rcu_work rcu_work; }; /** @@ -168,7 +166,7 @@ struct gendisk { * helpers. 
*/ struct disk_part_tbl __rcu *part_tbl; - struct hd_struct part0; + struct block_device *part0; const struct block_device_operations *fops; struct request_queue *queue; @@ -278,7 +276,7 @@ extern void set_disk_ro(struct gendisk *disk, int flag); static inline int get_disk_ro(struct gendisk *disk) { - return disk->part0.bdev->bd_read_only; + return disk->part0->bd_read_only; } extern void disk_block_events(struct gendisk *disk); @@ -302,7 +300,7 @@ static inline sector_t bdev_nr_sectors(struct block_device *bdev) static inline sector_t get_capacity(struct gendisk *disk) { - return bdev_nr_sectors(disk->part0.bdev); + return bdev_nr_sectors(disk->part0); } int bdev_disk_changed(struct block_device *bdev, bool invalidate); diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 87ad60106e1d..680de036691e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -59,8 +59,8 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ if ((part)->partno) \ - __part_stat_add(&part_to_disk((part))->part0, \ - field, addnd); \ + __part_stat_add(part_to_disk((part))->part0->bd_part, \ + field, addnd); \ } while (0) #define part_stat_dec(part, field) \ -- cgit From 8446fe9255be821cb38ffd306d7e8edc4b9ea662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:36:54 +0100 Subject: block: switch partition lookup to use struct block_device Use struct block_device to lookup partitions on a disk. This removes all usage of struct hd_struct from the I/O path. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Coly Li [bcache] Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- block/bio.c | 4 +-- block/blk-core.c | 66 +++++++++++++++++--------------------- block/blk-flush.c | 2 +- block/blk-mq.c | 9 +++--- block/blk-mq.h | 7 ++-- block/blk.h | 4 +-- block/genhd.c | 57 +++++++++++++++++--------------- block/partitions/core.c | 7 ++-- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_worker.c | 2 +- drivers/block/zram/zram_drv.c | 2 +- drivers/md/bcache/request.c | 4 +-- drivers/md/dm.c | 4 +-- drivers/md/md.c | 4 +-- drivers/nvme/target/admin-cmd.c | 20 ++++++------ fs/ext4/super.c | 18 ++++------- fs/ext4/sysfs.c | 10 ++---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 6 ++-- include/linux/blkdev.h | 8 ++--- include/linux/genhd.h | 4 +-- include/linux/part_stat.h | 17 +++++----- 22 files changed, 122 insertions(+), 137 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 669bb47a3198..ebb18136b86f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -608,12 +608,12 @@ void bio_truncate(struct bio *bio, unsigned new_size) void guard_bio_eod(struct bio *bio) { sector_t maxsector; - struct hd_struct *part; + struct block_device *part; rcu_read_lock(); part = __disk_get_part(bio->bi_disk, bio->bi_partno); if (part) - maxsector = bdev_nr_sectors(part->bdev); + maxsector = bdev_nr_sectors(part); else maxsector = get_capacity(bio->bi_disk); rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 9ea70275fc1c..cee568389b7e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -666,10 +666,9 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static bool should_fail_request(struct hd_struct *part, unsigned int bytes) +static bool should_fail_request(struct block_device *part, unsigned int bytes) 
{ - return part->bdev->bd_make_it_fail && - should_fail(&fail_make_request, bytes); + return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) @@ -684,7 +683,7 @@ late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool should_fail_request(struct hd_struct *part, +static inline bool should_fail_request(struct block_device *part, unsigned int bytes) { return false; @@ -692,11 +691,11 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +static inline bool bio_check_ro(struct bio *bio, struct block_device *part) { const int op = bio_op(bio); - if (part->bdev->bd_read_only && op_is_write(op)) { + if (part->bd_read_only && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) @@ -704,7 +703,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) WARN_ONCE(1, "Trying to write to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), part->partno); + bio_devname(bio, b), part->bd_partno); /* Older lvm-tools actually trigger this */ return false; } @@ -714,8 +713,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(bio->bi_disk->part0->bd_part, - bio->bi_iter.bi_size)) + if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -744,7 +742,7 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) */ static inline int blk_partition_remap(struct bio *bio) { - struct hd_struct *p; + struct block_device *p; int ret = -EIO; rcu_read_lock(); @@ -757,12 +755,12 @@ static inline int blk_partition_remap(struct bio *bio) goto out; if (bio_sectors(bio)) { - if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) + if (bio_check_eod(bio, bdev_nr_sectors(p))) goto out; - bio->bi_iter.bi_sector += p->bdev->bd_start_sect; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector += p->bd_start_sect; + trace_block_bio_remap(bio->bi_disk->queue, bio, p->bd_dev, bio->bi_iter.bi_sector - - p->bdev->bd_start_sect); + p->bd_start_sect); } bio->bi_partno = 0; ret = 0; @@ -832,7 +830,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (unlikely(bio_check_ro(bio, bio->bi_disk->part0->bd_part))) + if (unlikely(bio_check_ro(bio, bio->bi_disk->part0))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; @@ -1204,7 +1202,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return ret; if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0->bd_part, blk_rq_bytes(rq))) + should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -1263,17 +1261,18 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) +static void update_io_ticks(struct block_device *part, unsigned long now, + bool end) { unsigned long stamp; again: - stamp = READ_ONCE(part->bdev->bd_stamp); + stamp = READ_ONCE(part->bd_stamp); if (unlikely(stamp != now)) { - if (likely(cmpxchg(&part->bdev->bd_stamp, stamp, now) == stamp)) + if 
(likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? now - stamp : 1); } - if (part->partno) { - part = part_to_disk(part)->part0->bd_part; + if (part->bd_partno) { + part = bdev_whole(part); goto again; } } @@ -1282,11 +1281,9 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; part_stat_lock(); - part = req->part; - part_stat_add(part, sectors[sgrp], bytes >> 9); + part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -1301,14 +1298,11 @@ void blk_account_io_done(struct request *req, u64 now) if (req->part && blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; part_stat_lock(); - part = req->part; - - update_io_ticks(part, jiffies, true); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); } } @@ -1325,7 +1319,7 @@ void blk_account_io_start(struct request *rq) part_stat_unlock(); } -static unsigned long __part_start_io_acct(struct hd_struct *part, +static unsigned long __part_start_io_acct(struct block_device *part, unsigned int sectors, unsigned int op) { const int sgrp = op_stat_group(op); @@ -1341,7 +1335,7 @@ static unsigned long __part_start_io_acct(struct hd_struct *part, return now; } -unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, +unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part, struct bio *bio) { *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); @@ -1353,11 +1347,11 @@ EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(disk->part0->bd_part, sectors, op); + return __part_start_io_acct(disk->part0, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); -static void __part_end_io_acct(struct hd_struct *part, unsigned int op, +static void __part_end_io_acct(struct block_device *part, unsigned int op, unsigned long start_time) { const int sgrp = op_stat_group(op); @@ -1371,7 +1365,7 @@ static void __part_end_io_acct(struct hd_struct *part, unsigned int op, part_stat_unlock(); } -void part_end_io_acct(struct hd_struct *part, struct bio *bio, +void part_end_io_acct(struct block_device *part, struct bio *bio, unsigned long start_time) { __part_end_io_acct(part, bio_op(bio), start_time); @@ -1381,7 +1375,7 @@ EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { - __part_end_io_acct(disk->part0->bd_part, op, start_time); + __part_end_io_acct(disk->part0, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); diff --git a/block/blk-flush.c b/block/blk-flush.c index fcd0a60574df..9507dcdd5881 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -139,7 +139,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct hd_struct *part = rq->rq_disk->part0->bd_part; + struct block_device *part = rq->rq_disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); diff --git a/block/blk-mq.c b/block/blk-mq.c index 55bcee5dc032..a2593748fa53 100644 --- 
a/block/blk-mq.c +++ b/block/blk-mq.c @@ -95,7 +95,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, } struct mq_inflight { - struct hd_struct *part; + struct block_device *part; unsigned int inflight[2]; }; @@ -111,7 +111,8 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, return true; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part) { struct mq_inflight mi = { .part = part }; @@ -120,8 +121,8 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) return mi.inflight[0] + mi.inflight[1]; } -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; diff --git a/block/blk-mq.h b/block/blk-mq.h index a52703c98b77..c696515766c7 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -182,9 +182,10 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part); +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q) { diff --git a/block/blk.h b/block/blk.h index 32ac41f7557f..d5bf8f3a0781 100644 --- a/block/blk.h +++ b/block/blk.h @@ -215,7 +215,7 @@ static inline void elevator_exit(struct request_queue *q, __elevator_exit(q, e); } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); +struct block_device *__disk_get_part(struct gendisk *disk, int partno); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -348,7 +348,7 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q); static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} #endif -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); +struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); int blk_alloc_devt(struct hd_struct *part, dev_t *devt); void blk_free_devt(dev_t devt); diff --git a/block/genhd.c b/block/genhd.c index c35b03dac5e5..ed06466b305d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -126,7 +126,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } } -static unsigned int part_in_flight(struct hd_struct *part) +static unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; @@ -141,7 +141,8 @@ static unsigned int part_in_flight(struct hd_struct *part) return inflight; } -static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) +static void part_in_flight_rw(struct block_device *part, + unsigned int inflight[2]) { int cpu; @@ -157,7 +158,7 @@ static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) inflight[1] = 0; } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) +struct block_device *__disk_get_part(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); @@ -182,15 +183,21 @@ struct hd_struct *__disk_get_part(struct gendisk 
*disk, int partno) */ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) { + struct block_device *bdev; struct hd_struct *part; rcu_read_lock(); - part = __disk_get_part(disk, partno); - if (part) - get_device(part_to_dev(part)); + bdev = __disk_get_part(disk, partno); + if (!bdev) + goto fail; + part = bdev->bd_part; + if (!kobject_get_unless_zero(&part_to_dev(part)->kobj)) + goto fail; rcu_read_unlock(); - return part; +fail: + rcu_read_unlock(); + return NULL; } /** @@ -264,19 +271,19 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) /* iterate to the next partition */ for (; piter->idx != end; piter->idx += inc) { - struct hd_struct *part; + struct block_device *part; part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!bdev_nr_sectors(part->bdev) && + if (!bdev_nr_sectors(part) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) continue; - get_device(part_to_dev(part)); - piter->part = part; + get_device(part_to_dev(part->bd_part)); + piter->part = part->bd_part; piter->idx += inc; break; } @@ -303,10 +310,10 @@ void disk_part_iter_exit(struct disk_part_iter *piter) } EXPORT_SYMBOL_GPL(disk_part_iter_exit); -static inline int sector_in_part(struct hd_struct *part, sector_t sector) +static inline int sector_in_part(struct block_device *part, sector_t sector) { - return part->bdev->bd_start_sect <= sector && - sector < part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev); + return part->bd_start_sect <= sector && + sector < part->bd_start_sect + bdev_nr_sectors(part); } /** @@ -324,10 +331,10 @@ static inline int sector_in_part(struct hd_struct *part, sector_t sector) * Found partition on success, part0 is returned if no partition matches * or the matched partition is being deleted. 
*/ -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) +struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) { struct disk_part_tbl *ptbl; - struct hd_struct *part; + struct block_device *part; int i; rcu_read_lock(); @@ -346,7 +353,7 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) } } - part = disk->part0->bd_part; + part = disk->part0; out_unlock: rcu_read_unlock(); return part; @@ -882,7 +889,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); - part_stat_set_all(disk->part0->bd_part, 0); + part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); @@ -1189,9 +1196,9 @@ ssize_t part_stat_show(struct device *dev, part_stat_read_all(p, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p); + inflight = blk_mq_in_flight(q, p->bdev); else - inflight = part_in_flight(p); + inflight = part_in_flight(p->bdev); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1231,9 +1238,9 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, p, inflight); + blk_mq_in_flight_rw(q, p->bdev, inflight); else - part_in_flight_rw(p, inflight); + part_in_flight_rw(p->bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1506,9 +1513,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) while ((hd = disk_part_iter_next(&piter))) { part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd); + inflight = blk_mq_in_flight(gp->queue, hd->bdev); else - inflight = part_in_flight(hd); + inflight = part_in_flight(hd->bdev); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " @@ -1626,7 +1633,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], disk->part0->bd_part); + rcu_assign_pointer(ptbl->part[0], disk->part0); disk->minors = minors; rand_initialize_disk(disk); diff --git a/block/partitions/core.c b/block/partitions/core.c index 6d1fca193cbd..c2f6721633b8 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -298,12 +298,9 @@ void delete_partition(struct hd_struct *part) struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - /* - * ->part_tbl is referenced in this part's release handler, so - * we have to hold the disk device - */ rcu_assign_pointer(ptbl->part[part->partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); + kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); @@ -421,7 +418,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, /* everything is up and running, commence */ bdev_add(bdev, devt); - rcu_assign_pointer(ptbl->part[partno], p); + rcu_assign_pointer(ptbl->part[partno], bdev); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 9e5c2fdfda36..09c86ef3f0fd 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2802,7 +2802,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) if (c_min_rate == 0) return false; - curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - + curr_events = 
(int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 343f56b86bb7..02044ab7f767 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1679,7 +1679,7 @@ void drbd_rs_controller_reset(struct drbd_device *device) atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; device->rs_last_events = - (int)part_stat_read_accum(disk->part0->bd_part, sectors); + (int)part_stat_read_accum(disk->part0, sectors); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6d84876a9cd0..dc8957d173d3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1687,7 +1687,7 @@ static void zram_reset_device(struct zram *zram) zram->disksize = 0; set_capacity_and_notify(zram->disk, 0); - part_stat_set_all(zram->disk->part0->bd_part, 0); + part_stat_set_all(zram->disk->part0, 0); up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index afac8d07c1bd..85b1f2a9b72d 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -475,7 +475,7 @@ struct search { unsigned int read_dirty_data:1; unsigned int cache_missed:1; - struct hd_struct *part; + struct block_device *part; unsigned long start_time; struct btree_op op; @@ -1073,7 +1073,7 @@ struct detached_dev_io_private { unsigned long start_time; bio_end_io_t *bi_end_io; void *bi_private; - struct hd_struct *part; + struct block_device *part; }; static void detached_dev_end_io(struct bio *bio) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1b2db4d530ea..176adcff56b3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1607,7 +1607,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, * (by eliminating DM's splitting and just using bio_split) */ part_stat_lock(); - __dm_part_stat_sub(dm_disk(md)->part0->bd_part, + __dm_part_stat_sub(dm_disk(md)->part0, sectors[op_stat_group(bio_op(bio))], ci.sector_count); part_stat_unlock(); @@ -2242,7 +2242,7 @@ EXPORT_SYMBOL_GPL(dm_put); static bool md_in_flight_bios(struct mapped_device *md) { int cpu; - struct hd_struct *part = dm_disk(md)->part0->bd_part; + struct block_device *part = dm_disk(md)->part0; long sum = 0; for_each_possible_cpu(cpu) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 3696c2d77a4d..0065736f05b4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -464,7 +464,7 @@ struct md_io { bio_end_io_t *orig_bi_end_io; void *orig_bi_private; unsigned long start_time; - struct hd_struct *part; + struct block_device *part; }; static void md_end_io(struct bio *bio) @@ -8441,7 +8441,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; - curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index dca34489a1dc..8d90235e4fcc 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ 
-89,12 +89,12 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, if (!ns->bdev) goto out; - host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); - data_units_read = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, - sectors[READ]), 1000); - host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); - data_units_written = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, - sectors[WRITE]), 1000); + host_reads = part_stat_read(ns->bdev, ios[READ]); + data_units_read = + DIV_ROUND_UP(part_stat_read(ns->bdev, sectors[READ]), 1000); + host_writes = part_stat_read(ns->bdev, ios[WRITE]); + data_units_written = + DIV_ROUND_UP(part_stat_read(ns->bdev, sectors[WRITE]), 1000); put_unaligned_le64(host_reads, &slog->host_reads[0]); put_unaligned_le64(data_units_read, &slog->data_units_read[0]); @@ -120,12 +120,12 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, /* we don't have the right data for file backed ns */ if (!ns->bdev) continue; - host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); + host_reads += part_stat_read(ns->bdev, ios[READ]); data_units_read += DIV_ROUND_UP( - part_stat_read(ns->bdev->bd_part, sectors[READ]), 1000); - host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); + part_stat_read(ns->bdev, sectors[READ]), 1000); + host_writes += part_stat_read(ns->bdev, ios[WRITE]); data_units_written += DIV_ROUND_UP( - part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000); + part_stat_read(ns->bdev, sectors[WRITE]), 1000); } put_unaligned_le64(host_reads, &slog->host_reads[0]); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6633b20224d5..c303a0ff0b17 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4048,9 +4048,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; - if (sb->s_bdev->bd_part) - sbi->s_sectors_written_start = - part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]); + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); @@ -5509,15 +5508,10 @@ static int ext4_commit_super(struct super_block *sb, int sync) */ if (!(sb->s_flags & SB_RDONLY)) ext4_update_tstamp(es, s_wtime); - if (sb->s_bdev->bd_part) - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); - else - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) ext4_free_blocks_count_set(es, EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 4e27fe6ed3ae..075aa3a19ff5 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -62,11 +62,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%lu\n", - (part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - + (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); } @@ -74,12 +71,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char 
*buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - EXT4_SB(sb)->s_sectors_written_start) >> 1))); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cb700d797296..49681a8d2b14 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1675,7 +1675,7 @@ static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) * and the return value is in kbytes. s is of struct f2fs_sb_info. */ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \ + (((u64)part_stat_read((s)->sb->s_bdev, sectors[STAT_WRITE]) - \ (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d4e7fab352ba..af9f449da64b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3700,10 +3700,8 @@ try_onemore: } /* For write statistics */ - if (sb->s_bdev->bd_part) - sbi->sectors_written_start = - (u64)part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]); + sbi->sectors_written_start = + (u64)part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 619adea57098..1d4be1fc6007 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -191,7 +191,7 @@ struct request { }; struct gendisk *rq_disk; - struct hd_struct *part; + struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. 
*/ u64 alloc_time_ns; @@ -1943,9 +1943,9 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); -unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, - struct bio *bio); -void part_end_io_acct(struct hd_struct *part, struct bio *bio, +unsigned long part_start_io_acct(struct gendisk *disk, + struct block_device **part, struct bio *bio); +void part_end_io_acct(struct block_device *part, struct bio *bio, unsigned long start_time); /** diff --git a/include/linux/genhd.h b/include/linux/genhd.h index df7319da013c..fe6fee77e2b9 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -131,8 +131,8 @@ enum { struct disk_part_tbl { struct rcu_head rcu_head; int len; - struct hd_struct __rcu *last_lookup; - struct hd_struct __rcu *part[]; + struct block_device __rcu *last_lookup; + struct block_device __rcu *part[]; }; struct disk_events; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 680de036691e..d2558121d48c 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -25,26 +25,26 @@ struct disk_stats { #define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ - (per_cpu_ptr((part)->bdev->bd_stats, (cpu))->field) + (per_cpu_ptr((part)->bd_stats, (cpu))->field) #define part_stat_get(part, field) \ part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ - typeof((part)->bdev->bd_stats->field) res = 0; \ + typeof((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ - res += per_cpu_ptr((part)->bdev->bd_stats, _cpu)->field; \ + res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ res; \ }) -static inline void part_stat_set_all(struct hd_struct *part, int value) +static inline void part_stat_set_all(struct block_device *part, int value) { int i; for_each_possible_cpu(i) - memset(per_cpu_ptr(part->bdev->bd_stats, i), value, + memset(per_cpu_ptr(part->bd_stats, i), value, sizeof(struct disk_stats)); } @@ -54,13 +54,12 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ - __this_cpu_add((part)->bdev->bd_stats->field, addnd) + __this_cpu_add((part)->bd_stats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ - if ((part)->partno) \ - __part_stat_add(part_to_disk((part))->part0->bd_part, \ - field, addnd); \ + if ((part)->bd_partno) \ + __part_stat_add(bdev_whole(part), field, addnd); \ } while (0) #define part_stat_dec(part, field) \ -- cgit From 41e5c81984eac8ce87f2b4f57fec0bd90a049b2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:37:14 +0100 Subject: block: remove the partno field from struct hd_struct Just use the bd_partno field in struct block_device everywhere. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 12 ++++++------ block/partitions/core.c | 9 ++++----- include/linux/genhd.h | 1 - init/do_mounts.c | 2 +- 4 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index ed06466b305d..b7e39b41a275 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -589,8 +589,8 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) int idx; /* in consecutive minor range? 
*/ - if (part->partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + part->partno); + if (part->bdev->bd_partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + part->bdev->bd_partno); return 0; } @@ -864,7 +864,7 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->partno); + invalidate_partition(disk, part->bdev->bd_partno); delete_partition(part); } disk_part_iter_exit(&piter); @@ -1008,7 +1008,7 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), bdev_nr_sectors(part->bdev) >> 1, - disk_name(disk, part->partno, name_buf), + disk_name(disk, part->bdev->bd_partno, name_buf), part->bdev->bd_meta_info ? part->bdev->bd_meta_info->uuid : ""); if (is_part0) { @@ -1102,7 +1102,7 @@ static int show_partition(struct seq_file *seqf, void *v) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), bdev_nr_sectors(part->bdev) >> 1, - disk_name(sgp, part->partno, buf)); + disk_name(sgp, part->bdev->bd_partno, buf)); disk_part_iter_exit(&piter); return 0; @@ -1525,7 +1525,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %u" "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->partno, buf), + disk_name(gp, hd->bdev->bd_partno, buf), stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], diff --git a/block/partitions/core.c b/block/partitions/core.c index c2f6721633b8..6db9ca8b722d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -184,7 +184,7 @@ static ssize_t part_partition_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->partno); + return sprintf(buf, "%d\n", p->bdev->bd_partno); } static ssize_t part_start_show(struct device *dev, @@ -274,7 +274,7 @@ static int part_uevent(struct device *dev, struct kobj_uevent_env *env) { struct hd_struct *part = dev_to_part(dev); - add_uevent_var(env, "PARTN=%u", part->partno); + add_uevent_var(env, "PARTN=%u", part->bdev->bd_partno); if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bdev->bd_meta_info->volname); @@ -298,7 +298,7 @@ void delete_partition(struct hd_struct *part) struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[part->partno], NULL); + rcu_assign_pointer(ptbl->part[part->bdev->bd_partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->bdev->bd_holder_dir); @@ -372,7 +372,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - p->partno = partno; bdev->bd_read_only = get_disk_ro(disk); if (info) { @@ -445,7 +444,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { - if (part->partno == skip_partno || + if (part->bdev->bd_partno == skip_partno || start >= part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev) || start + length <= part->bdev->bd_start_sect) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index fe6fee77e2b9..3c13d4708e3f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -54,7 +54,6 @@ struct partition_meta_info { struct hd_struct { struct 
block_device *bdev; struct device __dev; - int partno; }; /** diff --git a/init/do_mounts.c b/init/do_mounts.c index 368ccb718501..86bef93e72eb 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -136,7 +136,7 @@ static dev_t devt_from_partuuid(const char *uuid_str) struct hd_struct *part; part = disk_get_part(dev_to_disk(dev), - dev_to_part(dev)->partno + offset); + dev_to_part(dev)->bdev->bd_partno + offset); if (part) { devt = part_devt(part); put_device(part_to_dev(part)); -- cgit From ad1eaa5344b293552b6ba43f5709c76a9aa14d17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:52:59 +0100 Subject: block: switch disk_part_iter_* to use a struct block_device Switch the partition iter infrastructure to iterate over block_device references instead of hd_struct ones mostly used to get at the block_device. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 59 ++++++++++++++++++++++++----------------------- block/partitions/core.c | 13 +++++------ drivers/s390/block/dasd.c | 8 +++---- include/linux/genhd.h | 4 ++-- 4 files changed, 42 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 452f7c646e02..2d34dd2da4e9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -244,7 +244,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_init); * CONTEXT: * Don't care. */ -struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) +struct block_device *disk_part_iter_next(struct disk_part_iter *piter) { struct disk_part_tbl *ptbl; int inc, end; @@ -282,8 +282,9 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) piter->idx == 0)) continue; - get_device(part_to_dev(part->bd_part)); - piter->part = part->bd_part; + piter->part = bdgrab(part); + if (!piter->part) + continue; piter->idx += inc; break; } @@ -305,7 +306,8 @@ EXPORT_SYMBOL_GPL(disk_part_iter_next); */ void disk_part_iter_exit(struct disk_part_iter *piter) { - disk_put_part(piter->part); + if (piter->part) + bdput(piter->part); piter->part = NULL; } EXPORT_SYMBOL_GPL(disk_part_iter_exit); @@ -346,7 +348,6 @@ struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); - if (part && sector_in_part(part, sector)) { rcu_assign_pointer(ptbl->last_lookup, part); goto out_unlock; @@ -647,7 +648,7 @@ static void register_disk(struct device *parent, struct gendisk *disk, { struct device *ddev = disk_to_dev(disk); struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; int err; ddev->parent = parent; @@ -697,7 +698,7 @@ static void register_disk(struct device *parent, struct gendisk *disk, /* announce possible partitions */ disk_part_iter_init(&piter, disk, 0); while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + kobject_uevent(bdev_kobj(part), KOBJ_ADD); disk_part_iter_exit(&piter); if (disk->queue->backing_dev_info->dev) { @@ -837,7 +838,7 @@ static void invalidate_partition(struct block_device *bdev) void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; might_sleep(); @@ -857,8 +858,8 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(part->bdev); - delete_partition(part); + invalidate_partition(part); + 
delete_partition(part->bd_part); } disk_part_iter_exit(&piter); @@ -977,7 +978,7 @@ void __init printk_all_partitions(void) while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; @@ -996,14 +997,14 @@ void __init printk_all_partitions(void) */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == disk->part0->bd_part; + bool is_part0 = part == disk->part0; printk("%s%s %10llu %s %s", is_part0 ? "" : " ", - bdevt_str(part_devt(part), devt_buf), - bdev_nr_sectors(part->bdev) >> 1, - disk_name(disk, part->bdev->bd_partno, name_buf), - part->bdev->bd_meta_info ? - part->bdev->bd_meta_info->uuid : ""); + bdevt_str(part->bd_dev, devt_buf), + bdev_nr_sectors(part) >> 1, + disk_name(disk, part->bd_partno, name_buf), + part->bd_meta_info ? + part->bd_meta_info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) printk(" driver: %s\n", @@ -1079,7 +1080,7 @@ static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ @@ -1093,9 +1094,9 @@ static int show_partition(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", - MAJOR(part_devt(part)), MINOR(part_devt(part)), - bdev_nr_sectors(part->bdev) >> 1, - disk_name(sgp, part->bdev->bd_partno, buf)); + MAJOR(part->bd_dev), MINOR(part->bd_dev), + bdev_nr_sectors(part) >> 1, + disk_name(sgp, part->bd_partno, buf)); disk_part_iter_exit(&piter); return 0; @@ -1489,7 +1490,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct disk_part_iter piter; - struct hd_struct *hd; + struct block_device *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight; struct disk_stats stat; @@ -1504,11 +1505,11 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - part_stat_read_all(hd, &stat); + part_stat_read_all(hd->bd_part, &stat); if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd->bdev); + inflight = blk_mq_in_flight(gp->queue, hd); else - inflight = part_in_flight(hd->bdev); + inflight = part_in_flight(hd); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " @@ -1517,8 +1518,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %lu %lu %u " "%lu %u" "\n", - MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->bdev->bd_partno, buf), + MAJOR(hd->bd_dev), MINOR(hd->bd_dev), + disk_name(gp, hd->bd_partno, buf), stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], @@ -1673,7 +1674,7 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) void set_disk_ro(struct gendisk *disk, int flag) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (disk->part0->bd_read_only != flag) { set_disk_ro_uevent(disk, flag); @@ -1682,7 +1683,7 @@ void set_disk_ro(struct gendisk *disk, int flag) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - part->bdev->bd_read_only = flag; + part->bd_read_only = flag; 
disk_part_iter_exit(&piter); } diff --git a/block/partitions/core.c b/block/partitions/core.c index 3d8243334c7c..4cb6df175f90 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -439,15 +439,14 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; bool overlap = false; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { - if (part->bdev->bd_partno == skip_partno || - start >= part->bdev->bd_start_sect + - bdev_nr_sectors(part->bdev) || - start + length <= part->bdev->bd_start_sect) + if (part->bd_partno == skip_partno || + start >= part->bd_start_sect + bdev_nr_sectors(part) || + start + length <= part->bd_start_sect) continue; overlap = true; break; @@ -568,7 +567,7 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) int blk_drop_partitions(struct block_device *bdev) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (bdev->bd_part_count) return -EBUSY; @@ -578,7 +577,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(part); + delete_partition(part->bd_part); disk_part_iter_exit(&piter); return 0; diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index db24e04ee978..1825fa8d05a7 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -432,7 +432,7 @@ dasd_state_ready_to_online(struct dasd_device * device) { struct gendisk *disk; struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; device->state = DASD_STATE_ONLINE; if (device->block) { @@ -445,7 +445,7 @@ dasd_state_ready_to_online(struct dasd_device * device) disk = device->block->bdev->bd_disk; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE); + kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); disk_part_iter_exit(&piter); } return 0; @@ -459,7 +459,7 @@ static int dasd_state_online_to_ready(struct dasd_device *device) int rc; struct gendisk *disk; struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (device->discipline->online_to_ready) { rc = device->discipline->online_to_ready(device); @@ -472,7 +472,7 @@ static int dasd_state_online_to_ready(struct dasd_device *device) disk = device->block->bdev->bd_disk; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE); + kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); disk_part_iter_exit(&piter); } return 0; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3c13d4708e3f..cd23c80265b2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -244,14 +244,14 @@ static inline void disk_put_part(struct hd_struct *part) struct disk_part_iter { struct gendisk *disk; - struct hd_struct *part; + struct block_device *part; int idx; unsigned int flags; }; extern void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags); -extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter); +struct block_device *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); extern bool 
disk_has_partitions(struct gendisk *disk); -- cgit From 0d02129e76edf91cf04fabf1efbc3a9a1f1d729a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Nov 2020 16:43:51 +0100 Subject: block: merge struct block_device and struct hd_struct Instead of having two structures that represent each block device with different life time rules, merge them into a single one. This also greatly simplifies the reference counting rules, as we can use the inode reference count as the main reference count for the new struct block_device, with the device model reference front ending it for device model interaction. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 ++-- block/blk.h | 2 +- block/genhd.c | 90 +++++++++++------------------------ block/partitions/core.c | 116 ++++++++++++++++++---------------------------- fs/block_dev.c | 9 ---- include/linux/blk_types.h | 8 +++- include/linux/blkdev.h | 1 - include/linux/genhd.h | 40 ++++------------ init/do_mounts.c | 21 ++++----- kernel/trace/blktrace.c | 43 ++++------------- 10 files changed, 108 insertions(+), 230 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 79aa96240cec..031114d454a6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -820,9 +820,9 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part = disk_get_part(disk, 0); - struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct block_device *bdev = dev_to_bdev(dev); + struct blkcg_gq *blkg = + blk_queue_root_blkg(bdev->bd_disk->queue); struct blkg_iostat tmp; int cpu; @@ -830,7 +830,7 @@ static void blkcg_fill_root_iostats(void) for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - cpu_dkstats = per_cpu_ptr(part->bdev->bd_stats, cpu); + cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += diff --git a/block/blk.h b/block/blk.h index 9657c6da7c77..98f0b1ae2641 100644 --- a/block/blk.h +++ b/block/blk.h @@ -356,7 +356,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct hd_struct *part); +void delete_partition(struct block_device *part); int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); diff --git a/block/genhd.c b/block/genhd.c index 2d34dd2da4e9..0fabfc90b8e4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -106,13 +106,14 @@ const char *bdevname(struct block_device *bdev, char *buf) } EXPORT_SYMBOL(bdevname); -static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +static void part_stat_read_all(struct block_device *part, + struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->bdev->bd_stats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -167,39 +168,6 @@ struct block_device *__disk_get_part(struct gendisk *disk, int partno) return rcu_dereference(ptbl->part[partno]); } -/** - * disk_get_part - get partition - * @disk: disk to look 
partition from - * @partno: partition number - * - * Look for partition @partno from @disk. If found, increment - * reference count and return it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Pointer to the found partition on success, NULL if not found. - */ -struct hd_struct *disk_get_part(struct gendisk *disk, int partno) -{ - struct block_device *bdev; - struct hd_struct *part; - - rcu_read_lock(); - bdev = __disk_get_part(disk, partno); - if (!bdev) - goto fail; - part = bdev->bd_part; - if (!kobject_get_unless_zero(&part_to_dev(part)->kobj)) - goto fail; - rcu_read_unlock(); - return part; -fail: - rcu_read_unlock(); - return NULL; -} - /** * disk_part_iter_init - initialize partition iterator * @piter: iterator to initialize @@ -859,7 +827,7 @@ void del_gendisk(struct gendisk *disk) DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(part); - delete_partition(part->bd_part); + delete_partition(part); } disk_part_iter_exit(&piter); @@ -952,13 +920,13 @@ void blk_request_module(dev_t devt) */ struct block_device *bdget_disk(struct gendisk *disk, int partno) { - struct hd_struct *part; struct block_device *bdev = NULL; - part = disk_get_part(disk, partno); - if (part) - bdev = bdget_part(part); - disk_put_part(part); + rcu_read_lock(); + bdev = __disk_get_part(disk, partno); + if (bdev && !bdgrab(bdev)) + bdev = NULL; + rcu_read_unlock(); return bdev; } @@ -1175,24 +1143,22 @@ static ssize_t disk_ro_show(struct device *dev, ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", bdev_nr_sectors(p->bdev)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; struct disk_stats stat; unsigned int inflight; - part_stat_read_all(p, &stat); + part_stat_read_all(bdev, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p->bdev); + inflight = blk_mq_in_flight(q, bdev); else - inflight = part_in_flight(p->bdev); + inflight = part_in_flight(bdev); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1227,14 +1193,14 @@ ssize_t part_stat_show(struct device *dev, ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, p->bdev, inflight); + blk_mq_in_flight_rw(q, bdev, inflight); else - part_in_flight_rw(p->bdev, inflight); + part_in_flight_rw(bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1282,20 +1248,17 @@ static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->bdev->bd_make_it_fail); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct hd_struct *p = dev_to_part(dev); int i; if (count > 
0 && sscanf(buf, "%d", &i) > 0) - p->bdev->bd_make_it_fail = i; + dev_to_bdev(dev)->bd_make_it_fail = i; return count; } @@ -1505,7 +1468,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - part_stat_read_all(hd->bd_part, &stat); + part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else @@ -1577,7 +1540,7 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part; + struct block_device *part; if (strcmp(dev_name(dev), name)) continue; @@ -1590,13 +1553,12 @@ dev_t blk_lookup_devt(const char *name, int partno) MINOR(dev->devt) + partno); break; } - part = disk_get_part(disk, partno); + part = bdget_disk(disk, partno); if (part) { - devt = part_devt(part); - disk_put_part(part); + devt = part->bd_dev; + bdput(part); break; } - disk_put_part(part); } class_dev_iter_exit(&iter); return devt; diff --git a/block/partitions/core.c b/block/partitions/core.c index 4cb6df175f90..deca253583bd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -182,44 +182,39 @@ static struct parsed_partitions *check_partition(struct gendisk *hd, static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->bdev->bd_partno); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", p->bdev->bd_start_sect); + return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->bdev->bd_read_only); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_read_only); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->bdev->bd_start_sect)); + queue_limit_alignment_offset(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->bdev->bd_start_sect)); + queue_limit_discard_alignment(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -264,20 +259,17 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - struct hd_struct *p = dev_to_part(dev); - blk_free_devt(dev->devt); - bdput(p->bdev); + bdput(dev_to_bdev(dev)); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) { - struct hd_struct *part = dev_to_part(dev); + struct block_device *part = dev_to_bdev(dev); - add_uevent_var(env, "PARTN=%u", part->bdev->bd_partno); - if (part->bdev->bd_meta_info && 
part->bdev->bd_meta_info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", - part->bdev->bd_meta_info->volname); + add_uevent_var(env, "PARTN=%u", part->bd_partno); + if (part->bd_meta_info && part->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); return 0; } @@ -292,25 +284,25 @@ struct device_type part_type = { * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. */ -void delete_partition(struct hd_struct *part) +void delete_partition(struct block_device *part) { - struct gendisk *disk = part_to_disk(part); + struct gendisk *disk = part->bd_disk; struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[part->bdev->bd_partno], NULL); + rcu_assign_pointer(ptbl->part[part->bd_partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); - kobject_put(part->bdev->bd_holder_dir); - device_del(part_to_dev(part)); + kobject_put(part->bd_holder_dir); + device_del(&part->bd_device); /* * Remove the block device from the inode hash, so that it cannot be * looked up any more even when openers still hold references. */ - remove_inode_hash(part->bdev->bd_inode); + remove_inode_hash(part->bd_inode); - put_device(part_to_dev(part)); + put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, @@ -324,11 +316,10 @@ static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. */ -static struct hd_struct *add_partition(struct gendisk *disk, int partno, +static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { - struct hd_struct *p; dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; @@ -367,9 +358,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!bdev) return ERR_PTR(-ENOMEM); - p = bdev->bd_part; - pdev = part_to_dev(p); - bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); bdev->bd_read_only = get_disk_ro(disk); @@ -381,6 +369,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_bdput; } + pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); @@ -422,7 +411,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - return p; + return bdev; out_bdput: bdput(bdev); @@ -459,7 +448,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct hd_struct *part; + struct block_device *part; mutex_lock(&bdev->bd_mutex); if (partition_overlaps(bdev->bd_disk, start, length, -1)) { @@ -475,76 +464,59 @@ int bdev_add_partition(struct block_device *bdev, int partno, int bdev_del_partition(struct block_device *bdev, int partno) { - struct block_device *bdevp; - struct hd_struct *part = NULL; + struct block_device *part; int ret; - bdevp = bdget_disk(bdev->bd_disk, partno); - if (!bdevp) + part = bdget_disk(bdev->bd_disk, partno); + if (!part) return -ENXIO; - mutex_lock(&bdevp->bd_mutex); + mutex_lock(&part->bd_mutex); mutex_lock_nested(&bdev->bd_mutex, 1); - ret = -ENXIO; - part = 
disk_get_part(bdev->bd_disk, partno); - if (!part) - goto out_unlock; - ret = -EBUSY; - if (bdevp->bd_openers) + if (part->bd_openers) goto out_unlock; - sync_blockdev(bdevp); - invalidate_bdev(bdevp); + sync_blockdev(part); + invalidate_bdev(part); delete_partition(part); ret = 0; out_unlock: mutex_unlock(&bdev->bd_mutex); - mutex_unlock(&bdevp->bd_mutex); - bdput(bdevp); - if (part) - disk_put_part(part); + mutex_unlock(&part->bd_mutex); + bdput(part); return ret; } int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct block_device *bdevp; - struct hd_struct *part; + struct block_device *part; int ret = 0; - part = disk_get_part(bdev->bd_disk, partno); + part = bdget_disk(bdev->bd_disk, partno); if (!part) return -ENXIO; - ret = -ENOMEM; - bdevp = bdget_part(part); - if (!bdevp) - goto out_put_part; - - mutex_lock(&bdevp->bd_mutex); + mutex_lock(&part->bd_mutex); mutex_lock_nested(&bdev->bd_mutex, 1); - ret = -EINVAL; - if (start != part->bdev->bd_start_sect) + if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - bdev_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&part->bd_mutex); mutex_unlock(&bdev->bd_mutex); - bdput(bdevp); -out_put_part: - disk_put_part(part); + bdput(part); return ret; } @@ -577,7 +549,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(part->bd_part); + delete_partition(part); disk_part_iter_exit(&piter); return 0; @@ -592,7 +564,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; - struct hd_struct *part; + struct block_device *part; if (!size) return true; @@ -632,7 +604,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) - md_autodetect_dev(part_to_dev(part)->devt); + md_autodetect_dev(part->bd_dev); return true; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 61cf33b6284f..a9905d8fd02b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -39,7 +39,6 @@ struct bdev_inode { struct block_device bdev; - struct hd_struct hd; struct inode vfs_inode; }; @@ -887,9 +886,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) iput(inode); return NULL; } - bdev->bd_part = &BDEV_I(inode)->hd; - memset(bdev->bd_part, 0, sizeof(*bdev->bd_part)); - bdev->bd_part->bdev = bdev; return bdev; } @@ -926,11 +922,6 @@ struct block_device *bdgrab(struct block_device *bdev) } EXPORT_SYMBOL(bdgrab); -struct block_device *bdget_part(struct hd_struct *part) -{ - return bdget(part_devt(part)); -} - long nr_blockdev_pages(void) { struct inode *inode; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6edea5c16259..866f74261b3b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -8,6 +8,7 @@ #include #include +#include #include struct bio_set; @@ -30,6 +31,7 @@ struct block_device { struct super_block * bd_super; struct mutex bd_mutex; /* open/close mutex */ void * bd_claiming; + struct device bd_device; void * bd_holder; int bd_holders; bool bd_write_holder; @@ -38,7 +40,6 @@ struct block_device { #endif struct kobject *bd_holder_dir; u8 
bd_partno; - struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ unsigned bd_part_count; @@ -61,8 +62,11 @@ struct block_device { #define bdev_whole(_bdev) \ ((_bdev)->bd_disk->part0) +#define dev_to_bdev(device) \ + container_of((device), struct block_device, bd_device) + #define bdev_kobj(_bdev) \ - (&part_to_dev((_bdev)->bd_part)->kobj) + (&((_bdev)->bd_device.kobj)) /* * Block error status values. See block/blk-core:blk_errors for the details. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1d4be1fc6007..17cedf0dc83d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1999,7 +1999,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -struct block_device *bdget_part(struct hd_struct *part); struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index cd23c80265b2..809aaa32d53c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -19,12 +19,6 @@ #include #include -#define dev_to_part(device) container_of((device), struct hd_struct, __dev) -#define part_to_dev(part) (&((part)->__dev)) - -#define dev_to_disk(device) (dev_to_part(device)->bdev->bd_disk) -#define disk_to_dev(disk) (part_to_dev((disk)->part0->bd_part)) - extern const struct device_type disk_type; extern struct device_type part_type; extern struct class block_class; @@ -51,11 +45,6 @@ struct partition_meta_info { u8 volname[PARTITION_META_INFO_VOLNAMELTH]; }; -struct hd_struct { - struct block_device *bdev; - struct device __dev; -}; - /** * DOC: genhd capability flags * @@ -190,19 +179,21 @@ struct gendisk { struct lockdep_map lockdep_map; }; +/* + * The gendisk is refcounted by the part0 block_device, and the bd_device + * therein is also used for device model presentation in sysfs. + */ +#define dev_to_disk(device) \ + (dev_to_bdev(device)->bd_disk) +#define disk_to_dev(disk) \ + (&((disk)->part0->bd_device)) + #if IS_REACHABLE(CONFIG_CDROM) #define disk_to_cdi(disk) ((disk)->cdi) #else #define disk_to_cdi(disk) NULL #endif -static inline struct gendisk *part_to_disk(struct hd_struct *part) -{ - if (unlikely(!part)) - return NULL; - return part->bdev->bd_disk; -} - static inline int disk_max_parts(struct gendisk *disk) { if (disk->flags & GENHD_FL_EXT_DEVT) @@ -221,19 +212,6 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } -static inline dev_t part_devt(struct hd_struct *part) -{ - return part_to_dev(part)->devt; -} - -extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); - -static inline void disk_put_part(struct hd_struct *part) -{ - if (likely(part)) - put_device(part_to_dev(part)); -} - /* * Smarter partition iterator without context limits. 
*/ diff --git a/init/do_mounts.c b/init/do_mounts.c index 86bef93e72eb..a78e44ee6adb 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -76,11 +76,11 @@ struct uuidcmp { */ static int match_dev_by_uuid(struct device *dev, const void *data) { + struct block_device *bdev = dev_to_bdev(dev); const struct uuidcmp *cmp = data; - struct hd_struct *part = dev_to_part(dev); - if (!part->bdev->bd_meta_info || - strncasecmp(cmp->uuid, part->bdev->bd_meta_info->uuid, cmp->len)) + if (!bdev->bd_meta_info || + strncasecmp(cmp->uuid, bdev->bd_meta_info->uuid, cmp->len)) return 0; return 1; } @@ -133,13 +133,13 @@ static dev_t devt_from_partuuid(const char *uuid_str) * Attempt to find the requested partition by adding an offset * to the partition number found by UUID. */ - struct hd_struct *part; + struct block_device *part; - part = disk_get_part(dev_to_disk(dev), - dev_to_part(dev)->bdev->bd_partno + offset); + part = bdget_disk(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); if (part) { - devt = part_devt(part); - put_device(part_to_dev(part)); + devt = part->bd_dev; + bdput(part); } } else { devt = dev->devt; @@ -166,11 +166,10 @@ clear_root_wait: */ static int match_dev_by_label(struct device *dev, const void *data) { + struct block_device *bdev = dev_to_bdev(dev); const char *label = data; - struct hd_struct *part = dev_to_part(dev); - if (!part->bdev->bd_meta_info || - strcmp(label, part->bdev->bd_meta_info->volname)) + if (!bdev->bd_meta_info || strcmp(label, bdev->bd_meta_info->volname)) return 0; return 1; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8a723a91ec5a..a482a37848bf 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1810,30 +1810,15 @@ static ssize_t blk_trace_mask2str(char *buf, int mask) return p - buf; } -static struct request_queue *blk_trace_get_queue(struct block_device *bdev) -{ - if (bdev->bd_disk == NULL) - return NULL; - - return bdev_get_queue(bdev); -} - static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct block_device *bdev = bdget_part(dev_to_part(dev)); - struct request_queue *q; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; ssize_t ret = -ENXIO; - if (bdev == NULL) - goto out; - - q = blk_trace_get_queue(bdev); - if (q == NULL) - goto out_bdput; - mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, @@ -1856,9 +1841,6 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); -out_bdput: - bdput(bdev); -out: return ret; } @@ -1866,8 +1848,8 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct block_device *bdev; - struct request_queue *q; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1883,17 +1865,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; value = ret; } - } else if (kstrtoull(buf, 0, &value)) - goto out; - - ret = -ENXIO; - bdev = bdget_part(dev_to_part(dev)); - if (bdev == NULL) - goto out; - - q = blk_trace_get_queue(bdev); - if (q == NULL) - goto out_bdput; + } else { + if (kstrtoull(buf, 0, &value)) + goto out; + } mutex_lock(&q->debugfs_mutex); @@ -1931,8 +1906,6 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, out_unlock_bdev: 
mutex_unlock(&q->debugfs_mutex); -out_bdput: - bdput(bdev); out: return ret ? ret : count; } -- cgit
From c214550ff8eae9623605149fa4e3da3ba43df145 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Nov 2020 22:05:50 +0200 Subject: net: delete __dev_getfirstbyhwtype
The last user of the RTNL brother of dev_getfirstbyhwtype (the latter being synchronized under RCU) has been deleted in commit b4db2b35fc44 ("afs: Use core kernel UUID generation").
Cc: Arnd Bergmann Cc: David Howells Cc: Eric Dumazet Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20201129200550.2433401-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - net/core/dev.c | 13 ------------- 2 files changed, 14 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8eeb73ac58bd..2e077e289e75 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2815,7 +2815,6 @@ unsigned long netdev_boot_base(const char *prefix, int unit); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type); void dev_add_pack(struct packet_type *pt); void dev_remove_pack(struct packet_type *pt); void __dev_remove_pack(struct packet_type *pt);
diff --git a/net/core/dev.c b/net/core/dev.c index 51b263076124..b5130fd1cdaa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1069,19 +1069,6 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) -{ - struct net_device *dev; - - ASSERT_RTNL(); - for_each_netdev(net, dev) - if (dev->type == type) - return dev; - - return NULL; -} -EXPORT_SYMBOL(__dev_getfirstbyhwtype); - struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; -- cgit
From 1446e1df9eb183fdf81c3f0715402f1d7595d4cb Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:34 -0500 Subject: kernel: Implement selective syscall userspace redirection
Introduce a mechanism to quickly disable/enable syscall handling for a specific process and redirect to userspace via SIGSYS. This is useful for processes with parts that require syscall redirection and parts that don't, but that need to perform this boundary crossing really fast, without paying the cost of a system call to reconfigure syscall handling on each boundary transition. This is particularly important for Windows games running over Wine.
The proposed interface looks like this:
prctl(PR_SET_SYSCALL_USER_DISPATCH, <op>, <offset>, <length>, [selector])
The range [<offset>, <offset>+<length>) is a part of the process memory map that is allowed to by-pass the redirection code and dispatch syscalls directly, such that in fast paths a process doesn't need to disable the trap nor does the kernel have to check the selector. This is essential to return from SIGSYS to a blocked area without triggering another SIGSYS from rt_sigreturn.
selector is an optional pointer to a char-sized userspace memory region that has a key switch for the mechanism. This key switch is set to either PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF to enable and disable the redirection without calling the kernel.
The feature is meant to be set per-thread and it is disabled on fork/clone/execv.
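A minimal userspace sketch of this interface may help picture the flow (hedged: the PR_* constants mirror the uapi hunk below; the empty dispatcher region, the handler body and the missing error handling are illustrative shortcuts, not part of this patch):

#include <signal.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef PR_SET_SYSCALL_USER_DISPATCH
#define PR_SET_SYSCALL_USER_DISPATCH	59
#define PR_SYS_DISPATCH_OFF		0
#define PR_SYS_DISPATCH_ON		1
#endif

static volatile char selector = PR_SYS_DISPATCH_OFF;

static void sigsys_handler(int sig, siginfo_t *info, void *ucontext)
{
	/*
	 * Emulate the trapped syscall here. Flip the key switch off
	 * before returning so that rt_sigreturn and the code we go
	 * back to can issue native syscalls again.
	 */
	selector = PR_SYS_DISPATCH_OFF;
}

int main(void)
{
	struct sigaction act = { .sa_flags = SA_SIGINFO };

	act.sa_sigaction = sigsys_handler;
	sigaction(SIGSYS, &act, NULL);

	/*
	 * No dispatcher region in this toy (offset and length are 0),
	 * so the selector byte alone decides whether a syscall traps.
	 */
	prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON,
	      0, 0, &selector);

	selector = PR_SYS_DISPATCH_ON;	/* redirect from here on */
	syscall(SYS_getpid);		/* delivered as SIGSYS */
	selector = PR_SYS_DISPATCH_OFF;	/* back to native syscalls */
	return 0;
}

Note that the handler turns the switch off before returning: as the text above explains, leaving it on would let rt_sigreturn itself trigger another SIGSYS unless it runs from the dispatcher region.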
Internally, this doesn't add overhead to the syscall hot path, and it requires very little per-architecture support. I avoided using seccomp, even though it duplicates some functionality, due to previous feedback that maybe it shouldn't mix with seccomp since it is not a security mechanism. And obviously, this should never be considered a security mechanism, since any part of the program can by-pass it by using the syscall dispatcher. For the sysinfo benchmark, which measures the overhead added to executing a native syscall that doesn't require interception, the overhead using only the direct dispatcher region to issue syscalls is pretty much irrelevant. The overhead of using the selector goes around 40ns for a native (unredirected) syscall in my system, and it is (as expected) dominated by the supervisor-mode user-address access. In fact, with SMAP off, the overhead is consistently less than 5ns on my test box. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Acked-by: Kees Cook Link: https://lore.kernel.org/r/20201127193238.821364-4-krisman@collabora.com --- fs/exec.c | 3 + include/linux/sched.h | 2 + include/linux/syscall_user_dispatch.h | 40 +++++++++++++ include/linux/thread_info.h | 2 + include/uapi/linux/prctl.h | 5 ++ kernel/entry/Makefile | 2 +- kernel/entry/common.h | 7 +++ kernel/entry/syscall_user_dispatch.c | 104 ++++++++++++++++++++++++++++++++++ kernel/fork.c | 1 + kernel/sys.c | 5 ++ 10 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 include/linux/syscall_user_dispatch.h create mode 100644 kernel/entry/common.h create mode 100644 kernel/entry/syscall_user_dispatch.c (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 547a2390baf5..aee36e5733ce 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -1302,6 +1303,8 @@ int begin_new_exec(struct linux_binprm * bprm) flush_thread(); me->personality &= ~bprm->per_clear; + clear_syscall_work_syscall_user_dispatch(me); + /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..5a24a033b3f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -965,6 +966,7 @@ struct task_struct { unsigned int sessionid; #endif struct seccomp seccomp; + struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h new file mode 100644 index 000000000000..a0ae443fb7df --- /dev/null +++ b/include/linux/syscall_user_dispatch.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Collabora Ltd. 
+ */ +#ifndef _SYSCALL_USER_DISPATCH_H +#define _SYSCALL_USER_DISPATCH_H + +#include + +#ifdef CONFIG_GENERIC_ENTRY + +struct syscall_user_dispatch { + char __user *selector; + unsigned long offset; + unsigned long len; + bool on_dispatch; +}; + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector); + +#define clear_syscall_work_syscall_user_dispatch(tsk) \ + clear_task_syscall_work(tsk, SYSCALL_USER_DISPATCH) + +#else +struct syscall_user_dispatch {}; + +static inline int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + return -EINVAL; +} + +static inline void clear_syscall_work_syscall_user_dispatch(struct task_struct *tsk) +{ +} + +#endif /* CONFIG_GENERIC_ENTRY */ + +#endif /* _SYSCALL_USER_DISPATCH_H */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ca80a214df09..c8a974cead73 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -42,6 +42,7 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, SYSCALL_WORK_BIT_SYSCALL_AUDIT, + SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) @@ -49,6 +50,7 @@ enum syscall_work_bit { #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) +#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) #endif #include diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 7f0827705c9a..90deb41c8a34 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -247,4 +247,9 @@ struct prctl_mm_map { #define PR_SET_IO_FLUSHER 57 #define PR_GET_IO_FLUSHER 58 +/* Dispatch syscalls to a userspace handler */ +#define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 34c8a3f1c735..095c775e001e 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -9,5 +9,5 @@ KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/common.h b/kernel/entry/common.h new file mode 100644 index 000000000000..f6e6d02f07fe --- /dev/null +++ b/kernel/entry/common.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _COMMON_H +#define _COMMON_H + +bool syscall_user_dispatch(struct pt_regs *regs); + +#endif diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c new file mode 100644 index 000000000000..b0338a5625d9 --- /dev/null +++ b/kernel/entry/syscall_user_dispatch.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Collabora Ltd. 
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "common.h" + +static void trigger_sigsys(struct pt_regs *regs) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSYS; + info.si_code = SYS_USER_DISPATCH; + info.si_call_addr = (void __user *)KSTK_EIP(current); + info.si_errno = 0; + info.si_arch = syscall_get_arch(current); + info.si_syscall = syscall_get_nr(current, regs); + + force_sig_info(&info); +} + +bool syscall_user_dispatch(struct pt_regs *regs) +{ + struct syscall_user_dispatch *sd = ¤t->syscall_dispatch; + char state; + + if (likely(instruction_pointer(regs) - sd->offset < sd->len)) + return false; + + if (unlikely(arch_syscall_is_vdso_sigreturn(regs))) + return false; + + if (likely(sd->selector)) { + /* + * access_ok() is performed once, at prctl time, when + * the selector is loaded by userspace. + */ + if (unlikely(__get_user(state, sd->selector))) + do_exit(SIGSEGV); + + if (likely(state == PR_SYS_DISPATCH_OFF)) + return false; + + if (state != PR_SYS_DISPATCH_ON) + do_exit(SIGSYS); + } + + sd->on_dispatch = true; + syscall_rollback(current, regs); + trigger_sigsys(regs); + + return true; +} + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + switch (mode) { + case PR_SYS_DISPATCH_OFF: + if (offset || len || selector) + return -EINVAL; + break; + case PR_SYS_DISPATCH_ON: + /* + * Validate the direct dispatcher region just for basic + * sanity against overflow and a 0-sized dispatcher + * region. If the user is able to submit a syscall from + * an address, that address is obviously valid. + */ + if (offset && offset + len <= offset) + return -EINVAL; + + if (selector && !access_ok(selector, sizeof(*selector))) + return -EFAULT; + + break; + default: + return -EINVAL; + } + + current->syscall_dispatch.selector = selector; + current->syscall_dispatch.offset = offset; + current->syscall_dispatch.len = len; + current->syscall_dispatch.on_dispatch = false; + + if (mode == PR_SYS_DISPATCH_ON) + set_syscall_work(SYSCALL_USER_DISPATCH); + else + clear_syscall_work(SYSCALL_USER_DISPATCH); + + return 0; +} diff --git a/kernel/fork.c b/kernel/fork.c index 02b689a23457..4a5ecb41f440 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -906,6 +906,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); + clear_syscall_work_syscall_user_dispatch(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); diff --git a/kernel/sys.c b/kernel/sys.c index a730c03ee607..51f00fe20e4d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -2530,6 +2531,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; + case PR_SET_SYSCALL_USER_DISPATCH: + error = set_syscall_user_dispatch(arg2, arg3, arg4, + (char __user *) arg5); + break; default: error = -EINVAL; break; -- cgit From 11894468e39def270199f845b76df6c36d4ed133 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:35 -0500 Subject: entry: Support Syscall User Dispatch on common syscall entry Syscall User Dispatch (SUD) must take precedence over seccomp and ptrace, since the use case is emulation (it can be invoked with a different ABI) such that seccomp filtering by 
syscall number doesn't make sense in the first place. In addition, either the syscall is dispatched back to userspace, in which case there is no resource to trace, or the syscall will be executed, and seccomp/ptrace will execute next.
Since SUD runs before tracepoints, it needs to be a SYSCALL_WORK_EXIT as well, just to prevent a trace exit event when dispatch was triggered. For that, the exit path examines the on_dispatch flag to skip the tracepoint, audit and other work.
[ tglx: Add a comment on the exit side ]
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Acked-by: Kees Cook Link: https://lore.kernel.org/r/20201127193238.821364-5-krisman@collabora.com --- include/linux/entry-common.h | 2 ++ kernel/entry/common.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux')
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 49b26b216e4e..a6e98b4ba8e9 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -44,10 +44,12 @@ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_EMU | \ SYSCALL_WORK_SYSCALL_AUDIT | \ + SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_ENTER) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ + SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_EXIT) /*
diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 91e8fd50adf4..e661e70ffcf3 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -5,6 +5,8 @@ #include #include +#include "common.h" + #define CREATE_TRACE_POINTS #include @@ -46,6 +48,16 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, { long ret = 0; + /* + * Handle Syscall User Dispatch. This must come first, since + * the ABI here can be something that doesn't make sense for + * other syscall_work features. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (syscall_user_dispatch(regs)) + return -1L; + } + /* Handle ptrace */ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { ret = arch_syscall_enter_tracehook(regs); @@ -230,6 +242,19 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work) { bool step; + /* + * If the syscall was rolled back due to syscall user dispatching, + * then the tracers below are not invoked for the same reason as + * the entry side was not invoked in syscall_trace_enter(): The ABI + * of these syscalls is unknown. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (unlikely(current->syscall_dispatch.on_dispatch)) { + current->syscall_dispatch.on_dispatch = false; + return; + } + } + audit_syscall_exit(regs); if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) -- cgit
From 96e2fbccd0fc806364a964fdf072bfc858a66109 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:53 +0100 Subject: entry: Add enter_from_user_mode() wrapper
To be called from architecture specific code if the combo interfaces are not suitable. It simply calls __enter_from_user_mode(). This way __enter_from_user_mode() will still be inlined because it is declared static __always_inline.
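A hedged sketch of that pattern in plain C (the attribute macros are local stand-ins for the kernel's, and the function names are made up for illustration): the body stays static __always_inline for the call sites inside the unit, while the wrapper gives architecture code an out-of-line symbol to call.

#define noinstr __attribute__((no_instrument_function))
#define __always_inline inline __attribute__((always_inline))

/* The shared body: inlined into every caller in this unit. */
static __always_inline void __do_entry_work(void)
{
	/* lockdep, context tracking, tracing ... */
}

/* Thin out-of-line wrapper for callers that need a real symbol. */
void noinstr do_entry_work(void)
{
	__do_entry_work();
}

int main(void)
{
	do_entry_work();
	return 0;
}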
[ tglx: Amend comments and move it to a different location in the header ] Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-4-svens@linux.ibm.com --- include/linux/entry-common.h | 24 +++++++++++++++++++++++- kernel/entry/common.c | 16 ++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index a6e98b4ba8e9..da60980a2e7b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -101,6 +101,27 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs } #endif +/** + * enter_from_user_mode - Establish state when coming from user mode + * + * Syscall/interrupt entry disables interrupts, but user mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + * + * Invoked from architecture specific syscall entry code with interrupts + * disabled. The calling code has to be non-instrumentable. When the + * function returns all state is correct and interrupts are still + * disabled. The subsequent functions can be instrumented. + * + * This is invoked when there is architecture specific functionality to be + * done between establishing state and enabling interrupts. The caller must + * enable interrupts before invoking syscall_enter_from_user_mode_work(). + */ +void enter_from_user_mode(struct pt_regs *regs); + /** * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts * @regs: Pointer to currents pt_regs @@ -110,7 +131,8 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs * function returns all state is correct, interrupts are enabled and the * subsequent functions can be instrumented. * - * This handles lockdep, RCU (context tracking) and tracing state. + * This handles lockdep, RCU (context tracking) and tracing state, i.e. + * the functionality provided by enter_from_user_mode(). * * This is invoked when there is extra architecture specific functionality * to be done between establishing state and handling user mode entry work. diff --git a/kernel/entry/common.c b/kernel/entry/common.c index dff07b4ce6ec..17b1e032afe7 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -10,16 +10,7 @@ #define CREATE_TRACE_POINTS #include -/** - * __enter_from_user_mode - Establish state when coming from user mode - * - * Syscall/interrupt entry disables interrupts, but user mode is traced as - * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. 
- * - * 1) Tell lockdep that interrupts are disabled - * 2) Invoke context tracking if enabled to reactivate RCU - * 3) Trace interrupts off state - */ +/* See comment for enter_from_user_mode() in entry-common.h */ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { arch_check_user_regs(regs); @@ -33,6 +24,11 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) instrumentation_end(); } +void noinstr enter_from_user_mode(struct pt_regs *regs) +{ + __enter_from_user_mode(regs); +} + static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) { if (unlikely(audit_context())) { -- cgit From 310de1a678b2184c078c593dae343cb79c807f8d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:54 +0100 Subject: entry: Add exit_to_user_mode() wrapper Called from architecture specific code when syscall_exit_to_user_mode() is not suitable. It simply calls __exit_to_user_mode(). This way __exit_to_user_mode() can still be inlined because it is declared static __always_inline. [ tglx: Amended comments and moved it to a different place in the header ] Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-5-svens@linux.ibm.com --- include/linux/entry-common.h | 23 +++++++++++++++++++++-- kernel/entry/common.c | 18 ++++++------------ 2 files changed, 27 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index da60980a2e7b..e370be8121aa 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -300,6 +300,25 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) } #endif +/** + * exit_to_user_mode - Fixup state when exiting to user mode + * + * Syscall/interrupt exit enables interrupts, but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Invoke architecture specific last minute exit code, e.g. speculation + * mitigations, etc.: arch_exit_to_user_mode() + * 4) Tell lockdep that interrupts are enabled + * + * Invoked from architecture specific code when syscall_exit_to_user_mode() + * is not suitable as the last step before returning to userspace. Must be + * invoked with interrupts disabled and the caller must be + * non-instrumentable. + */ +void exit_to_user_mode(void); + /** * syscall_exit_to_user_mode - Handle work before returning to user mode * @regs: Pointer to currents pt_regs @@ -322,8 +341,8 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) * - Architecture specific one time work arch_exit_to_user_mode_prepare() * - Address limit and lockdep checks * - * 3) Final transition (lockdep, tracing, context tracking, RCU). Invokes - * arch_exit_to_user_mode() to handle e.g. speculation mitigations + * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the + * functionality in exit_to_user_mode(). 
*/ void syscall_exit_to_user_mode(struct pt_regs *regs); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 17b1e032afe7..48d30ce2e00e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -117,18 +117,7 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) instrumentation_end(); } -/** - * __exit_to_user_mode - Fixup state when exiting to user mode - * - * Syscall/interupt exit enables interrupts, but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about it. - * - * 1) Trace interrupts on state - * 2) Invoke context tracking if enabled to adjust RCU state - * 3) Invoke architecture specific last minute exit code, e.g. speculation - * mitigations, etc. - * 4) Tell lockdep that interrupts are enabled - */ +/* See comment for exit_to_user_mode() in entry-common.h */ static __always_inline void __exit_to_user_mode(void) { instrumentation_begin(); @@ -141,6 +130,11 @@ static __always_inline void __exit_to_user_mode(void) lockdep_hardirqs_on(CALLER_ADDR0); } +void noinstr exit_to_user_mode(void) +{ + __exit_to_user_mode(); +} + /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } -- cgit From c6156e1da633f241e132eaea3b676d674376d770 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:55 +0100 Subject: entry: Add syscall_exit_to_user_mode_work() This is the same as syscall_exit_to_user_mode() but without calling exit_to_user_mode(). This can be used if there is an architectural reason to avoid the combo function, e.g. restarting a syscall without returning to userspace. Before returning to user space the caller has to invoke exit_to_user_mode(). [ tglx: Amended comments ] Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-6-svens@linux.ibm.com --- include/linux/entry-common.h | 20 ++++++++++++++++++++ kernel/entry/common.c | 14 ++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index e370be8121aa..7c581a4c3797 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -316,9 +316,25 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) * is not suitable as the last step before returning to userspace. Must be * invoked with interrupts disabled and the caller must be * non-instrumentable. + * The caller has to invoke syscall_exit_to_user_mode_work() before this. */ void exit_to_user_mode(void); +/** + * syscall_exit_to_user_mode_work - Handle work before returning to user mode + * @regs: Pointer to currents pt_regs + * + * Same as step 1 and 2 of syscall_exit_to_user_mode() but without calling + * exit_to_user_mode() to perform the final transition to user mode. + * + * Calling convention is the same as for syscall_exit_to_user_mode() and it + * returns with all work handled and interrupts disabled. The caller must + * invoke exit_to_user_mode() before actually switching to user mode to + * make the final state transitions. Interrupts must stay disabled between + * return from this function and the invocation of exit_to_user_mode(). 
+ */ +void syscall_exit_to_user_mode_work(struct pt_regs *regs); + /** * syscall_exit_to_user_mode - Handle work before returning to user mode * @regs: Pointer to currents pt_regs @@ -343,6 +359,10 @@ void exit_to_user_mode(void); * * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the * functionality in exit_to_user_mode(). + * + * This is a combination of syscall_exit_to_user_mode_work() (1,2) and + * exit_to_user_mode(). This function is preferred unless there is a + * compelling architectural reason to use the separate functions. */ void syscall_exit_to_user_mode(struct pt_regs *regs);
diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 48d30ce2e00e..d6b73937dab3 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -282,12 +282,22 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) syscall_exit_work(regs, work); } -__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs) { - instrumentation_begin(); syscall_exit_to_user_mode_prepare(regs); local_irq_disable_exit_to_user(); exit_to_user_mode_prepare(regs); +} + +void syscall_exit_to_user_mode_work(struct pt_regs *regs) +{ + __syscall_exit_to_user_mode_work(regs); +} + +__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + __syscall_exit_to_user_mode_work(regs); instrumentation_end(); __exit_to_user_mode(); } -- cgit
From 6b6667aa4d1e0866f00b62d35a9be3875c7551f8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Nov 2020 17:58:12 +0000 Subject: block: optimise for_each_bvec() advance
Because of how for_each_bvec() works it never advances across multiple entries at a time, so bvec_iter_advance() is overkill. Add a specialised bvec_iter_advance_single() that is faster. It also handles zero-len bvecs, so we can kill bvec_iter_skip_zero_bvec().
          text    data     bss     dec     hex filename
before:  23977     805       0   24782    60ce lib/iov_iter.o
before, bvec_iter_advance() w/o WARN_ONCE():
         22886     600       0   23486    5bbe ./lib/iov_iter.o
after:   21862     600       0   22462    57be lib/iov_iter.o
Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bvec.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 2efec10bf792..ff832e698efb 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -121,18 +121,28 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, return true; } -static inline void bvec_iter_skip_zero_bvec(struct bvec_iter *iter) +/* + * A simpler version of bvec_iter_advance(), @bytes should not span + * across multiple bvec entries, i.e. bytes <= bv[i->bi_idx].bv_len + */ +static inline void bvec_iter_advance_single(const struct bio_vec *bv, + struct bvec_iter *iter, unsigned int bytes) { - iter->bi_bvec_done = 0; - iter->bi_idx++; + unsigned int done = iter->bi_bvec_done + bytes; + + if (done == bv[iter->bi_idx].bv_len) { + done = 0; + iter->bi_idx++; + } + iter->bi_bvec_done = done; + iter->bi_size -= bytes; } #define for_each_bvec(bvl, bio_vec, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ (bvl).bv_len ?
(void)bvec_iter_advance((bio_vec), &(iter), \ - (bvl).bv_len) : bvec_iter_skip_zero_bvec(&(iter))) + bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) /* for iterating one bio from start to end */ #define BVEC_ITER_ALL_INIT (struct bvec_iter) \ -- cgit From 22b56c2964386ddced252be407150b22f85e209e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Nov 2020 17:58:13 +0000 Subject: bio: optimise bvec iteration __bio_for_each_bvec(), __bio_for_each_segment() and bio_copy_data_iter() fall under conditions of bvec_iter_advance_single(), which is a faster and slimmer version of bvec_iter_advance(). Add bio_advance_iter_single() and convert them. Signed-off-by: Pavel Begunkov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- include/linux/bio.h | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index ebb18136b86f..1f2cc1fbe283 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1212,8 +1212,8 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, flush_dcache_page(dst_bv.bv_page); - bio_advance_iter(src, src_iter, bytes); - bio_advance_iter(dst, dst_iter, bytes); + bio_advance_iter_single(src, src_iter, bytes); + bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); diff --git a/include/linux/bio.h b/include/linux/bio.h index ecf67108f091..1edda614f7ce 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -148,11 +148,24 @@ static inline void bio_advance_iter(const struct bio *bio, /* TODO: It is reasonable to complete bio with error here. */ } +/* @bytes should be less or equal to bvec[i->bi_idx].bv_len */ +static inline void bio_advance_iter_single(const struct bio *bio, + struct bvec_iter *iter, + unsigned int bytes) +{ + iter->bi_sector += bytes >> 9; + + if (bio_no_advance_iter(bio)) + iter->bi_size -= bytes; + else + bvec_iter_advance_single(bio->bi_io_vec, iter, bytes); +} + #define __bio_for_each_segment(bvl, bio, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bio_iter_iovec((bio), (iter))), 1); \ - bio_advance_iter((bio), &(iter), (bvl).bv_len)) + bio_advance_iter_single((bio), &(iter), (bvl).bv_len)) #define bio_for_each_segment(bvl, bio, iter) \ __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter) @@ -161,7 +174,7 @@ static inline void bio_advance_iter(const struct bio *bio, for (iter = (start); \ (iter).bi_size && \ ((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \ - bio_advance_iter((bio), &(iter), (bvl).bv_len)) + bio_advance_iter_single((bio), &(iter), (bvl).bv_len)) /* iterate over multi-page bvec */ #define bio_for_each_bvec(bvl, bio, iter) \ -- cgit From 93b8959a0a8cf1b1a493efee9e8328681e111862 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 15:24:41 -0500 Subject: NFS: More readdir cleanups Remove the redundant caching of the credential in struct nfs_open_dir_context. Pass the buffer size as an argument to nfs_readdir_xdr_filler(). 
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 25 +++++++++++-------------- include/linux/nfs_fs.h | 1 - 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 438906dae083..bc366bd8e8f3 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -68,7 +68,7 @@ const struct address_space_operations nfs_dir_aops = { .freepage = nfs_readdir_clear_array, }; -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, const struct cred *cred) +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; @@ -78,7 +78,6 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->attr_gencount = nfsi->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; - ctx->cred = get_cred(cred); spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -96,7 +95,6 @@ static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_cont spin_lock(&dir->i_lock); list_del(&ctx->list); spin_unlock(&dir->i_lock); - put_cred(ctx->cred); kfree(ctx); } @@ -113,7 +111,7 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); - ctx = alloc_nfs_open_dir_context(inode, current_cred()); + ctx = alloc_nfs_open_dir_context(inode); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; @@ -468,12 +466,12 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) } /* Fill a page with xdr information before transferring to the cache page */ -static -int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, - struct nfs_entry *entry, struct file *file, struct inode *inode) +static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, + u64 cookie, struct page **pages, + size_t bufsize) { - struct nfs_open_dir_context *ctx = file->private_data; - const struct cred *cred = ctx->cred; + struct file *file = desc->file; + struct inode *inode = file_inode(file); unsigned long timestamp, gencount; int error; @@ -481,8 +479,8 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); - error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, - NFS_SERVER(inode)->dtsize, desc->plus); + error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred, + cookie, pages, bufsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { @@ -764,7 +762,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, { struct page **pages; struct nfs_entry entry; - struct file *file = desc->file; size_t array_size; size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; @@ -791,8 +788,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, do { unsigned int pglen; - status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); - + status = nfs_readdir_xdr_filler(desc, entry.cookie, + pages, dtsize); if (status < 0) break; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a2c6455ea3fa..dd6b463dda80 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -88,7 +88,6 @@ struct 
nfs_open_context { struct nfs_open_dir_context { struct list_head list; - const struct cred *cred; unsigned long attr_gencount; __u64 dir_cookie; __u64 dup_cookie; -- cgit From 82e22a5e6245873779db1607d3b0fec6f9ca07d0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 17:34:23 -0500 Subject: NFS: Allow the NFS generic code to pass in a verifier to readdir If we're ever going to allow support for servers that use the readdir verifier, then that use needs to be managed by the middle layers as those need to be able to reject cookies from other verifiers. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 23 ++++++++++++++++++----- fs/nfs/nfs3proc.c | 35 +++++++++++++++++------------------ fs/nfs/nfs4proc.c | 36 +++++++++++++++++------------------- fs/nfs/proc.c | 18 +++++++++--------- include/linux/nfs_xdr.h | 17 +++++++++++++++-- 5 files changed, 76 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b226f6f3ae96..3ee0668a9719 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -469,8 +469,20 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, u64 cookie, struct page **pages, size_t bufsize) { - struct file *file = desc->file; - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(desc->file); + __be32 verf_res[2]; + struct nfs_readdir_arg arg = { + .dentry = file_dentry(desc->file), + .cred = desc->file->f_cred, + .verf = NFS_I(inode)->cookieverf, + .cookie = cookie, + .pages = pages, + .page_len = bufsize, + .plus = desc->plus, + }; + struct nfs_readdir_res res = { + .verf = verf_res, + }; unsigned long timestamp, gencount; int error; @@ -478,20 +490,21 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); - error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred, - cookie, pages, bufsize, desc->plus); + error = NFS_PROTO(inode)->readdir(&arg, &res); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); - desc->plus = false; + desc->plus = arg.plus = false; goto again; } goto error; } desc->timestamp = timestamp; desc->gencount = gencount; + memcpy(NFS_I(inode)->cookieverf, res.verf, + sizeof(NFS_I(inode)->cookieverf)); error: return error; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 6b66b73a50eb..5c4e23abc345 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -662,37 +662,36 @@ out: * Also note that this implementation handles both plain readdir and * readdirplus. 
*/ -static int -nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); - __be32 *verf = NFS_I(dir)->cookieverf; + struct inode *dir = d_inode(nr_arg->dentry); struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .verf = {verf[0], verf[1]}, - .plus = plus, - .count = count, - .pages = pages + .cookie = nr_arg->cookie, + .plus = nr_arg->plus, + .count = nr_arg->page_len, + .pages = nr_arg->pages }; struct nfs3_readdirres res = { - .verf = verf, - .plus = plus + .verf = nr_res->verf, + .plus = nr_arg->plus, }; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status = -ENOMEM; - if (plus) + if (nr_arg->plus) msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; + if (arg.cookie) + memcpy(arg.verf, nr_arg->verf, sizeof(arg.verf)); - dprintk("NFS call readdir%s %d\n", - plus? "plus" : "", (unsigned int) cookie); + dprintk("NFS call readdir%s %llu\n", nr_arg->plus ? "plus" : "", + (unsigned long long)nr_arg->cookie); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) @@ -705,8 +704,8 @@ nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, nfs_free_fattr(res.dir_attr); out: - dprintk("NFS reply readdir%s: %d\n", - plus? "plus" : "", status); + dprintk("NFS reply readdir%s: %d\n", nr_arg->plus ? "plus" : "", + status); return status; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 66f1f4a5c74c..adcaba68eaed 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4961,41 +4961,40 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, return err; } -static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_server *server = NFS_SERVER(dir); struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), - .pages = pages, + .pages = nr_arg->pages, .pgbase = 0, - .count = count, - .plus = plus, + .count = nr_arg->page_len, + .plus = nr_arg->plus, }; struct nfs4_readdir_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, - dentry, - (unsigned long long)cookie); + dprintk("%s: dentry = %pd2, cookie = %llu\n", __func__, + nr_arg->dentry, (unsigned long long)nr_arg->cookie); if (!(server->caps & NFS_CAP_SECURITY_LABEL)) args.bitmask = server->attr_bitmask_nl; else args.bitmask = server->attr_bitmask; - nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); + nfs4_setup_readdir(nr_arg->cookie, nr_arg->verf, nr_arg->dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status >= 0) { - memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); + memcpy(nr_res->verf, res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; } @@ -5005,19 +5004,18 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, return status; } -static 
int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs4_proc_readdir(struct nfs_readdir_arg *arg, + struct nfs_readdir_res *res) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_proc_readdir(dentry, cred, cookie, - pages, count, plus); - trace_nfs4_readdir(d_inode(dentry), err); - err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err, - &exception); + err = _nfs4_proc_readdir(arg, res); + trace_nfs4_readdir(d_inode(arg->dentry), err); + err = nfs4_handle_exception(NFS_SERVER(d_inode(arg->dentry)), + err, &exception); } while (exception.retry); return err; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 15c865cc837f..73ab7c59d3a7 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -499,26 +499,26 @@ nfs_proc_rmdir(struct inode *dir, const struct qstr *name) * sure it is syntactically correct; the entries itself are decoded * from nfs_readdir by calling the decode_entry function directly. */ -static int -nfs_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .count = count, - .pages = pages, + .cookie = nr_arg->cookie, + .count = nr_arg->page_len, + .pages = nr_arg->pages, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], .rpc_argp = &arg, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("NFS call readdir %d\n", (unsigned int)cookie); + dprintk("NFS call readdir %llu\n", (unsigned long long)nr_arg->cookie); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nr_res->verf[0] = nr_res->verf[1] = 0; nfs_invalidate_atime(dir); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d63cb862d58e..3327239fa2f9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -750,6 +750,20 @@ struct nfs_entry { struct nfs_server * server; }; +struct nfs_readdir_arg { + struct dentry *dentry; + const struct cred *cred; + __be32 *verf; + u64 cookie; + struct page **pages; + unsigned int page_len; + bool plus; +}; + +struct nfs_readdir_res { + __be32 *verf; +}; + /* * The following types are for NFSv2 only. */ @@ -1744,8 +1758,7 @@ struct nfs_rpc_ops { unsigned int, struct iattr *); int (*mkdir) (struct inode *, struct dentry *, struct iattr *); int (*rmdir) (struct inode *, const struct qstr *); - int (*readdir) (struct dentry *, const struct cred *, - u64, struct page **, unsigned int, bool); + int (*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *); int (*mknod) (struct inode *, struct dentry *, struct iattr *, dev_t); int (*statfs) (struct nfs_server *, struct nfs_fh *, -- cgit From b593c09f83a2732a0f0298c8f3468236a83cdd9f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 20:06:12 -0500 Subject: NFS: Improve handling of directory verifiers If the server insists on using the readdir verifiers in order to allow cookies to expire, then we should ensure that we cache the verifier with the cookie, so that we can return an error if the application tries to use the expired cookie. 
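In outline, a sketch condensed from the fs/nfs/dir.c changes below: when refilling the directory page cache, pass in the verifier that was saved with the cookie, and only commit the server's new verifier once the fill has succeeded:

	__be32 verf[NFS_DIR_VERIFIER_SIZE];

	/* use the verifier cached alongside the cookie for this READDIR */
	res = nfs_readdir_xdr_to_array(desc, desc->page,
				       nfsi->cookieverf, verf);
	if (res >= 0)
		/* server accepted it; remember the verifier it returned */
		memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf));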
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 35 +++++++++++++++++++++++------------ fs/nfs/inode.c | 7 ------- include/linux/nfs_fs.h | 8 +++++++- 3 files changed, 30 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3b44bef3a1b4..454377228167 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -155,6 +155,7 @@ struct nfs_readdir_descriptor { loff_t current_index; loff_t prev_index; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; @@ -466,15 +467,15 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) /* Fill a page with xdr information before transferring to the cache page */ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, - u64 cookie, struct page **pages, - size_t bufsize) + __be32 *verf, u64 cookie, + struct page **pages, size_t bufsize, + __be32 *verf_res) { struct inode *inode = file_inode(desc->file); - __be32 verf_res[2]; struct nfs_readdir_arg arg = { .dentry = file_dentry(desc->file), .cred = desc->file->f_cred, - .verf = NFS_I(inode)->cookieverf, + .verf = verf, .cookie = cookie, .pages = pages, .page_len = bufsize, @@ -503,8 +504,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, } desc->timestamp = timestamp; desc->gencount = gencount; - memcpy(NFS_I(inode)->cookieverf, res.verf, - sizeof(NFS_I(inode)->cookieverf)); error: return error; } @@ -770,11 +769,13 @@ out_freepages: } static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, - struct page *page, struct inode *inode) + struct page *page, __be32 *verf_arg, + __be32 *verf_res) { struct page **pages; struct nfs_entry *entry; size_t array_size; + struct inode *inode = file_inode(desc->file); size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; @@ -801,8 +802,9 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, do { unsigned int pglen; - status = nfs_readdir_xdr_filler(desc, entry->cookie, - pages, dtsize); + status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, + pages, dtsize, + verf_res); if (status < 0) break; @@ -854,13 +856,15 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { struct inode *inode = file_inode(desc->file); struct nfs_inode *nfsi = NFS_I(inode); + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; desc->page = nfs_readdir_page_get_cached(desc); if (!desc->page) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { - res = nfs_readdir_xdr_to_array(desc, desc->page, inode); + res = nfs_readdir_xdr_to_array(desc, desc->page, + nfsi->cookieverf, verf); if (res < 0) { nfs_readdir_page_unlock_and_put_cached(desc); if (res == -EBADCOOKIE || res == -ENOTSYNC) { @@ -870,6 +874,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) } return res; } + memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); } res = nfs_readdir_search_array(desc); if (res == 0) { @@ -902,6 +907,7 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) { struct file *file = desc->file; + struct nfs_inode *nfsi = NFS_I(file_inode(file)); struct nfs_cache_array *array; unsigned int i = 0; @@ -915,6 +921,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) desc->eof = true; break; } + memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf)); 
if (i < (array->size-1)) desc->dir_cookie = array->array[i+1].cookie; else @@ -949,8 +956,8 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) static int uncached_readdir(struct nfs_readdir_descriptor *desc) { struct page *page = NULL; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int status; - struct inode *inode = file_inode(desc->file); dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)desc->dir_cookie); @@ -967,7 +974,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) desc->duped = 0; nfs_readdir_page_init_array(page, desc->dir_cookie); - status = nfs_readdir_xdr_to_array(desc, page, inode); + status = nfs_readdir_xdr_to_array(desc, page, desc->verf, verf); if (status < 0) goto out_release; @@ -1023,6 +1030,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->dup_cookie = dir_ctx->dup_cookie; desc->duped = dir_ctx->duped; desc->attr_gencount = dir_ctx->attr_gencount; + memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); spin_unlock(&file->f_lock); do { @@ -1061,6 +1069,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->dup_cookie = desc->dup_cookie; dir_ctx->duped = desc->duped; dir_ctx->attr_gencount = desc->attr_gencount; + memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); kfree(desc); @@ -1101,6 +1110,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->dir_cookie = offset; else dir_ctx->dir_cookie = 0; + if (offset == 0) + memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; } spin_unlock(&filp->f_lock); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index aa6493905bbe..9b765a900b28 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -229,7 +229,6 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA @@ -1237,7 +1236,6 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { - struct nfs_inode *nfsi = NFS_I(inode); int ret; if (mapping->nrpages != 0) { @@ -1250,11 +1248,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (ret < 0) return ret; } - if (S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); - } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index dd6b463dda80..681ed98e4ba8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -45,6 +45,11 @@ */ #define NFS_RPC_SWAPFLAGS (RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS) +/* + * Size of the NFS directory verifier + */ +#define NFS_DIR_VERIFIER_SIZE 2 + /* * NFSv3/v4 Access mode cache entry */ @@ -89,6 +94,7 @@ struct nfs_open_context { struct nfs_open_dir_context { struct list_head list; unsigned long attr_gencount; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 dup_cookie; signed char duped; @@ -156,7 +162,7 @@ struct nfs_inode { * This is the cookie verifier used for NFSv3 readdir * operations */ - __be32 cookieverf[2]; + __be32 cookieverf[NFS_DIR_VERIFIER_SIZE]; atomic_long_t nrequests; struct nfs_mds_commit_info 
commit_info; -- cgit From d5aa6b22e2258f05317313ecc02efbb988ed6d38 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Nov 2020 16:33:38 -0500 Subject: SUNRPC: xprt_load_transport() needs to support the netid "rdma6" According to RFC5666, the correct netid for an IPv6-addressed RDMA transport is "rdma6", which we've supported as a mount option since Linux-4.7. The problem is that an attempt to load the module "xprtrdma6" will fail, since there is no module alias of that name. Fixes: 181342c5ebe8 ("xprtrdma: Add rdma6 option to support NFS/RDMA IPv6") Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 65 +++++++++++++++++++++++++++++++---------- net/sunrpc/xprtrdma/module.c | 1 + net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtsock.c | 4 +++ 5 files changed, 56 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index a603d48d2b2c..3ac5037d1c3d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -330,6 +330,7 @@ struct xprt_class { struct rpc_xprt * (*setup)(struct xprt_create *); struct module *owner; char name[32]; + const char * netid[]; }; /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index f6c17e75f20e..57f09ea3ef2a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,31 +151,64 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); +static void +xprt_class_release(const struct xprt_class *t) +{ + module_put(t->owner); +} + +static const struct xprt_class * +xprt_class_find_by_netid_locked(const char *netid) +{ + const struct xprt_class *t; + unsigned int i; + + list_for_each_entry(t, &xprt_list, list) { + for (i = 0; t->netid[i][0] != '\0'; i++) { + if (strcmp(t->netid[i], netid) != 0) + continue; + if (!try_module_get(t->owner)) + continue; + return t; + } + } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_netid(const char *netid) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + if (!t) { + spin_unlock(&xprt_list_lock); + request_module("rpc%s", netid); + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + } + spin_unlock(&xprt_list_lock); + return t; +} + /** * xprt_load_transport - load a transport implementation - * @transport_name: transport to load + * @netid: transport to load * * Returns: * 0: transport successfully loaded * -ENOENT: transport module not available */ -int xprt_load_transport(const char *transport_name) +int xprt_load_transport(const char *netid) { - struct xprt_class *t; - int result; + const struct xprt_class *t; - result = 0; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (strcmp(t->name, transport_name) == 0) { - spin_unlock(&xprt_list_lock); - goto out; - } - } - spin_unlock(&xprt_list_lock); - result = request_module("xprt%s", transport_name); -out: - return result; + t = xprt_class_find_by_netid(netid); + if (!t) + return -ENOENT; + xprt_class_release(t); + return 0; } EXPORT_SYMBOL_GPL(xprt_load_transport); diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 620327c01302..45c5b41ac8dc 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("svcrdma"); MODULE_ALIAS("xprtrdma"); +MODULE_ALIAS("rpcrdma6"); static void __exit rpc_rdma_cleanup(void) { diff --git
a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8915e42240d3..035060c05fd5 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -768,6 +768,7 @@ static struct xprt_class xprt_rdma = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_RDMA, .setup = xprt_setup_rdma, + .netid = { "rdma", "rdma6", "" }, }; void xprt_rdma_cleanup(void) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7090bbee0ec5..c93ff70da3f9 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3059,6 +3059,7 @@ static struct xprt_class xs_local_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_LOCAL, .setup = xs_setup_local, + .netid = { "" }, }; static struct xprt_class xs_udp_transport = { @@ -3067,6 +3068,7 @@ static struct xprt_class xs_udp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_UDP, .setup = xs_setup_udp, + .netid = { "udp", "udp6", "" }, }; static struct xprt_class xs_tcp_transport = { @@ -3075,6 +3077,7 @@ static struct xprt_class xs_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP, .setup = xs_setup_tcp, + .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_bc_tcp_transport = { @@ -3083,6 +3086,7 @@ static struct xprt_class xs_bc_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_BC_TCP, .setup = xs_setup_bc_tcp, + .netid = { "" }, }; /** -- cgit From 1fc5f13186440973e1aa1d85aa263326756af431 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Nov 2020 09:41:21 -0500 Subject: SUNRPC: Add a helper to return the transport identifier given a netid Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 3ac5037d1c3d..f7b75c72f80e 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -386,6 +386,7 @@ xprt_disable_swap(struct rpc_xprt *xprt) int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); int xprt_load_transport(const char *); +int xprt_find_transport_ident(const char *); void xprt_wait_for_reply_request_def(struct rpc_task *task); void xprt_wait_for_reply_request_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index bf490d0c98c6..23452f57d369 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -219,22 +219,39 @@ xprt_class_find_by_netid(const char *netid) } /** - * xprt_load_transport - load a transport implementation + * xprt_find_transport_ident - convert a netid into a transport identifier * @netid: transport to load * * Returns: - * 0: transport successfully loaded + * > 0: transport identifier * -ENOENT: transport module not available */ -int xprt_load_transport(const char *netid) +int xprt_find_transport_ident(const char *netid) { const struct xprt_class *t; + int ret; t = xprt_class_find_by_netid(netid); if (!t) return -ENOENT; + ret = t->ident; xprt_class_release(t); - return 0; + return ret; +} +EXPORT_SYMBOL_GPL(xprt_find_transport_ident); + +/** + * xprt_load_transport - load a transport implementation + * @netid: transport to load + * + * Returns: + * 0: transport successfully loaded + * -ENOENT: transport module not available + */ +int xprt_load_transport(const char *netid) +{ + int ret = xprt_find_transport_ident(netid); + return ret < 0 ? 
ret : 0; } EXPORT_SYMBOL_GPL(xprt_load_transport); -- cgit From c87b056e58e71ba7a3f603700618f8da9742aa29 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Nov 2020 10:32:14 -0500 Subject: SUNRPC: Remove unused function xprt_load_transport() Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 - net/sunrpc/xprt.c | 15 --------------- 2 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index f7b75c72f80e..d2e97ee802af 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -385,7 +385,6 @@ xprt_disable_swap(struct rpc_xprt *xprt) */ int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); -int xprt_load_transport(const char *); int xprt_find_transport_ident(const char *); void xprt_wait_for_reply_request_def(struct rpc_task *task); void xprt_wait_for_reply_request_rtt(struct rpc_task *task); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 23452f57d369..691ccf8049a4 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -240,21 +240,6 @@ int xprt_find_transport_ident(const char *netid) } EXPORT_SYMBOL_GPL(xprt_find_transport_ident); -/** - * xprt_load_transport - load a transport implementation - * @netid: transport to load - * - * Returns: - * 0: transport successfully loaded - * -ENOENT: transport module not available - */ -int xprt_load_transport(const char *netid) -{ - int ret = xprt_find_transport_ident(netid); - return ret < 0 ? ret : 0; -} -EXPORT_SYMBOL_GPL(xprt_load_transport); - static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; -- cgit From 8a6a5920d3286eb0eae9f36a4ec4fc9df511eccb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Dec 2020 12:57:30 +0100 Subject: sched/vtime: Consolidate IRQ time accounting The 3 architectures implementing CONFIG_VIRT_CPU_ACCOUNTING_NATIVE all have their own version of IRQ time accounting that dispatches the cputime to the appropriate index: hardirq, softirq, system, idle, guest... from an all-in-one function. Instead of having these ad-hoc versions, move the cputime destination dispatch decision to the core code and leave only the actual per-index cputime accounting to the architecture.
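A condensed sketch of the resulting core-side dispatch (see the kernel/sched/cputime.c hunk below for the final form, which also handles the CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE case); the architecture now only supplies the per-index accounting functions:

void vtime_account_irq(struct task_struct *tsk)
{
	if (hardirq_count())
		vtime_account_hardirq(tsk);	/* arch hook: hardirq time */
	else if (in_serving_softirq())
		vtime_account_softirq(tsk);	/* arch hook: softirq time */
	else if (is_idle_task(tsk))
		vtime_account_idle(tsk);	/* arch hook: idle time */
	else
		vtime_account_kernel(tsk);	/* arch hook: system/guest */
}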
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201202115732.27827-4-frederic@kernel.org --- arch/ia64/kernel/time.c | 20 ++++++++++++----- arch/powerpc/kernel/time.c | 56 +++++++++++++++++++++++++++++++++------------- arch/s390/kernel/vtime.c | 45 ++++++++++++++++++++++++++----------- include/linux/vtime.h | 16 +++++-------- kernel/sched/cputime.c | 13 +++++++---- 5 files changed, 102 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 7abc5f37bfaf..733e0e3324b8 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -138,12 +138,8 @@ void vtime_account_kernel(struct task_struct *tsk) struct thread_info *ti = task_thread_info(tsk); __u64 stime = vtime_delta(tsk); - if ((tsk->flags & PF_VCPU) && !irq_count()) + if (tsk->flags & PF_VCPU) ti->gtime += stime; - else if (hardirq_count()) - ti->hardirq_time += stime; - else if (in_serving_softirq()) - ti->softirq_time += stime; else ti->stime += stime; } @@ -156,6 +152,20 @@ void vtime_account_idle(struct task_struct *tsk) ti->idle_time += vtime_delta(tsk); } +void vtime_account_softirq(struct task_struct *tsk) +{ + struct thread_info *ti = task_thread_info(tsk); + + ti->softirq_time += vtime_delta(tsk); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + struct thread_info *ti = task_thread_info(tsk); + + ti->hardirq_time += vtime_delta(tsk); +} + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static irqreturn_t diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 74efe46f5532..cf3f8db7e0e3 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -311,12 +311,11 @@ static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, return stime_scaled; } -static unsigned long vtime_delta(struct task_struct *tsk, +static unsigned long vtime_delta(struct cpu_accounting_data *acct, unsigned long *stime_scaled, unsigned long *steal_time) { unsigned long now, stime; - struct cpu_accounting_data *acct = get_accounting(tsk); WARN_ON_ONCE(!irqs_disabled()); @@ -331,29 +330,30 @@ static unsigned long vtime_delta(struct task_struct *tsk, return stime; } +static void vtime_delta_kernel(struct cpu_accounting_data *acct, + unsigned long *stime, unsigned long *stime_scaled) +{ + unsigned long steal_time; + + *stime = vtime_delta(acct, stime_scaled, &steal_time); + *stime -= min(*stime, steal_time); + acct->steal_time += steal_time; +} + void vtime_account_kernel(struct task_struct *tsk) { - unsigned long stime, stime_scaled, steal_time; struct cpu_accounting_data *acct = get_accounting(tsk); + unsigned long stime, stime_scaled; - stime = vtime_delta(tsk, &stime_scaled, &steal_time); - - stime -= min(stime, steal_time); - acct->steal_time += steal_time; + vtime_delta_kernel(acct, &stime, &stime_scaled); - if ((tsk->flags & PF_VCPU) && !irq_count()) { + if (tsk->flags & PF_VCPU) { acct->gtime += stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->utime_scaled += stime_scaled; #endif } else { - if (hardirq_count()) - acct->hardirq_time += stime; - else if (in_serving_softirq()) - acct->softirq_time += stime; - else - acct->stime += stime; - + acct->stime += stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->stime_scaled += stime_scaled; #endif @@ -366,10 +366,34 @@ void vtime_account_idle(struct task_struct *tsk) unsigned long stime, stime_scaled, steal_time; struct cpu_accounting_data *acct = get_accounting(tsk); - stime = vtime_delta(tsk, &stime_scaled, 
&steal_time); + stime = vtime_delta(acct, &stime_scaled, &steal_time); acct->idle_time += stime + steal_time; } +static void vtime_account_irq_field(struct cpu_accounting_data *acct, + unsigned long *field) +{ + unsigned long stime, stime_scaled; + + vtime_delta_kernel(acct, &stime, &stime_scaled); + *field += stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + acct->stime_scaled += stime_scaled; +#endif +} + +void vtime_account_softirq(struct task_struct *tsk) +{ + struct cpu_accounting_data *acct = get_accounting(tsk); + vtime_account_irq_field(acct, &acct->softirq_time); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + struct cpu_accounting_data *acct = get_accounting(tsk); + vtime_account_irq_field(acct, &acct->hardirq_time); +} + static void vtime_flush_scaled(struct task_struct *tsk, struct cpu_accounting_data *acct) { diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index ebd8e5655789..5aaa2ca6a928 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -222,31 +222,50 @@ void vtime_flush(struct task_struct *tsk) S390_lowcore.avg_steal_timer = avg_steal; } +static u64 vtime_delta(void) +{ + u64 timer = S390_lowcore.last_update_timer; + + S390_lowcore.last_update_timer = get_vtimer(); + + return timer - S390_lowcore.last_update_timer; +} + /* * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ void vtime_account_kernel(struct task_struct *tsk) { - u64 timer; - - timer = S390_lowcore.last_update_timer; - S390_lowcore.last_update_timer = get_vtimer(); - timer -= S390_lowcore.last_update_timer; + u64 delta = vtime_delta(); - if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) - S390_lowcore.guest_timer += timer; - else if (hardirq_count()) - S390_lowcore.hardirq_timer += timer; - else if (in_serving_softirq()) - S390_lowcore.softirq_timer += timer; + if (tsk->flags & PF_VCPU) + S390_lowcore.guest_timer += delta; else - S390_lowcore.system_timer += timer; + S390_lowcore.system_timer += delta; - virt_timer_forward(timer); + virt_timer_forward(delta); } EXPORT_SYMBOL_GPL(vtime_account_kernel); +void vtime_account_softirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.softirq_timer += delta; + + virt_timer_forward(delta); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.hardirq_timer += delta; + + virt_timer_forward(delta); +} + /* * Sorted add to a list. List is linear searched until first bigger * element is found. 
diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 2cdeca062db3..6c9867419615 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -83,16 +83,12 @@ static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -extern void vtime_account_irq_enter(struct task_struct *tsk); -static inline void vtime_account_irq_exit(struct task_struct *tsk) -{ - /* On hard|softirq exit we always account to hard|softirq cputime */ - vtime_account_kernel(tsk); -} +extern void vtime_account_irq(struct task_struct *tsk); +extern void vtime_account_softirq(struct task_struct *tsk); +extern void vtime_account_hardirq(struct task_struct *tsk); extern void vtime_flush(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -static inline void vtime_account_irq_enter(struct task_struct *tsk) { } -static inline void vtime_account_irq_exit(struct task_struct *tsk) { } +static inline void vtime_account_irq(struct task_struct *tsk) { } static inline void vtime_flush(struct task_struct *tsk) { } #endif @@ -105,13 +101,13 @@ static inline void irqtime_account_irq(struct task_struct *tsk) { } static inline void account_irq_enter_time(struct task_struct *tsk) { - vtime_account_irq_enter(tsk); + vtime_account_irq(tsk); irqtime_account_irq(tsk); } static inline void account_irq_exit_time(struct task_struct *tsk) { - vtime_account_irq_exit(tsk); + vtime_account_irq(tsk); irqtime_account_irq(tsk); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2783162542b1..02163d4260d7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -417,13 +417,18 @@ void vtime_task_switch(struct task_struct *prev) } # endif -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq(struct task_struct *tsk) { - if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && - !in_interrupt() && is_idle_task(tsk)) + if (hardirq_count()) { + vtime_account_hardirq(tsk); + } else if (in_serving_softirq()) { + vtime_account_softirq(tsk); + } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && + is_idle_task(tsk)) { vtime_account_idle(tsk); - else + } else { vtime_account_kernel(tsk); + } } void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, -- cgit From d3759e7184f8f6187e62f8c4e7dcb1f6c47c075a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Dec 2020 12:57:31 +0100 Subject: irqtime: Move irqtime entry accounting after irq offset incrementation IRQ time entry is currently accounted before HARDIRQ_OFFSET or SOFTIRQ_OFFSET are incremented. This makes it convenient to decide to which index the accounted cputime should be dispatched. Unfortunately it prevents tick_irq_enter() from being called under HARDIRQ_OFFSET because tick_irq_enter() has to be called before the IRQ entry accounting due to the necessary clock catch-up. As a result we don't benefit from appropriate lockdep coverage on tick_irq_enter(). To prepare for fixing this, move the IRQ entry cputime accounting after the preempt offset is incremented. This requires the cputime dispatch code to handle the extra offset.
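A sketch of the extra-offset handling this requires (condensed from the kernel/sched/cputime.c hunk below): the dispatch now subtracts the offset that was just added, so it still classifies the time against the context in which it actually accrued:

void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
	/* preempt_count() already includes the new offset; peel it off */
	unsigned int pc = preempt_count() - offset;

	if (pc & HARDIRQ_OFFSET)
		vtime_account_hardirq(tsk);
	else if (pc & SOFTIRQ_OFFSET)
		vtime_account_softirq(tsk);
	else
		vtime_account_kernel(tsk);	/* idle case elided here */
}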
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201202115732.27827-5-frederic@kernel.org --- include/linux/hardirq.h | 4 ++-- include/linux/vtime.h | 34 ++++++++++++++++++++++++---------- kernel/sched/cputime.c | 18 +++++++++++------- kernel/softirq.c | 6 +++--- 4 files changed, 40 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 754f67ac4326..7c9d6a2d7e90 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -32,9 +32,9 @@ static __always_inline void rcu_irq_enter_check_tick(void) */ #define __irq_enter() \ do { \ - account_irq_enter_time(current); \ preempt_count_add(HARDIRQ_OFFSET); \ lockdep_hardirq_enter(); \ + account_hardirq_enter(current); \ } while (0) /* @@ -62,8 +62,8 @@ void irq_enter_rcu(void); */ #define __irq_exit() \ do { \ + account_hardirq_exit(current); \ lockdep_hardirq_exit(); \ - account_irq_exit_time(current); \ preempt_count_sub(HARDIRQ_OFFSET); \ } while (0) diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 6c9867419615..041d6524d144 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -83,32 +83,46 @@ static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -extern void vtime_account_irq(struct task_struct *tsk); +extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); extern void vtime_account_softirq(struct task_struct *tsk); extern void vtime_account_hardirq(struct task_struct *tsk); extern void vtime_flush(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -static inline void vtime_account_irq(struct task_struct *tsk) { } +static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } +static inline void vtime_account_softirq(struct task_struct *tsk) { } +static inline void vtime_account_hardirq(struct task_struct *tsk) { } static inline void vtime_flush(struct task_struct *tsk) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING -extern void irqtime_account_irq(struct task_struct *tsk); +extern void irqtime_account_irq(struct task_struct *tsk, unsigned int offset); #else -static inline void irqtime_account_irq(struct task_struct *tsk) { } +static inline void irqtime_account_irq(struct task_struct *tsk, unsigned int offset) { } #endif -static inline void account_irq_enter_time(struct task_struct *tsk) +static inline void account_softirq_enter(struct task_struct *tsk) { - vtime_account_irq(tsk); - irqtime_account_irq(tsk); + vtime_account_irq(tsk, SOFTIRQ_OFFSET); + irqtime_account_irq(tsk, SOFTIRQ_OFFSET); } -static inline void account_irq_exit_time(struct task_struct *tsk) +static inline void account_softirq_exit(struct task_struct *tsk) { - vtime_account_irq(tsk); - irqtime_account_irq(tsk); + vtime_account_softirq(tsk); + irqtime_account_irq(tsk, 0); +} + +static inline void account_hardirq_enter(struct task_struct *tsk) +{ + vtime_account_irq(tsk, HARDIRQ_OFFSET); + irqtime_account_irq(tsk, HARDIRQ_OFFSET); +} + +static inline void account_hardirq_exit(struct task_struct *tsk) +{ + vtime_account_hardirq(tsk); + irqtime_account_irq(tsk, 0); } #endif /* _LINUX_KERNEL_VTIME_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 02163d4260d7..5f611658eeab 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -44,12 +44,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, } /* - * 
Called before incrementing preempt_count on {soft,}irq_enter + * Called after incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void irqtime_account_irq(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr, unsigned int offset) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + unsigned int pc; s64 delta; int cpu; @@ -59,6 +60,7 @@ void irqtime_account_irq(struct task_struct *curr) cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; + pc = preempt_count() - offset; /* * We do not account for softirq time from ksoftirqd here. @@ -66,9 +68,9 @@ void irqtime_account_irq(struct task_struct *curr) * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (hardirq_count()) + if (pc & HARDIRQ_MASK) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } @@ -417,11 +419,13 @@ void vtime_task_switch(struct task_struct *prev) } # endif -void vtime_account_irq(struct task_struct *tsk) +void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { - if (hardirq_count()) { + unsigned int pc = preempt_count() - offset; + + if (pc & HARDIRQ_OFFSET) { vtime_account_hardirq(tsk); - } else if (in_serving_softirq()) { + } else if (pc & SOFTIRQ_OFFSET) { vtime_account_softirq(tsk); } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && is_idle_task(tsk)) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 617009ccd82c..b8f42b3ba8ca 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -315,10 +315,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - account_irq_enter_time(current); __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); in_hardirq = lockdep_softirq_start(); + account_softirq_enter(current); restart: /* Reset the pending bitmask before enabling irqs */ @@ -365,8 +365,8 @@ restart: wakeup_softirqd(); } + account_softirq_exit(current); lockdep_softirq_end(in_hardirq); - account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); current_restore_flags(old_flags, PF_MEMALLOC); @@ -418,7 +418,7 @@ static inline void __irq_exit_rcu(void) #else lockdep_assert_irqs_disabled(); #endif - account_irq_exit_time(current); + account_hardirq_exit(current); preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -- cgit From 278b13ce3a89698711c5a67792ba2dba41555433 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 2 Oct 2019 10:33:02 -0700 Subject: Input: remove input_polled_dev implementation Now that normal input devices support polling mode, and all users of input_polled_dev API have been converted, we can remove it. 
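For reference, converted drivers use the polling support built into the input core (input-poller.o, linked into input-core in the Makefile below). A minimal conversion sketch, assuming the input_setup_polling() and input_set_poll_interval() helpers from drivers/input/input-poller.c; the mydev_* names are hypothetical:

static void mydev_poll(struct input_dev *input)
{
	/* read the hardware state and report events here */
}

static int mydev_setup(struct input_dev *input)
{
	int error;

	/* hook the poll handler into a regular input device */
	error = input_setup_polling(input, mydev_poll);
	if (error)
		return error;

	input_set_poll_interval(input, 500);	/* msec */
	return input_register_device(input);
}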
Signed-off-by: Dmitry Torokhov --- Documentation/driver-api/input.rst | 9 - drivers/input/Kconfig | 13 -- drivers/input/Makefile | 1 - drivers/input/input-polldev.c | 362 ------------------------------------- include/linux/input-polldev.h | 58 ------ 5 files changed, 443 deletions(-) delete mode 100644 drivers/input/input-polldev.c delete mode 100644 include/linux/input-polldev.h (limited to 'include/linux') diff --git a/Documentation/driver-api/input.rst b/Documentation/driver-api/input.rst index d05bf58fa83e..4bbb26ae2a89 100644 --- a/Documentation/driver-api/input.rst +++ b/Documentation/driver-api/input.rst @@ -25,15 +25,6 @@ Multitouch Library .. kernel-doc:: drivers/input/input-mt.c :export: -Polled input devices --------------------- - -.. kernel-doc:: include/linux/input-polldev.h - :internal: - -.. kernel-doc:: drivers/input/input-polldev.c - :export: - Matrix keyboards/keypads ------------------------ diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig index 1efd3154b68d..ec0e861f185f 100644 --- a/drivers/input/Kconfig +++ b/drivers/input/Kconfig @@ -52,19 +52,6 @@ config INPUT_FF_MEMLESS To compile this driver as a module, choose M here: the module will be called ff-memless. -config INPUT_POLLDEV - tristate "Polled input device skeleton" - help - Say Y here if you are using a driver for an input - device that periodically polls hardware state. This - option is only useful for out-of-tree drivers since - in-tree drivers select it automatically. - - If unsure, say N. - - To compile this driver as a module, choose M here: the - module will be called input-polldev. - config INPUT_SPARSEKMAP tristate "Sparse keymap support library" help diff --git a/drivers/input/Makefile b/drivers/input/Makefile index e35650930371..d8f5310e22ba 100644 --- a/drivers/input/Makefile +++ b/drivers/input/Makefile @@ -9,7 +9,6 @@ obj-$(CONFIG_INPUT) += input-core.o input-core-y := input.o input-compat.o input-mt.o input-poller.o ff-core.o obj-$(CONFIG_INPUT_FF_MEMLESS) += ff-memless.o -obj-$(CONFIG_INPUT_POLLDEV) += input-polldev.o obj-$(CONFIG_INPUT_SPARSEKMAP) += sparse-keymap.o obj-$(CONFIG_INPUT_MATRIXKMAP) += matrix-keymap.o diff --git a/drivers/input/input-polldev.c b/drivers/input/input-polldev.c deleted file mode 100644 index 9bf1c9aeb4c4..000000000000 --- a/drivers/input/input-polldev.c +++ /dev/null @@ -1,362 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Generic implementation of a polled input device - - * Copyright (c) 2007 Dmitry Torokhov - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -MODULE_AUTHOR("Dmitry Torokhov "); -MODULE_DESCRIPTION("Generic implementation of a polled input device"); -MODULE_LICENSE("GPL v2"); - -static void input_polldev_queue_work(struct input_polled_dev *dev) -{ - unsigned long delay; - - delay = msecs_to_jiffies(dev->poll_interval); - if (delay >= HZ) - delay = round_jiffies_relative(delay); - - queue_delayed_work(system_freezable_wq, &dev->work, delay); -} - -static void input_polled_device_work(struct work_struct *work) -{ - struct input_polled_dev *dev = - container_of(work, struct input_polled_dev, work.work); - - dev->poll(dev); - input_polldev_queue_work(dev); -} - -static int input_open_polled_device(struct input_dev *input) -{ - struct input_polled_dev *dev = input_get_drvdata(input); - - if (dev->open) - dev->open(dev); - - /* Only start polling if polling is enabled */ - if (dev->poll_interval > 0) { - dev->poll(dev); - input_polldev_queue_work(dev); - } - - return 0; 
-} - -static void input_close_polled_device(struct input_dev *input) -{ - struct input_polled_dev *dev = input_get_drvdata(input); - - cancel_delayed_work_sync(&dev->work); - - if (dev->close) - dev->close(dev); -} - -/* SYSFS interface */ - -static ssize_t input_polldev_get_poll(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct input_polled_dev *polldev = dev_get_drvdata(dev); - - return sprintf(buf, "%d\n", polldev->poll_interval); -} - -static ssize_t input_polldev_set_poll(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct input_polled_dev *polldev = dev_get_drvdata(dev); - struct input_dev *input = polldev->input; - unsigned int interval; - int err; - - err = kstrtouint(buf, 0, &interval); - if (err) - return err; - - if (interval < polldev->poll_interval_min) - return -EINVAL; - - if (interval > polldev->poll_interval_max) - return -EINVAL; - - mutex_lock(&input->mutex); - - polldev->poll_interval = interval; - - if (input->users) { - cancel_delayed_work_sync(&polldev->work); - if (polldev->poll_interval > 0) - input_polldev_queue_work(polldev); - } - - mutex_unlock(&input->mutex); - - return count; -} - -static DEVICE_ATTR(poll, S_IRUGO | S_IWUSR, input_polldev_get_poll, - input_polldev_set_poll); - - -static ssize_t input_polldev_get_max(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct input_polled_dev *polldev = dev_get_drvdata(dev); - - return sprintf(buf, "%d\n", polldev->poll_interval_max); -} - -static DEVICE_ATTR(max, S_IRUGO, input_polldev_get_max, NULL); - -static ssize_t input_polldev_get_min(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct input_polled_dev *polldev = dev_get_drvdata(dev); - - return sprintf(buf, "%d\n", polldev->poll_interval_min); -} - -static DEVICE_ATTR(min, S_IRUGO, input_polldev_get_min, NULL); - -static struct attribute *sysfs_attrs[] = { - &dev_attr_poll.attr, - &dev_attr_max.attr, - &dev_attr_min.attr, - NULL -}; - -static struct attribute_group input_polldev_attribute_group = { - .attrs = sysfs_attrs -}; - -static const struct attribute_group *input_polldev_attribute_groups[] = { - &input_polldev_attribute_group, - NULL -}; - -/** - * input_allocate_polled_device - allocate memory for polled device - * - * The function allocates memory for a polled device and also - * for an input device associated with this polled device. 
- */ -struct input_polled_dev *input_allocate_polled_device(void) -{ - struct input_polled_dev *dev; - - dev = kzalloc(sizeof(struct input_polled_dev), GFP_KERNEL); - if (!dev) - return NULL; - - dev->input = input_allocate_device(); - if (!dev->input) { - kfree(dev); - return NULL; - } - - return dev; -} -EXPORT_SYMBOL(input_allocate_polled_device); - -struct input_polled_devres { - struct input_polled_dev *polldev; -}; - -static int devm_input_polldev_match(struct device *dev, void *res, void *data) -{ - struct input_polled_devres *devres = res; - - return devres->polldev == data; -} - -static void devm_input_polldev_release(struct device *dev, void *res) -{ - struct input_polled_devres *devres = res; - struct input_polled_dev *polldev = devres->polldev; - - dev_dbg(dev, "%s: dropping reference/freeing %s\n", - __func__, dev_name(&polldev->input->dev)); - - input_put_device(polldev->input); - kfree(polldev); -} - -static void devm_input_polldev_unregister(struct device *dev, void *res) -{ - struct input_polled_devres *devres = res; - struct input_polled_dev *polldev = devres->polldev; - - dev_dbg(dev, "%s: unregistering device %s\n", - __func__, dev_name(&polldev->input->dev)); - input_unregister_device(polldev->input); - - /* - * Note that we are still holding extra reference to the input - * device so it will stick around until devm_input_polldev_release() - * is called. - */ -} - -/** - * devm_input_allocate_polled_device - allocate managed polled device - * @dev: device owning the polled device being created - * - * Returns prepared &struct input_polled_dev or %NULL. - * - * Managed polled input devices do not need to be explicitly unregistered - * or freed as it will be done automatically when owner device unbinds - * from * its driver (or binding fails). Once such managed polled device - * is allocated, it is ready to be set up and registered in the same - * fashion as regular polled input devices (using - * input_register_polled_device() function). - * - * If you want to manually unregister and free such managed polled devices, - * it can be still done by calling input_unregister_polled_device() and - * input_free_polled_device(), although it is rarely needed. - * - * NOTE: the owner device is set up as parent of input device and users - * should not override it. - */ -struct input_polled_dev *devm_input_allocate_polled_device(struct device *dev) -{ - struct input_polled_dev *polldev; - struct input_polled_devres *devres; - - devres = devres_alloc(devm_input_polldev_release, sizeof(*devres), - GFP_KERNEL); - if (!devres) - return NULL; - - polldev = input_allocate_polled_device(); - if (!polldev) { - devres_free(devres); - return NULL; - } - - polldev->input->dev.parent = dev; - polldev->devres_managed = true; - - devres->polldev = polldev; - devres_add(dev, devres); - - return polldev; -} -EXPORT_SYMBOL(devm_input_allocate_polled_device); - -/** - * input_free_polled_device - free memory allocated for polled device - * @dev: device to free - * - * The function frees memory allocated for polling device and drops - * reference to the associated input device. 
- */ -void input_free_polled_device(struct input_polled_dev *dev) -{ - if (dev) { - if (dev->devres_managed) - WARN_ON(devres_destroy(dev->input->dev.parent, - devm_input_polldev_release, - devm_input_polldev_match, - dev)); - input_put_device(dev->input); - kfree(dev); - } -} -EXPORT_SYMBOL(input_free_polled_device); - -/** - * input_register_polled_device - register polled device - * @dev: device to register - * - * The function registers previously initialized polled input device - * with input layer. The device should be allocated with call to - * input_allocate_polled_device(). Callers should also set up poll() - * method and set up capabilities (id, name, phys, bits) of the - * corresponding input_dev structure. - */ -int input_register_polled_device(struct input_polled_dev *dev) -{ - struct input_polled_devres *devres = NULL; - struct input_dev *input = dev->input; - int error; - - if (dev->devres_managed) { - devres = devres_alloc(devm_input_polldev_unregister, - sizeof(*devres), GFP_KERNEL); - if (!devres) - return -ENOMEM; - - devres->polldev = dev; - } - - input_set_drvdata(input, dev); - INIT_DELAYED_WORK(&dev->work, input_polled_device_work); - - if (!dev->poll_interval) - dev->poll_interval = 500; - if (!dev->poll_interval_max) - dev->poll_interval_max = dev->poll_interval; - - input->open = input_open_polled_device; - input->close = input_close_polled_device; - - input->dev.groups = input_polldev_attribute_groups; - - error = input_register_device(input); - if (error) { - devres_free(devres); - return error; - } - - /* - * Take extra reference to the underlying input device so - * that it survives call to input_unregister_polled_device() - * and is deleted only after input_free_polled_device() - * has been invoked. This is needed to ease task of freeing - * sparse keymaps. - */ - input_get_device(input); - - if (dev->devres_managed) { - dev_dbg(input->dev.parent, "%s: registering %s with devres.\n", - __func__, dev_name(&input->dev)); - devres_add(input->dev.parent, devres); - } - - return 0; -} -EXPORT_SYMBOL(input_register_polled_device); - -/** - * input_unregister_polled_device - unregister polled device - * @dev: device to unregister - * - * The function unregisters previously registered polled input - * device from input layer. Polling is stopped and device is - * ready to be freed with call to input_free_polled_device(). - */ -void input_unregister_polled_device(struct input_polled_dev *dev) -{ - if (dev->devres_managed) - WARN_ON(devres_destroy(dev->input->dev.parent, - devm_input_polldev_unregister, - devm_input_polldev_match, - dev)); - - input_unregister_device(dev->input); -} -EXPORT_SYMBOL(input_unregister_polled_device); diff --git a/include/linux/input-polldev.h b/include/linux/input-polldev.h deleted file mode 100644 index 14821fd231c0..000000000000 --- a/include/linux/input-polldev.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _INPUT_POLLDEV_H -#define _INPUT_POLLDEV_H - -/* - * Copyright (c) 2007 Dmitry Torokhov - */ - -#include -#include - -/** - * struct input_polled_dev - simple polled input device - * @private: private driver data. - * @open: driver-supplied method that prepares device for polling - * (enabled the device and maybe flushes device state). - * @close: driver-supplied method that is called when device is no - * longer being polled. Used to put device into low power mode. - * @poll: driver-supplied method that polls the device and posts - * input events (mandatory). 
- * @poll_interval: specifies how often the poll() method should be called. - * Defaults to 500 msec unless overridden when registering the device. - * @poll_interval_max: specifies upper bound for the poll interval. - * Defaults to the initial value of @poll_interval. - * @poll_interval_min: specifies lower bound for the poll interval. - * Defaults to 0. - * @input: input device structure associated with the polled device. - * Must be properly initialized by the driver (id, name, phys, bits). - * - * Polled input device provides a skeleton for supporting simple input - * devices that do not raise interrupts but have to be periodically - * scanned or polled to detect changes in their state. - */ -struct input_polled_dev { - void *private; - - void (*open)(struct input_polled_dev *dev); - void (*close)(struct input_polled_dev *dev); - void (*poll)(struct input_polled_dev *dev); - unsigned int poll_interval; /* msec */ - unsigned int poll_interval_max; /* msec */ - unsigned int poll_interval_min; /* msec */ - - struct input_dev *input; - -/* private: */ - struct delayed_work work; - - bool devres_managed; -}; - -struct input_polled_dev *input_allocate_polled_device(void); -struct input_polled_dev *devm_input_allocate_polled_device(struct device *dev); -void input_free_polled_device(struct input_polled_dev *dev); -int input_register_polled_device(struct input_polled_dev *dev); -void input_unregister_polled_device(struct input_polled_dev *dev); - -#endif -- cgit From 427167c0b064ed898b848209add62b4322ec7840 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 2 Dec 2020 09:25:15 -0800 Subject: bpf: Allow bpf_{s,g}etsockopt from cgroup bind{4,6} hooks I now have to lock/unlock the socket around the bind hook execution. That shouldn't cause any overhead because the socket is unbound and shouldn't receive any traffic.
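For illustration, a hypothetical program using the newly allowed helper (a sketch only; it assumes the usual <linux/bpf.h> and <bpf/bpf_helpers.h> headers, and that SO_MARK is among the socket options bpf_setsockopt() accepts):

SEC("cgroup/bind4")
int bind_v4_prog(struct bpf_sock_addr *ctx)
{
	int mark = 42;

	/* callable from BPF_CGROUP_INET4_BIND after this change */
	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)))
		return 0;	/* reject the bind */
	return 1;		/* allow the bind */
}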
Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/20201202172516.3483656-3-sdf@google.com --- include/linux/bpf-cgroup.h | 12 ++++++------ net/core/filter.c | 4 ++++ net/ipv4/af_inet.c | 2 +- net/ipv6/af_inet6.c | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index ed71bd1a0825..72e69a0e1e8c 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -246,11 +246,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ sk->sk_prot->pre_connect) @@ -434,8 +434,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) diff --git a/net/core/filter.c b/net/core/filter.c index 2ca5eecebacf..21d91dcf0260 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6995,6 +6995,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_sock_addr_setsockopt_proto; @@ -7003,6 +7005,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_sock_addr_getsockopt_proto; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b7260c8cef2e..b94fa8eb831b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. 
*/ - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); if (err) return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e648fbebb167..a7e3d170af51 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); if (err) return err; -- cgit From ec0caa974cd092549ab282deb8ec7ea73b36eba0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 2 Dec 2020 18:20:37 -0800 Subject: fscrypt: introduce fscrypt_prepare_readdir() The last remaining use of fscrypt_get_encryption_info() from filesystems is for readdir (->iterate_shared()). Every other call is now in fs/crypto/ as part of some other higher-level operation. We need to add a new argument to fscrypt_get_encryption_info() to indicate whether the encryption policy is allowed to be unrecognized or not. Doing this is easier if we can work with high-level operations rather than direct filesystem use of fscrypt_get_encryption_info(). So add a function fscrypt_prepare_readdir() which wraps the call to fscrypt_get_encryption_info() for the readdir use case. Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201203022041.230976-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/hooks.c | 6 ++++++ fs/ext4/dir.c | 8 +++----- fs/ext4/namei.c | 2 +- fs/f2fs/dir.c | 2 +- fs/ubifs/dir.c | 2 +- include/linux/fscrypt.h | 24 ++++++++++++++++++++++++ 6 files changed, 36 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index c809a4afa057..82f351d3113a 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -114,6 +114,12 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); +int __fscrypt_prepare_readdir(struct inode *dir) +{ + return fscrypt_get_encryption_info(dir); +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir); + /** * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS * @inode: the inode on which flags are being changed diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 16bfbdd5007c..c6d16353326a 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -118,11 +118,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); - if (IS_ENCRYPTED(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return err; - } + err = fscrypt_prepare_readdir(inode); + if (err) + return err; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7b31aea3e025..5fa8436cd5fa 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1004,7 +1004,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { - err = fscrypt_get_encryption_info(dir); + err = fscrypt_prepare_readdir(dir); if (err < 0) { brelse(bh); return err; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 47bee953fc8d..049500f1e764 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1022,7 +1022,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) int err = 0; if (IS_ENCRYPTED(inode)) { - err = 
fscrypt_get_encryption_info(inode); + err = fscrypt_prepare_readdir(inode); if (err) goto out; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 009fbf844d3e..1f33a5598b93 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -514,7 +514,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) return 0; if (encrypted) { - err = fscrypt_get_encryption_info(dir); + err = fscrypt_prepare_readdir(dir); if (err) return err; diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 0c9e64969b73..8cbb26f55695 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -242,6 +242,7 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned int flags); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname); +int __fscrypt_prepare_readdir(struct inode *dir); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags); int fscrypt_prepare_symlink(struct inode *dir, const char *target, @@ -537,6 +538,11 @@ static inline int __fscrypt_prepare_lookup(struct inode *dir, return -EOPNOTSUPP; } +static inline int __fscrypt_prepare_readdir(struct inode *dir) +{ + return -EOPNOTSUPP; +} + static inline int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) @@ -795,6 +801,24 @@ static inline int fscrypt_prepare_lookup(struct inode *dir, return 0; } +/** + * fscrypt_prepare_readdir() - prepare to read a possibly-encrypted directory + * @dir: the directory inode + * + * If the directory is encrypted and it doesn't already have its encryption key + * set up, try to set it up so that the filenames will be listed in plaintext + * form rather than in no-key form. + * + * Return: 0 on success; -errno on error. Note that the encryption key being + * unavailable is not considered an error. + */ +static inline int fscrypt_prepare_readdir(struct inode *dir) +{ + if (IS_ENCRYPTED(dir)) + return __fscrypt_prepare_readdir(dir); + return 0; +} + /** * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's * attributes -- cgit From 7622350e5eda2cc57a72c6b27f1405d8b4f94670 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 2 Dec 2020 18:20:38 -0800 Subject: fscrypt: move body of fscrypt_prepare_setattr() out-of-line In preparation for reducing the visibility of fscrypt_require_key() by moving it to fscrypt_private.h, move the call to it from fscrypt_prepare_setattr() to an out-of-line function. 
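For context, a minimal sketch of how a filesystem's ->setattr() is expected to use this helper, as mentioned above; the filesystem name is illustrative and the pattern is modeled on the in-tree callers of fscrypt_prepare_setattr():

static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	int err;

	err = setattr_prepare(dentry, attr);
	if (err)
		return err;

	/* Truncating an encrypted file requires its key; after this
	 * patch the ATTR_SIZE check happens out-of-line in
	 * __fscrypt_prepare_setattr(). */
	err = fscrypt_prepare_setattr(dentry, attr);
	if (err)
		return err;

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}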
Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201203022041.230976-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/hooks.c | 8 ++++++++ include/linux/fscrypt.h | 11 +++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 82f351d3113a..1c16dba222d9 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -120,6 +120,14 @@ int __fscrypt_prepare_readdir(struct inode *dir) } EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir); +int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (attr->ia_valid & ATTR_SIZE) + return fscrypt_require_key(d_inode(dentry)); + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr); + /** * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS * @inode: the inode on which flags are being changed diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 8cbb26f55695..b20900bb829f 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -243,6 +243,7 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname); int __fscrypt_prepare_readdir(struct inode *dir); +int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags); int fscrypt_prepare_symlink(struct inode *dir, const char *target, @@ -543,6 +544,12 @@ static inline int __fscrypt_prepare_readdir(struct inode *dir) return -EOPNOTSUPP; } +static inline int __fscrypt_prepare_setattr(struct dentry *dentry, + struct iattr *attr) +{ + return -EOPNOTSUPP; +} + static inline int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) @@ -840,8 +847,8 @@ static inline int fscrypt_prepare_readdir(struct inode *dir) static inline int fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { - if (attr->ia_valid & ATTR_SIZE) - return fscrypt_require_key(d_inode(dentry)); + if (IS_ENCRYPTED(d_inode(dentry))) + return __fscrypt_prepare_setattr(dentry, attr); return 0; } -- cgit From de3cdc6e75179a2324c23400b21483a1372c95e1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 2 Dec 2020 18:20:39 -0800 Subject: fscrypt: move fscrypt_require_key() to fscrypt_private.h fscrypt_require_key() is now only used by files in fs/crypto/. So reduce its visibility to fscrypt_private.h. This is also a prerequisite for unexporting fscrypt_get_encryption_info(). Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201203022041.230976-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 26 ++++++++++++++++++++++++++ include/linux/fscrypt.h | 26 -------------------------- 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index a61d4dbf0a0b..16dd55080127 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -571,6 +571,32 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, void fscrypt_hash_inode_number(struct fscrypt_info *ci, const struct fscrypt_master_key *mk); +/** + * fscrypt_require_key() - require an inode's encryption key + * @inode: the inode we need the key for + * + * If the inode is encrypted, set up its encryption key if not already done. 
+ * Then require that the key be present and return -ENOKEY otherwise. + * + * No locks are needed, and the key will live as long as the struct inode --- so + * it won't go away from under you. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. + */ +static inline int fscrypt_require_key(struct inode *inode) +{ + if (IS_ENCRYPTED(inode)) { + int err = fscrypt_get_encryption_info(inode); + + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + return 0; +} + /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index b20900bb829f..a07610f27926 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -688,32 +688,6 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) return fscrypt_get_info(inode) != NULL; } -/** - * fscrypt_require_key() - require an inode's encryption key - * @inode: the inode we need the key for - * - * If the inode is encrypted, set up its encryption key if not already done. - * Then require that the key be present and return -ENOKEY otherwise. - * - * No locks are needed, and the key will live as long as the struct inode --- so - * it won't go away from under you. - * - * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code - * if a problem occurred while setting up the encryption key. - */ -static inline int fscrypt_require_key(struct inode *inode) -{ - if (IS_ENCRYPTED(inode)) { - int err = fscrypt_get_encryption_info(inode); - - if (err) - return err; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - return 0; -} - /** * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted * directory -- cgit From 5b421f08801fe8247dec368b3d323958f419e769 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 2 Dec 2020 18:20:40 -0800 Subject: fscrypt: unexport fscrypt_get_encryption_info() Now that fscrypt_get_encryption_info() is only called from files in fs/crypto/ (due to all key setup now being handled by higher-level helper functions instead of directly by filesystems), unexport it and move its declaration to fscrypt_private.h. 
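With the symbol private to fs/crypto/, a filesystem that used to call fscrypt_get_encryption_info() before iterating a directory now goes through the high-level wrapper instead. A hedged sketch with an illustrative filesystem name, following the ext4/f2fs/ubifs conversions shown earlier in this series:

static int myfs_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	int err;

	/* No explicit IS_ENCRYPTED() check is needed: the inline
	 * wrapper fscrypt_prepare_readdir() performs it and is a
	 * no-op for unencrypted directories. */
	err = fscrypt_prepare_readdir(inode);
	if (err)
		return err;

	/* ... emit entries via dir_emit() ... */
	return 0;
}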
Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201203022041.230976-9-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 2 ++ fs/crypto/keysetup.c | 1 - include/linux/fscrypt.h | 7 +------ 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 16dd55080127..c1c302656c34 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -571,6 +571,8 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, void fscrypt_hash_inode_number(struct fscrypt_info *ci, const struct fscrypt_master_key *mk); +int fscrypt_get_encryption_info(struct inode *inode); + /** * fscrypt_require_key() - require an inode's encryption key * @inode: the inode we need the key for diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 50675b42d5b7..6339b3069a40 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -589,7 +589,6 @@ int fscrypt_get_encryption_info(struct inode *inode) res = 0; return res; } -EXPORT_SYMBOL(fscrypt_get_encryption_info); /** * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index a07610f27926..4b163f5e58e9 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -75,7 +75,7 @@ struct fscrypt_operations { static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode) { /* - * Pairs with the cmpxchg_release() in fscrypt_get_encryption_info(). + * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). * I.e., another task may publish ->i_crypt_info concurrently, executing * a RELEASE barrier. We need to use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. @@ -200,7 +200,6 @@ int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ -int fscrypt_get_encryption_info(struct inode *inode); int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret); void fscrypt_put_encryption_info(struct inode *inode); @@ -408,10 +407,6 @@ static inline int fscrypt_ioctl_get_key_status(struct file *filp, } /* keysetup.c */ -static inline int fscrypt_get_encryption_info(struct inode *inode) -{ - return -EOPNOTSUPP; -} static inline int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, -- cgit From a14d0b6764917b21ee6fdfd2a8a4c2920fbefcce Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 2 Dec 2020 18:20:41 -0800 Subject: fscrypt: allow deleting files with unsupported encryption policy Currently it's impossible to delete files that use an unsupported encryption policy, as the kernel will just return an error when performing any operation on the top-level encrypted directory, even just a path lookup into the directory or opening the directory for readdir. More specifically, this occurs in any of the following cases: - The encryption context has an unrecognized version number. Current kernels know about v1 and v2, but there could be more versions in the future. - The encryption context has unrecognized encryption modes (FSCRYPT_MODE_*) or flags (FSCRYPT_POLICY_FLAG_*), an unrecognized combination of modes, or reserved bits set. 
- The encryption key has been added and the encryption modes are recognized but aren't available in the crypto API -- for example, a directory is encrypted with FSCRYPT_MODE_ADIANTUM but the kernel doesn't have CONFIG_CRYPTO_ADIANTUM enabled. It's desirable to return errors for most operations on files that use an unsupported encryption policy, but the current behavior is too strict. We need to allow enough to delete files, so that people can't be stuck with undeletable files when downgrading kernel versions. That includes allowing directories to be listed and allowing dentries to be looked up. Fix this by modifying the key setup logic to treat an unsupported encryption policy in the same way as "key unavailable" in the cases that are required for a recursive delete to work: preparing for a readdir or a dentry lookup, revalidating a dentry, or checking whether an inode has the same encryption policy as its parent directory. Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201203022041.230976-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/fname.c | 8 ++++++-- fs/crypto/fscrypt_private.h | 4 ++-- fs/crypto/hooks.c | 4 ++-- fs/crypto/keysetup.c | 19 +++++++++++++++++-- fs/crypto/policy.c | 22 ++++++++++++++-------- include/linux/fscrypt.h | 9 ++++++--- 6 files changed, 47 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 1fbe6c24d705..988dadf7a94d 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -404,7 +404,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = fscrypt_get_encryption_info(dir); + ret = fscrypt_get_encryption_info(dir, lookup); if (ret) return ret; @@ -560,7 +560,11 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; dir = dget_parent(dentry); - err = fscrypt_get_encryption_info(d_inode(dir)); + /* + * Pass allow_unsupported=true, so that files with an unsupported + * encryption policy can be deleted. 
+ */ + err = fscrypt_get_encryption_info(d_inode(dir), true); valid = !fscrypt_has_encryption_key(d_inode(dir)); dput(dir); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index c1c302656c34..f0bed6b06fa6 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -571,7 +571,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, void fscrypt_hash_inode_number(struct fscrypt_info *ci, const struct fscrypt_master_key *mk); -int fscrypt_get_encryption_info(struct inode *inode); +int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported); /** * fscrypt_require_key() - require an inode's encryption key @@ -589,7 +589,7 @@ int fscrypt_get_encryption_info(struct inode *inode); static inline int fscrypt_require_key(struct inode *inode) { if (IS_ENCRYPTED(inode)) { - int err = fscrypt_get_encryption_info(inode); + int err = fscrypt_get_encryption_info(inode, false); if (err) return err; diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 1c16dba222d9..79570e0e8e61 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -116,7 +116,7 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); int __fscrypt_prepare_readdir(struct inode *dir) { - return fscrypt_get_encryption_info(dir); + return fscrypt_get_encryption_info(dir, true); } EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir); @@ -332,7 +332,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, * Try to set up the symlink's encryption key, but we can continue * regardless of whether the key is available or not. */ - err = fscrypt_get_encryption_info(inode); + err = fscrypt_get_encryption_info(inode, false); if (err) return ERR_PTR(err); has_key = fscrypt_has_encryption_key(inode); diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 6339b3069a40..261293fb7097 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -546,6 +546,11 @@ out: /** * fscrypt_get_encryption_info() - set up an inode's encryption key * @inode: the inode to set up the key for. Must be encrypted. + * @allow_unsupported: if %true, treat an unsupported encryption policy (or + * unrecognized encryption context) the same way as the key + * being unavailable, instead of returning an error. Use + * %false unless the operation being performed is needed in + * order for files (or directories) to be deleted. * * Set up ->i_crypt_info, if it hasn't already been done. * @@ -556,7 +561,7 @@ out: * encryption key is unavailable. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. 
*/ -int fscrypt_get_encryption_info(struct inode *inode) +int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) { int res; union fscrypt_context ctx; @@ -567,24 +572,34 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { + if (res == -ERANGE && allow_unsupported) + return 0; fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } res = fscrypt_policy_from_context(&policy, &ctx, res); if (res) { + if (allow_unsupported) + return 0; fscrypt_warn(inode, "Unrecognized or corrupt encryption context"); return res; } - if (!fscrypt_supported_policy(&policy, inode)) + if (!fscrypt_supported_policy(&policy, inode)) { + if (allow_unsupported) + return 0; return -EINVAL; + } res = fscrypt_setup_encryption_info(inode, &policy, fscrypt_context_nonce(&ctx), IS_CASEFOLDED(inode) && S_ISDIR(inode->i_mode)); + + if (res == -ENOPKG && allow_unsupported) /* Algorithm unavailable? */ + res = 0; if (res == -ENOKEY) res = 0; return res; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index faa0f21daa68..a51cef6bd27f 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -590,7 +590,7 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_nonce); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { union fscrypt_policy parent_policy, child_policy; - int err; + int err, err1, err2; /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && @@ -620,19 +620,25 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) * In any case, if an unexpected error occurs, fall back to "forbidden". */ - err = fscrypt_get_encryption_info(parent); + err = fscrypt_get_encryption_info(parent, true); if (err) return 0; - err = fscrypt_get_encryption_info(child); + err = fscrypt_get_encryption_info(child, true); if (err) return 0; - err = fscrypt_get_policy(parent, &parent_policy); - if (err) - return 0; + err1 = fscrypt_get_policy(parent, &parent_policy); + err2 = fscrypt_get_policy(child, &child_policy); - err = fscrypt_get_policy(child, &child_policy); - if (err) + /* + * Allow the case where the parent and child both have an unrecognized + * encryption policy, so that files with an unrecognized encryption + * policy can be deleted. + */ + if (err1 == -EINVAL && err2 == -EINVAL) + return 1; + + if (err1 || err2) return 0; return fscrypt_policies_equal(&parent_policy, &child_policy); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 4b163f5e58e9..d23156d1ac94 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -753,8 +753,9 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir, * * Prepare for ->lookup() in a directory which may be encrypted by determining * the name that will actually be used to search the directory on-disk. If the - * directory's encryption key is available, then the lookup is assumed to be by - * plaintext name; otherwise, it is assumed to be by no-key name. + * directory's encryption policy is supported by this kernel and its encryption + * key is available, then the lookup is assumed to be by plaintext name; + * otherwise, it is assumed to be by no-key name. * * This also installs a custom ->d_revalidate() method which will invalidate the * dentry if it was created without the key and the key is later added. @@ -786,7 +787,9 @@ static inline int fscrypt_prepare_lookup(struct inode *dir, * form rather than in no-key form. 
* * Return: 0 on success; -errno on error. Note that the encryption key being - * unavailable is not considered an error. + * unavailable is not considered an error. It is also not an error if + * the encryption policy is unsupported by this kernel; that is treated + * like the key being unavailable, so that files can still be deleted. */ static inline int fscrypt_prepare_readdir(struct inode *dir) { -- cgit From bcfe06bf2622f7c4899468e427683aec49070687 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:27 -0800 Subject: mm: memcontrol: Use helpers to read page's memcg data Patch series "mm: allow mapping accounted kernel pages to userspace", v6. Currently a non-slab kernel page which has been charged to a memory cgroup can't be mapped to userspace. The underlying reason is simple: PageKmemcg flag is defined as a page type (like buddy, offline, etc), so it takes a bit from the page->_mapcount counter. Pages with a type set can't be mapped to userspace. But in general the kmemcg flag has nothing to do with mapping to userspace. It only means that the page has been accounted by the page allocator, so it has to be properly uncharged on release. Some bpf maps map vmalloc-based memory to userspace, and their memory can't be accounted because of this implementation detail. This patchset removes this limitation by moving the PageKmemcg flag into one of the free bits of the page->mem_cgroup pointer. Also it formalizes accesses to the page->mem_cgroup and page->obj_cgroups using new helpers, adds several checks and removes a couple of obsolete functions. As a result, the code becomes more robust, with fewer open-coded bit tricks. This patch (of 4): Currently there are many open-coded reads of the page->mem_cgroup pointer, as well as a couple of read helpers, which are barely used. This creates an obstacle to reusing some bits of the pointer to store additional information. In fact, we already do this for slab pages, where the last bit indicates that a pointer has an attached vector of objcg pointers instead of a regular memcg pointer. This commit uses two existing helpers and introduces a new helper, converting all read sides to calls of these helpers: struct mem_cgroup *page_memcg(struct page *page); struct mem_cgroup *page_memcg_rcu(struct page *page); struct mem_cgroup *page_memcg_check(struct page *page); page_memcg_check() is intended to be used in cases when the page can be a slab page and have a memcg pointer pointing at an objcg vector. It checks the lowest bit and, if set, returns NULL. page_memcg() contains a VM_BUG_ON_PAGE() check for the page not being a slab page. To make sure nobody uses direct access, struct page's mem_cgroup/obj_cgroups is converted to unsigned long memcg_data. 
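The low-bit tagging that the series builds on can be shown in miniature. Below is a self-contained userspace analogy (an illustration, not kernel code) of why the lowest bit of a word-aligned pointer is free to record which kind of pointer memcg_data holds, mirroring the behavior of page_memcg_check():

#include <assert.h>

struct mem_cgroup { int id; };
struct obj_cgroup { int id; };

/* Bit 0 clear: a memcg pointer. Bit 0 set: an objcg vector. */
static unsigned long pack_memcg(struct mem_cgroup *memcg)
{
	return (unsigned long)memcg;
}

static unsigned long pack_objcgs(struct obj_cgroup **vec)
{
	return (unsigned long)vec | 0x1UL;
}

/* Mirrors page_memcg_check(): return NULL when the word holds an
 * objcg vector, since such a page belongs to no single memcg. */
static struct mem_cgroup *unpack_memcg_check(unsigned long memcg_data)
{
	if (memcg_data & 0x1UL)
		return NULL;
	return (struct mem_cgroup *)memcg_data;
}

int main(void)
{
	struct mem_cgroup m = { .id = 1 };
	static struct obj_cgroup *vec[4];

	assert(unpack_memcg_check(pack_memcg(&m)) == &m);
	assert(unpack_memcg_check(pack_objcgs(vec)) == NULL);
	return 0;
}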
Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com --- fs/buffer.c | 2 +- fs/iomap/buffered-io.c | 2 +- include/linux/memcontrol.h | 114 +++++++++++++++++++++++++++++++++--- include/linux/mm.h | 22 ------- include/linux/mm_types.h | 5 +- include/trace/events/writeback.h | 2 +- kernel/fork.c | 7 ++- mm/debug.c | 4 +- mm/huge_memory.c | 4 +- mm/memcontrol.c | 121 ++++++++++++++++++--------------------- mm/page_alloc.c | 4 +- mm/page_io.c | 6 +- mm/slab.h | 9 ++- mm/workingset.c | 2 +- 14 files changed, 184 insertions(+), 120 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 23f645657488..b56f99f82b5b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -657,7 +657,7 @@ int __set_page_dirty_buffers(struct page *page) } while (bh != head); } /* - * Lock out page->mem_cgroup migration to keep PageDirty + * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ lock_page_memcg(page); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 10cc7979ce38..16a1e82e3aeb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -650,7 +650,7 @@ iomap_set_page_dirty(struct page *page) return !TestSetPageDirty(page); /* - * Lock out page->mem_cgroup migration to keep PageDirty + * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ lock_page_memcg(page); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e391e3c56de5..f95c1433461c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -343,6 +343,79 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +/* + * page_memcg - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + return (struct mem_cgroup *)page->memcg_data; +} + +/* + * page_memcg_rcu - locklessly get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + */ +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + WARN_ON_ONCE(!rcu_read_lock_held()); + + return (struct mem_cgroup *)READ_ONCE(page->memcg_data); +} + +/* + * page_memcg_check - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. 
This function unlike page_memcg() can take any page + * as an argument. It has to be used in cases when it's not known if a page + * has an associated memory cgroup pointer or an object cgroups vector. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + /* + * Because page->memcg_data might be changed asynchronously + * for slab pages, READ_ONCE() should be used here. + */ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + /* + * The lowest bit set means that memcg isn't a valid + * memcg pointer, but a obj_cgroups pointer. + * In this case the page is shared and doesn't belong + * to any specific memory cgroup. + */ + if (memcg_data & 0x1UL) + return NULL; + + return (struct mem_cgroup *)memcg_data; +} + static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) @@ -743,15 +816,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline void __mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - __mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + __mod_memcg_state(memcg, idx, val); } static inline void mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + mod_memcg_state(memcg, idx, val); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, @@ -834,16 +911,17 @@ static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ + struct mem_cgroup *memcg = page_memcg(head); pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; /* Untracked pages have no memcg, no lruvec. 
Update only the node */ - if (!head->mem_cgroup) { + if (!memcg) { __mod_node_page_state(pgdat, idx, val); return; } - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); + lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -878,8 +956,10 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, static inline void count_memcg_page_event(struct page *page, enum vm_event_item idx) { - if (page->mem_cgroup) - count_memcg_events(page->mem_cgroup, idx, 1); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + count_memcg_events(memcg, idx, 1); } static inline void count_memcg_event_mm(struct mm_struct *mm, @@ -941,6 +1021,22 @@ void mem_cgroup_split_huge_fixup(struct page *head); struct mem_cgroup; +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return NULL; +} + +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return NULL; +} + +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + return NULL; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; @@ -1430,7 +1526,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page, if (mem_cgroup_disabled()) return; - if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) + if (unlikely(&page_memcg(page)->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..6b0c9d2c1d10 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1484,28 +1484,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return page->mem_cgroup; -} -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return READ_ONCE(page->mem_cgroup); -} -#else -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return NULL; -} -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return NULL; -} -#endif - /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5a9238f6caad..80f5d755c037 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -199,10 +199,7 @@ struct page { atomic_t _refcount; #ifdef CONFIG_MEMCG - union { - struct mem_cgroup *mem_cgroup; - struct obj_cgroup **obj_cgroups; - }; + unsigned long memcg_data; #endif /* diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index e7cbccc7c14c..39a40dfb578a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -257,7 +257,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? 
inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); + __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..cbd4f6f58409 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -404,9 +404,10 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { /* - * If memcg_kmem_charge_page() fails, page->mem_cgroup - * pointer is NULL, and memcg_kmem_uncharge_page() in - * free_thread_stack() will ignore this page. + * If memcg_kmem_charge_page() fails, page's + * memory cgroup pointer is NULL, and + * memcg_kmem_uncharge_page() in free_thread_stack() + * will ignore this page. */ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); diff --git a/mm/debug.c b/mm/debug.c index ccca576b2899..8a40b3fefbeb 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,8 +182,8 @@ hex_only: pr_warn("page dumped because: %s\n", reason); #ifdef CONFIG_MEMCG - if (!page_poisoned && page->mem_cgroup) - pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup); + if (!page_poisoned && page->memcg_data) + pr_warn("pages's memcg:%lx\n", page->memcg_data); #endif } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9474dbc150ed..cedfb3503411 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -470,7 +470,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) #ifdef CONFIG_MEMCG static inline struct deferred_split *get_deferred_split_queue(struct page *page) { - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); if (memcg) @@ -2765,7 +2765,7 @@ void deferred_split_huge_page(struct page *page) { struct deferred_split *ds_queue = get_deferred_split_queue(page); #ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); #endif unsigned long flags; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dcbf24d2227..3968d68503cb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -533,7 +533,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; @@ -560,16 +560,7 @@ ino_t page_cgroup_ino(struct page *page) unsigned long ino = 0; rcu_read_lock(); - memcg = page->mem_cgroup; - - /* - * The lowest bit set means that memcg isn't a valid - * memcg pointer, but a obj_cgroups pointer. - * In this case the page is shared and doesn't belong - * to any specific memory cgroup. 
- */ - if ((unsigned long) memcg & 0x1UL) - memcg = NULL; + memcg = page_memcg_check(page); while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); @@ -1050,7 +1041,7 @@ EXPORT_SYMBOL(get_mem_cgroup_from_mm); */ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); if (mem_cgroup_disabled()) return NULL; @@ -1349,7 +1340,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd goto out; } - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* * Swapcache readahead pages are added to the LRU - and * possibly migrated - before they are charged. @@ -2109,7 +2100,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } /** - * lock_page_memcg - lock a page->mem_cgroup binding + * lock_page_memcg - lock a page and memcg binding * @page: the page * * This function protects unlocked LRU pages from being moved to @@ -2141,7 +2132,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return NULL; again: - memcg = head->mem_cgroup; + memcg = page_memcg(head); if (unlikely(!memcg)) return NULL; @@ -2149,7 +2140,7 @@ again: return memcg; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != head->mem_cgroup) { + if (memcg != page_memcg(head)) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2187,14 +2178,14 @@ void __unlock_page_memcg(struct mem_cgroup *memcg) } /** - * unlock_page_memcg - unlock a page->mem_cgroup binding + * unlock_page_memcg - unlock a page and memcg binding * @page: the page */ void unlock_page_memcg(struct page *page) { struct page *head = compound_head(page); - __unlock_page_memcg(head->mem_cgroup); + __unlock_page_memcg(page_memcg(head)); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2884,7 +2875,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - VM_BUG_ON_PAGE(page->mem_cgroup, page); + VM_BUG_ON_PAGE(page_memcg(page), page); /* * Any of the following ensures page->mem_cgroup stability: * @@ -2893,7 +2884,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) * - lock_page_memcg() * - exclusive reference */ - page->mem_cgroup = memcg; + page->memcg_data = (unsigned long)memcg; } #ifdef CONFIG_MEMCG_KMEM @@ -2908,8 +2899,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (cmpxchg(&page->obj_cgroups, NULL, - (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) + if (cmpxchg(&page->memcg_data, 0, (unsigned long)vec | 0x1UL)) kfree(vec); else kmemleak_not_leak(vec); @@ -2920,6 +2910,12 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, /* * Returns a pointer to the memory cgroup to which the kernel object is charged. * + * A passed kernel object can be a slab object or a generic kernel page, so + * different mechanisms for getting the memory cgroup pointer should be used. + * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller + * can not know for sure how the kernel object is implemented. + * mem_cgroup_from_obj() can be safely used in such cases. + * * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. 
*/ @@ -2932,17 +2928,6 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) page = virt_to_head_page(p); - /* - * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer - * or a pointer to obj_cgroup vector. In the latter case the lowest - * bit of the pointer is set. - * The page->mem_cgroup pointer can be asynchronously changed - * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed - * from a valid memcg pointer to objcg vector or back. - */ - if (!page->mem_cgroup) - return NULL; - /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in @@ -2960,8 +2945,14 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) return NULL; } - /* All other pages use page->mem_cgroup */ - return page->mem_cgroup; + /* + * page_memcg_check() is used here, because page_has_obj_cgroups() + * check above could fail because the object cgroups vector wasn't set + * at that moment, but it can be set concurrently. + * page_memcg_check(page) will guarantee that a proper memory + * cgroup pointer or NULL will be returned. + */ + return page_memcg_check(page); } __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) @@ -3099,7 +3090,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) if (memcg && !mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { - page->mem_cgroup = memcg; + page->memcg_data = (unsigned long)memcg; __SetPageKmemcg(page); return 0; } @@ -3115,7 +3106,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); unsigned int nr_pages = 1 << order; if (!memcg) @@ -3123,7 +3114,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); __memcg_kmem_uncharge(memcg, nr_pages); - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&memcg->css); /* slab pages do not have PageKmemcg flag set */ @@ -3274,7 +3265,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) */ void mem_cgroup_split_huge_fixup(struct page *head) { - struct mem_cgroup *memcg = head->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(head); int i; if (mem_cgroup_disabled()) @@ -3282,7 +3273,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { css_get(&memcg->css); - head[i].mem_cgroup = memcg; + head[i].memcg_data = (unsigned long)memcg; } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -4664,7 +4655,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now; @@ -5641,14 +5632,14 @@ static int mem_cgroup_move_account(struct page *page, /* * Prevent mem_cgroup_migrate() from looking at - * page->mem_cgroup of its source page while we change it. + * page's memory cgroup of its source page while we change it. 
*/ ret = -EBUSY; if (!trylock_page(page)) goto out; ret = -EINVAL; - if (page->mem_cgroup != from) + if (page_memcg(page) != from) goto out_unlock; pgdat = page_pgdat(page); @@ -5703,13 +5694,13 @@ static int mem_cgroup_move_account(struct page *page, /* * All state has been migrated, let's switch to the new memcg. * - * It is safe to change page->mem_cgroup here because the page + * It is safe to change page's memcg here because the page * is referenced, charged, isolated, and locked: we can't race * with (un)charging, migration, LRU putback, or anything else - * that would rely on a stable page->mem_cgroup. + * that would rely on a stable page's memory cgroup. * * Note that lock_page_memcg is a memcg lock, not a page lock, - * to save space. As soon as we switch page->mem_cgroup to a + * to save space. As soon as we switch page's memory cgroup to a * new memcg that isn't locked, the above state can change * concurrently again. Make sure we're truly done with it. */ @@ -5718,7 +5709,7 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); - page->mem_cgroup = to; + page->memcg_data = (unsigned long)to; __unlock_page_memcg(from); @@ -5784,7 +5775,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (is_device_private_page(page)) ret = MC_TARGET_DEVICE; @@ -5828,7 +5819,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) return ret; - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (target) { get_page(page); @@ -6774,12 +6765,12 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. page->mem_cgroup is protected - * by the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. + * already charged pages, too. page and memcg binding is + * protected by the page lock, which serializes swap cache + * removal, which in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound_head(page)->mem_cgroup) + if (page_memcg(compound_head(page))) goto out; id = lookup_swap_cgroup_id(ent); @@ -6863,21 +6854,21 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) VM_BUG_ON_PAGE(PageLRU(page), page); - if (!page->mem_cgroup) + if (!page_memcg(page)) return; /* * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point, we have fully + * page_memcg(page) at this point, we have fully * exclusive access to the page. 
*/ - if (ug->memcg != page->mem_cgroup) { + if (ug->memcg != page_memcg(page)) { if (ug->memcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = page->mem_cgroup; + ug->memcg = page_memcg(page); /* pairs with css_put in uncharge_batch */ css_get(&ug->memcg->css); @@ -6894,7 +6885,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } ug->dummy_page = page; - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&ug->memcg->css); } @@ -6937,7 +6928,7 @@ void mem_cgroup_uncharge(struct page *page) return; /* Don't touch page->lru of any random page, pre-check: */ - if (!page->mem_cgroup) + if (!page_memcg(page)) return; uncharge_gather_clear(&ug); @@ -6987,11 +6978,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) return; /* Page cache replacement: new page already charged? */ - if (newpage->mem_cgroup) + if (page_memcg(newpage)) return; /* Swapcache readahead pages can get replaced before being charged */ - memcg = oldpage->mem_cgroup; + memcg = page_memcg(oldpage); if (!memcg) return; @@ -7186,7 +7177,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* Readahead page, never charged */ if (!memcg) @@ -7207,7 +7198,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(oldid, page); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - page->mem_cgroup = NULL; + page->memcg_data = 0; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); @@ -7250,7 +7241,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* Readahead page, never charged */ if (!memcg) @@ -7331,7 +7322,7 @@ bool mem_cgroup_swap_full(struct page *page) if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; - memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg) return false; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23f5066bd4a5..271133b8243b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1092,7 +1092,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page->mem_cgroup | + (unsigned long)page_memcg(page) | #endif (page->flags & check_flags))) return false; @@ -1117,7 +1117,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) + if (unlikely(page_memcg(page))) bad_reason = "page still charged to cgroup"; #endif return bad_reason; diff --git a/mm/page_io.c b/mm/page_io.c index 433df1263349..9bca17ecc4df 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -291,12 +291,14 @@ static inline void count_swpout_vm_event(struct page *page) static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) { struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; - if (!page->mem_cgroup) + memcg = page_memcg(page); + if (!memcg) return; rcu_read_lock(); - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } diff --git a/mm/slab.h b/mm/slab.h index 6d7c6a5056ba..e2535cee0d33 100644 --- a/mm/slab.h +++ b/mm/slab.h 
@@ -242,18 +242,17 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla static inline struct obj_cgroup **page_obj_cgroups(struct page *page) { /* - * page->mem_cgroup and page->obj_cgroups are sharing the same + * Page's memory cgroup and obj_cgroups vector are sharing the same * space. To distinguish between them in case we don't know for sure * that the page is a slab page (e.g. page_cgroup_ino()), let's * always set the lowest bit of obj_cgroups. */ - return (struct obj_cgroup **) - ((unsigned long)page->obj_cgroups & ~0x1UL); + return (struct obj_cgroup **)(page->memcg_data & ~0x1UL); } static inline bool page_has_obj_cgroups(struct page *page) { - return ((unsigned long)page->obj_cgroups & 0x1UL); + return page->memcg_data & 0x1UL; } int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, @@ -262,7 +261,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, static inline void memcg_free_page_obj_cgroups(struct page *page) { kfree(page_obj_cgroups(page)); - page->obj_cgroups = NULL; + page->memcg_data = 0; } static inline size_t obj_full_size(struct kmem_cache *s) diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..130348cbf40a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -257,7 +257,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) struct lruvec *lruvec; int memcgid; - /* Page is fully exclusive and pins page->mem_cgroup */ + /* Page is fully exclusive and pins page's memory cgroup pointer */ VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); -- cgit From 270c6a71460e12b07b1dcadf7457ff95b6c6e8f4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:28 -0800 Subject: mm: memcontrol/slab: Use helpers to access slab page's memcg_data To gather all direct accesses to struct page's memcg_data field in one place, let's introduce 3 new helpers to use in the slab accounting code: struct obj_cgroup **page_objcgs(struct page *page); struct obj_cgroup **page_objcgs_check(struct page *page); bool set_page_objcgs(struct page *page, struct obj_cgroup **objcgs); They are similar to the corresponding API for generic pages, except that the setter can return false, indicating that the value has already been set by a different thread. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20201027001657.3398190-3-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-3-guro@fb.com --- include/linux/memcontrol.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 6 ++--- mm/slab.h | 35 ++++++------------------- 3 files changed, 75 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f95c1433461c..c7ac0a5b8989 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -416,6 +416,70 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return (struct mem_cgroup *)memcg_data; } +#ifdef CONFIG_MEMCG_KMEM +/* + * page_objcgs - get the object cgroups vector associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroups vector associated with the page, + * or NULL. This function assumes that the page is known to have an + * associated object cgroups vector. 
It's not safe to call this function + * against pages, which might have an associated memory cgroup: e.g. + * kernel stack pages. + */ +static inline struct obj_cgroup **page_objcgs(struct page *page) +{ + return (struct obj_cgroup **)(READ_ONCE(page->memcg_data) & ~0x1UL); +} + +/* + * page_objcgs_check - get the object cgroups vector associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroups vector associated with the page, + * or NULL. This function is safe to use if the page can be directly associated + * with a memory cgroup. + */ +static inline struct obj_cgroup **page_objcgs_check(struct page *page) +{ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + if (memcg_data && (memcg_data & 0x1UL)) + return (struct obj_cgroup **)(memcg_data & ~0x1UL); + + return NULL; +} + +/* + * set_page_objcgs - associate a page with a object cgroups vector + * @page: a pointer to the page struct + * @objcgs: a pointer to the object cgroups vector + * + * Atomically associates a page with a vector of object cgroups. + */ +static inline bool set_page_objcgs(struct page *page, + struct obj_cgroup **objcgs) +{ + return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | 0x1UL); +} +#else +static inline struct obj_cgroup **page_objcgs(struct page *page) +{ + return NULL; +} + +static inline struct obj_cgroup **page_objcgs_check(struct page *page) +{ + return NULL; +} + +static inline bool set_page_objcgs(struct page *page, + struct obj_cgroup **objcgs) +{ + return true; +} +#endif + static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3968d68503cb..0054b4846770 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2899,7 +2899,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (cmpxchg(&page->memcg_data, 0, (unsigned long)vec | 0x1UL)) + if (!set_page_objcgs(page, vec)) kfree(vec); else kmemleak_not_leak(vec); @@ -2933,12 +2933,12 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) * Memcg membership data for each individual object is saved in * the page->obj_cgroups. */ - if (page_has_obj_cgroups(page)) { + if (page_objcgs_check(page)) { struct obj_cgroup *objcg; unsigned int off; off = obj_to_index(page->slab_cache, page, p); - objcg = page_obj_cgroups(page)[off]; + objcg = page_objcgs(page)[off]; if (objcg) return obj_cgroup_memcg(objcg); diff --git a/mm/slab.h b/mm/slab.h index e2535cee0d33..9a54a0cb5cca 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -239,28 +239,12 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla } #ifdef CONFIG_MEMCG_KMEM -static inline struct obj_cgroup **page_obj_cgroups(struct page *page) -{ - /* - * Page's memory cgroup and obj_cgroups vector are sharing the same - * space. To distinguish between them in case we don't know for sure - * that the page is a slab page (e.g. page_cgroup_ino()), let's - * always set the lowest bit of obj_cgroups. 
- */ - return (struct obj_cgroup **)(page->memcg_data & ~0x1UL); -} - -static inline bool page_has_obj_cgroups(struct page *page) -{ - return page->memcg_data & 0x1UL; -} - int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, gfp_t gfp); static inline void memcg_free_page_obj_cgroups(struct page *page) { - kfree(page_obj_cgroups(page)); + kfree(page_objcgs(page)); page->memcg_data = 0; } @@ -322,7 +306,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, if (likely(p[i])) { page = virt_to_head_page(p[i]); - if (!page_has_obj_cgroups(page) && + if (!page_objcgs(page) && memcg_alloc_page_obj_cgroups(page, s, flags)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; @@ -330,7 +314,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, off = obj_to_index(s, page, p[i]); obj_cgroup_get(objcg); - page_obj_cgroups(page)[off] = objcg; + page_objcgs(page)[off] = objcg; mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), obj_full_size(s)); } else { @@ -344,6 +328,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, void **p, int objects) { struct kmem_cache *s; + struct obj_cgroup **objcgs; struct obj_cgroup *objcg; struct page *page; unsigned int off; @@ -357,7 +342,8 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, continue; page = virt_to_head_page(p[i]); - if (!page_has_obj_cgroups(page)) + objcgs = page_objcgs(page); + if (!objcgs) continue; if (!s_orig) @@ -366,11 +352,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, s = s_orig; off = obj_to_index(s, page, p[i]); - objcg = page_obj_cgroups(page)[off]; + objcg = objcgs[off]; if (!objcg) continue; - page_obj_cgroups(page)[off] = NULL; + objcgs[off] = NULL; obj_cgroup_uncharge(objcg, obj_full_size(s)); mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), -obj_full_size(s)); @@ -379,11 +365,6 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, } #else /* CONFIG_MEMCG_KMEM */ -static inline bool page_has_obj_cgroups(struct page *page) -{ - return false; -} - static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { return NULL; -- cgit From 87944e2992bd28098c6806086a1e96bb4d0e502b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:29 -0800 Subject: mm: Introduce page memcg flags The lowest bit in page->memcg_data is used to distinguish between struct memory_cgroup pointer and a pointer to a objcgs array. All checks and modifications of this bit are open-coded. Let's formalize it using page memcg flags, defined in enum page_memcg_data_flags. Additional flags might be added later. 
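The bit layout being formalized here is a plain tagged pointer: any naturally aligned allocation leaves its low address bits zero, so the lowest bit is free to carry the OBJCGS tag, and masking recovers the pointer. A minimal userspace model of the round trip (the flag and mask names mirror the hunk below; struct mem_cgroup is reduced to a stand-in):

#include <assert.h>
#include <stdio.h>

enum page_memcg_data_flags {
        MEMCG_DATA_OBJCGS = (1UL << 0),         /* low bit tags an objcgs vector */
        __NR_MEMCG_DATA_FLAGS = (1UL << 1),     /* first unused bit */
};
#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)

struct mem_cgroup { long pad; };                /* stand-in for the real struct */

int main(void)
{
        static struct mem_cgroup memcg;         /* aligned, so bit 0 is free */
        unsigned long memcg_data;

        /* Encode: a plain memcg pointer carries no flag bits. */
        memcg_data = (unsigned long)&memcg;
        assert(!(memcg_data & MEMCG_DATA_OBJCGS));

        /* Encode: an objcgs vector is the same pointer with the tag set. */
        memcg_data = (unsigned long)&memcg | MEMCG_DATA_OBJCGS;
        assert(memcg_data & MEMCG_DATA_OBJCGS);

        /* Decode: masking the flags off recovers the pointer either way. */
        assert((struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK) == &memcg);

        printf("flags mask: %#lx\n", (unsigned long)MEMCG_DATA_FLAGS_MASK);
        return 0;
}

Deriving the mask from the next-free-bit sentinel is what lets later flags be added without touching any decode site.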
Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-4-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-4-guro@fb.com --- include/linux/memcontrol.h | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c7ac0a5b8989..99a4841d658b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -343,6 +343,15 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +enum page_memcg_data_flags { + /* page->memcg_data is a pointer to an objcgs vector */ + MEMCG_DATA_OBJCGS = (1UL << 0), + /* the next bit after the last actual flag */ + __NR_MEMCG_DATA_FLAGS = (1UL << 1), +}; + +#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) + /* * page_memcg - get the memory cgroup associated with a page * @page: a pointer to the page struct @@ -404,13 +413,7 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) */ unsigned long memcg_data = READ_ONCE(page->memcg_data); - /* - * The lowest bit set means that memcg isn't a valid - * memcg pointer, but a obj_cgroups pointer. - * In this case the page is shared and doesn't belong - * to any specific memory cgroup. - */ - if (memcg_data & 0x1UL) + if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; return (struct mem_cgroup *)memcg_data; @@ -429,7 +432,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) */ static inline struct obj_cgroup **page_objcgs(struct page *page) { - return (struct obj_cgroup **)(READ_ONCE(page->memcg_data) & ~0x1UL); + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -444,10 +451,10 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) { unsigned long memcg_data = READ_ONCE(page->memcg_data); - if (memcg_data && (memcg_data & 0x1UL)) - return (struct obj_cgroup **)(memcg_data & ~0x1UL); + if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) + return NULL; - return NULL; + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -460,7 +467,8 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) static inline bool set_page_objcgs(struct page *page, struct obj_cgroup **objcgs) { - return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | 0x1UL); + return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | + MEMCG_DATA_OBJCGS); } #else static inline struct obj_cgroup **page_objcgs(struct page *page) -- cgit From 18b2db3b0385226b71cb3288474fa5a6e4a45474 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:30 -0800 Subject: mm: Convert page kmemcg type to a page memcg flag PageKmemcg flag is currently defined as a page type (like buddy, offline, table and guard). Semantically it means that the page was accounted as a kernel memory by the page allocator and has to be uncharged on the release. As a side effect of defining the flag as a page type, the accounted page can't be mapped to userspace (look at page_has_type() and comments above). In particular, this blocks the accounting of vmalloc-backed memory used by some bpf maps, because these maps do map the memory to userspace. 
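To make the mapping conflict concrete: page_type occupies the same storage as _mapcount, and a type is encoded by clearing bits of the all-ones "unmapped" value, which keeps the word deep in negative mapcount territory. A small userspace model (constants are copied from the include/linux/page-flags.h hunk below; the struct and page_has_type() are simplified stand-ins for their kernel counterparts):

#include <assert.h>
#include <stdio.h>

#define PAGE_TYPE_BASE          0xf0000000
#define PAGE_MAPCOUNT_RESERVE   (-128)
#define PG_buddy                0x00000080

struct page { unsigned int page_type; };        /* overlays _mapcount in the kernel */

static int page_has_type(const struct page *page)
{
        return (int)page->page_type < PAGE_MAPCOUNT_RESERVE;
}

static int PageBuddy(const struct page *page)
{
        return (page->page_type & (PAGE_TYPE_BASE | PG_buddy)) == PAGE_TYPE_BASE;
}

int main(void)
{
        /* Freshly freed page: _mapcount == -1, i.e. page_type is all ones. */
        struct page page = { .page_type = 0xffffffff };

        page.page_type &= ~PG_buddy;            /* set a type by clearing its bit */
        assert(page_has_type(&page) && PageBuddy(&page));

        /* One userspace mapping would raise the shared word to _mapcount == 0,
         * destroying the type encoding, so typed pages must stay unmapped. */
        page.page_type = 0;
        assert(!page_has_type(&page) && !PageBuddy(&page));

        puts("page types live in negative mapcount space");
        return 0;
}

That storage sharing is exactly why a page carrying the kmemcg page type could never be mapped to userspace, and why moving the flag into memcg_data lifts the restriction.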
One option is to fix it by complicating the access to page->mapcount, which provides some free bits for page->page_type. But it's way better to move this flag into page->memcg_data flags. Indeed, the flag makes no sense without memory cgroups enabled and, in particular, without the memory cgroup pointer set. This commit replaces PageKmemcg() and __SetPageKmemcg() with PageMemcgKmem() and an open-coded OR operation setting the memcg pointer with the MEMCG_DATA_KMEM bit. __ClearPageKmemcg() can be simply deleted, as the whole memcg_data is zeroed at once. As a bonus, on a !CONFIG_MEMCG build the PageMemcgKmem() check will be compiled out. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-5-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-5-guro@fb.com --- include/linux/memcontrol.h | 37 +++++++++++++++++++++++++++++++++---- include/linux/page-flags.h | 11 ++--------- mm/memcontrol.c | 16 +++++----------- mm/page_alloc.c | 4 ++-- 4 files changed, 42 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 99a4841d658b..7c9d43476166 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -346,8 +346,10 @@ enum page_memcg_data_flags { /* page->memcg_data is a pointer to an objcgs vector */ MEMCG_DATA_OBJCGS = (1UL << 0), + /* page has been accounted as a non-slab kernel page */ + MEMCG_DATA_KMEM = (1UL << 1), /* the next bit after the last actual flag */ - __NR_MEMCG_DATA_FLAGS = (1UL << 1), + __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) @@ -369,8 +371,12 @@ enum page_memcg_data_flags { */ static inline struct mem_cgroup *page_memcg(struct page *page) { + unsigned long memcg_data = page->memcg_data; + VM_BUG_ON_PAGE(PageSlab(page), page); - return (struct mem_cgroup *)page->memcg_data; + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -387,7 +393,8 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) VM_BUG_ON_PAGE(PageSlab(page), page); WARN_ON_ONCE(!rcu_read_lock_held()); - return (struct mem_cgroup *)READ_ONCE(page->memcg_data); + return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) & + ~MEMCG_DATA_FLAGS_MASK); } /* @@ -416,7 +423,21 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; - return (struct mem_cgroup *)memcg_data; + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * PageMemcgKmem - check if the page has MemcgKmem flag set + * @page: a pointer to the page struct + * + * Checks if the page has MemcgKmem flag set. The caller must ensure that + * the page has an associated memory cgroup. It's not safe to call this function + * against some types of pages, e.g. slab pages.
+ */ +static inline bool PageMemcgKmem(struct page *page) +{ + VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page); + return page->memcg_data & MEMCG_DATA_KMEM; } #ifdef CONFIG_MEMCG_KMEM @@ -435,6 +456,7 @@ static inline struct obj_cgroup **page_objcgs(struct page *page) unsigned long memcg_data = READ_ONCE(page->memcg_data); VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } @@ -454,6 +476,8 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) return NULL; + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } @@ -1109,6 +1133,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline bool PageMemcgKmem(struct page *page) +{ + return false; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4f6ba9379112..fc0e1bd48e73 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -715,9 +715,8 @@ PAGEFLAG_FALSE(DoubleMap) #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 #define PG_offline 0x00000100 -#define PG_kmemcg 0x00000200 -#define PG_table 0x00000400 -#define PG_guard 0x00000800 +#define PG_table 0x00000200 +#define PG_guard 0x00000400 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -768,12 +767,6 @@ PAGE_TYPE_OPS(Buddy, buddy) */ PAGE_TYPE_OPS(Offline, offline) -/* - * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on - * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. - */ -PAGE_TYPE_OPS(Kmemcg, kmemcg) - /* * Marks pages in use as page tables. 
*/ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0054b4846770..e0366e306221 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3090,8 +3090,8 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) if (memcg && !mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { - page->memcg_data = (unsigned long)memcg; - __SetPageKmemcg(page); + page->memcg_data = (unsigned long)memcg | + MEMCG_DATA_KMEM; return 0; } css_put(&memcg->css); @@ -3116,10 +3116,6 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) __memcg_kmem_uncharge(memcg, nr_pages); page->memcg_data = 0; css_put(&memcg->css); - - /* slab pages do not have PageKmemcg flag set */ - if (PageKmemcg(page)) - __ClearPageKmemcg(page); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -6877,12 +6873,10 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) nr_pages = compound_nr(page); ug->nr_pages += nr_pages; - if (!PageKmemcg(page)) { - ug->pgpgout++; - } else { + if (PageMemcgKmem(page)) ug->nr_kmem += nr_pages; - __ClearPageKmemcg(page); - } + else + ug->pgpgout++; ug->dummy_page = page; page->memcg_data = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 271133b8243b..3c53018c9c61 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1214,7 +1214,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * Do not let hwpoison pages hit pcplists/buddy * Untie memcg state and reset page's owner */ - if (memcg_kmem_enabled() && PageKmemcg(page)) + if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); return false; @@ -1244,7 +1244,7 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageKmemcg(page)) + if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) bad += check_free_page(page); -- cgit From 48edc1f78aabeba35ed00e40c36f211de89e0090 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:32 -0800 Subject: bpf: Prepare for memcg-based memory accounting for bpf maps Bpf maps can be updated from an interrupt context and in such a case there is no process which can be charged. It makes the memory accounting of bpf maps non-trivial. Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel memcg accounting from interrupt contexts") and commit b87d8cefe43c ("mm, memcg: rework remote charging API to support nesting") it's finally possible. To make the ownership model simple and consistent, when the map is created, the memory cgroup of the current process is recorded. All subsequent allocations related to the bpf map are charged to the same memory cgroup. It includes allocations made by any processes (even if they do belong to a different cgroup) and from interrupts. This commit introduces 3 new helpers, which will be used by following commits to enable the accounting of bpf maps memory: - bpf_map_kmalloc_node() - bpf_map_kzalloc() - bpf_map_alloc_percpu() They are wrapping popular memory allocation functions. They set the active memory cgroup to the map's memory cgroup and add __GFP_ACCOUNT to the passed gfp flags. Then they call into the corresponding memory allocation function and restore the original active memory cgroup. These helpers are supposed to be used everywhere except the map creation path.
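The remote-charging pattern behind these wrappers is simple to model in userspace: remember the owner cgroup once at creation, then swap it into an "active" slot around each later allocation. In this sketch every name is a stand-in for the kernel API, and calloc() stands in for kzalloc() with __GFP_ACCOUNT:

#include <stdio.h>
#include <stdlib.h>

struct mem_cgroup { const char *name; };

/* Models the per-task active memcg that set_active_memcg() swaps. */
static __thread struct mem_cgroup *active_memcg;

static struct mem_cgroup *set_active_memcg(struct mem_cgroup *memcg)
{
        struct mem_cgroup *old = active_memcg;

        active_memcg = memcg;
        return old;
}

struct bpf_map { struct mem_cgroup *memcg; };

static void *map_kzalloc(const struct bpf_map *map, size_t size)
{
        struct mem_cgroup *old_memcg = set_active_memcg(map->memcg);
        void *ptr = calloc(1, size);    /* the charge lands on active_memcg */

        if (ptr)
                printf("charged %zu bytes to %s\n", size, active_memcg->name);
        set_active_memcg(old_memcg);
        return ptr;
}

int main(void)
{
        struct mem_cgroup owner = { "map-creator" };
        struct bpf_map map = { &owner };        /* recorded once, at creation */

        free(map_kzalloc(&map, 64));    /* any later caller charges the owner */
        return 0;
}

Whatever cgroup was active before the call is restored afterwards, so the charge is attributed to the map's owner regardless of which task or interrupt triggered the allocation.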
During the map creation when the map structure is allocated by itself, it cannot be passed to those helpers. In those cases default memory allocation function will be used with the __GFP_ACCOUNT flag. Signed-off-by: Roman Gushchin Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-7-guro@fb.com --- include/linux/bpf.h | 34 ++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1bcb6d7345c..e1f2c95c15ec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -20,6 +20,8 @@ #include #include #include +#include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -37,6 +39,7 @@ struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; +struct mem_cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -161,6 +164,9 @@ struct bpf_map { u32 btf_value_type_id; struct btf *btf; struct bpf_map_memory memory; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg; +#endif char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; bool bypass_spec_v1; @@ -1240,6 +1246,34 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); +#ifdef CONFIG_MEMCG_KMEM +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node); +void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, + size_t align, gfp_t flags); +#else +static inline void * +bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + return kmalloc_node(size, flags, node); +} + +static inline void * +bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) +{ + return kzalloc(size, flags); +} + +static inline void __percpu * +bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, + gfp_t flags) +{ + return __alloc_percpu_gfp(size, align, flags); +} +#endif + extern int sysctl_unprivileged_bpf_disabled; static inline bool bpf_allow_ptr_leaks(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f3fe9f53f93c..dedbf6d4cd84 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -31,6 +31,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -456,6 +457,65 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) __release(&map_idr_lock); } +#ifdef CONFIG_MEMCG_KMEM +static void bpf_map_save_memcg(struct bpf_map *map) +{ + map->memcg = get_mem_cgroup_from_mm(current->mm); +} + +static void bpf_map_release_memcg(struct bpf_map *map) +{ + mem_cgroup_put(map->memcg); +} + +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + struct mem_cgroup *old_memcg; + void *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); + set_active_memcg(old_memcg); + + return ptr; +} + +void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) +{ + struct mem_cgroup *old_memcg; + void *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = kzalloc(size, flags | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + + 
return ptr; +} + +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, + size_t align, gfp_t flags) +{ + struct mem_cgroup *old_memcg; + void __percpu *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + + return ptr; +} + +#else +static void bpf_map_save_memcg(struct bpf_map *map) +{ +} + +static void bpf_map_release_memcg(struct bpf_map *map) +{ +} +#endif + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { @@ -464,6 +524,7 @@ static void bpf_map_free_deferred(struct work_struct *work) bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); + bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); bpf_map_charge_finish(&mem); @@ -875,6 +936,8 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_sec; + bpf_map_save_memcg(map); + err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. -- cgit From 80ee81e0403c48f4eb342f7c8d40477c89b8836a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:58 -0800 Subject: bpf: Eliminate rlimit-based memory accounting infra for bpf maps Remove rlimit-based accounting infrastructure code, which is not used anymore. To provide a backward compatibility, use an approximation of the bpf map memory footprint as a "memlock" value, available to a user via map info. The approximation is based on the maximal number of elements and key and value sizes. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-33-guro@fb.com --- include/linux/bpf.h | 12 --- kernel/bpf/syscall.c | 96 ++++------------------ .../testing/selftests/bpf/progs/bpf_iter_bpf_map.c | 2 +- tools/testing/selftests/bpf/progs/map_ptr_kern.c | 7 -- 4 files changed, 17 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1f2c95c15ec..61331a148cde 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -138,11 +138,6 @@ struct bpf_map_ops { const struct bpf_iter_seq_info *iter_seq_info; }; -struct bpf_map_memory { - u32 pages; - struct user_struct *user; -}; - struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). 
@@ -163,7 +158,6 @@ struct bpf_map { u32 btf_key_type_id; u32 btf_value_type_id; struct btf *btf; - struct bpf_map_memory memory; #ifdef CONFIG_MEMCG_KMEM struct mem_cgroup *memcg; #endif @@ -1224,12 +1218,6 @@ void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); -int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); -void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); -void bpf_map_charge_finish(struct bpf_map_memory *mem); -void bpf_map_charge_move(struct bpf_map_memory *dst, - struct bpf_map_memory *src); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dff3a5f62d7a..29096d96d989 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -128,7 +128,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -static u32 bpf_map_value_size(struct bpf_map *map) +static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -346,77 +346,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) map->numa_node = bpf_map_attr_numa_node(attr); } -static int bpf_charge_memlock(struct user_struct *user, u32 pages) -{ - unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { - atomic_long_sub(pages, &user->locked_vm); - return -EPERM; - } - return 0; -} - -static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) -{ - if (user) - atomic_long_sub(pages, &user->locked_vm); -} - -int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) -{ - u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; - struct user_struct *user; - int ret; - - if (size >= U32_MAX - PAGE_SIZE) - return -E2BIG; - - user = get_current_user(); - ret = bpf_charge_memlock(user, pages); - if (ret) { - free_uid(user); - return ret; - } - - mem->pages = pages; - mem->user = user; - - return 0; -} - -void bpf_map_charge_finish(struct bpf_map_memory *mem) -{ - bpf_uncharge_memlock(mem->user, mem->pages); - free_uid(mem->user); -} - -void bpf_map_charge_move(struct bpf_map_memory *dst, - struct bpf_map_memory *src) -{ - *dst = *src; - - /* Make sure src will not be used for the redundant uncharging. 
*/ - memset(src, 0, sizeof(struct bpf_map_memory)); -} - -int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) -{ - int ret; - - ret = bpf_charge_memlock(map->memory.user, pages); - if (ret) - return ret; - map->memory.pages += pages; - return ret; -} - -void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) -{ - bpf_uncharge_memlock(map->memory.user, pages); - map->memory.pages -= pages; -} - static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -524,14 +453,11 @@ static void bpf_map_release_memcg(struct bpf_map *map) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - struct bpf_map_memory mem; - bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); - bpf_map_charge_finish(&mem); } static void bpf_map_put_uref(struct bpf_map *map) @@ -592,6 +518,19 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) } #ifdef CONFIG_PROC_FS +/* Provides an approximation of the map's memory footprint. + * Used only to provide a backward compatibility and display + * a reasonable "memlock" info. + */ +static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) +{ + unsigned long size; + + size = round_up(map->key_size + bpf_map_value_size(map), 8); + + return round_up(map->max_entries * size, PAGE_SIZE); +} + static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; @@ -610,7 +549,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" - "memlock:\t%llu\n" + "memlock:\t%lu\n" "map_id:\t%u\n" "frozen:\t%u\n", map->map_type, @@ -618,7 +557,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->value_size, map->max_entries, map->map_flags, - map->memory.pages * 1ULL << PAGE_SHIFT, + bpf_map_memory_footprint(map), map->id, READ_ONCE(map->frozen)); if (type) { @@ -861,7 +800,6 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); - struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; @@ -960,9 +898,7 @@ free_map_sec: security_bpf_map_free(map); free_map: btf_put(map->btf); - bpf_map_charge_move(&mem, &map->memory); map->ops->map_free(map); - bpf_map_charge_finish(&mem); return err; } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c index 08651b23edba..b83b5d2e17dc 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -23,6 +23,6 @@ int dump_bpf_map(struct bpf_iter__bpf_map *ctx) BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter, map->usercnt.counter, - map->memory.user->locked_vm.counter); + 0LLU); return 0; } diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index c325405751e2..d8850bc6a9f1 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -26,17 +26,12 @@ __u32 g_line = 0; return 0; \ }) -struct bpf_map_memory { - __u32 pages; -} __attribute__((preserve_access_index)); - struct bpf_map { enum bpf_map_type map_type; __u32 key_size; __u32 value_size; __u32 max_entries; __u32 id; - struct 
bpf_map_memory memory; } __attribute__((preserve_access_index)); static inline int check_bpf_map_fields(struct bpf_map *map, __u32 key_size, @@ -47,7 +42,6 @@ static inline int check_bpf_map_fields(struct bpf_map *map, __u32 key_size, VERIFY(map->value_size == value_size); VERIFY(map->max_entries == max_entries); VERIFY(map->id > 0); - VERIFY(map->memory.pages > 0); return 1; } @@ -60,7 +54,6 @@ static inline int check_bpf_map_ptr(struct bpf_map *indirect, VERIFY(indirect->value_size == direct->value_size); VERIFY(indirect->max_entries == direct->max_entries); VERIFY(indirect->id == direct->id); - VERIFY(indirect->memory.pages == direct->memory.pages); return 1; } -- cgit From 3ac1f01b43b6e2759cc34d3a715ba5eed04c5805 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:59 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for bpf progs Do not use rlimit-based memory accounting for bpf progs. It has been replaced with memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-34-guro@fb.com --- include/linux/bpf.h | 11 --------- kernel/bpf/core.c | 12 ++------- kernel/bpf/syscall.c | 69 ++++++++-------------------------------------------- 3 files changed, 12 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 61331a148cde..a9de5711b23f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1202,8 +1202,6 @@ void bpf_prog_sub(struct bpf_prog *prog, int i); void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); -int __bpf_prog_charge(struct user_struct *user, u32 pages); -void __bpf_prog_uncharge(struct user_struct *user, u32 pages); void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len); @@ -1512,15 +1510,6 @@ bpf_prog_inc_not_zero(struct bpf_prog *prog) return ERR_PTR(-EOPNOTSUPP); } -static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) -{ - return 0; -} - -static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) -{ -} - static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2921f58c03a8..261f8692d0d2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -221,23 +221,15 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, { gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - u32 pages, delta; - int ret; + u32 pages; size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) return fp_old; - delta = pages - fp_old->pages; - ret = __bpf_prog_charge(fp_old->aux->user, delta); - if (ret) - return NULL; - fp = __vmalloc(size, gfp_flags); - if (fp == NULL) { - __bpf_prog_uncharge(fp_old->aux->user, delta); - } else { + if (fp) { memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 29096d96d989..d16dd4945100 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1632,51 +1632,6 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) audit_log_end(ab); } -int __bpf_prog_charge(struct user_struct *user, u32 pages) -{ - unsigned long memlock_limit = 
rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - unsigned long user_bufs; - - if (user) { - user_bufs = atomic_long_add_return(pages, &user->locked_vm); - if (user_bufs > memlock_limit) { - atomic_long_sub(pages, &user->locked_vm); - return -EPERM; - } - } - - return 0; -} - -void __bpf_prog_uncharge(struct user_struct *user, u32 pages) -{ - if (user) - atomic_long_sub(pages, &user->locked_vm); -} - -static int bpf_prog_charge_memlock(struct bpf_prog *prog) -{ - struct user_struct *user = get_current_user(); - int ret; - - ret = __bpf_prog_charge(user, prog->pages); - if (ret) { - free_uid(user); - return ret; - } - - prog->aux->user = user; - return 0; -} - -static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) -{ - struct user_struct *user = prog->aux->user; - - __bpf_prog_uncharge(user, prog->pages); - free_uid(user); -} - static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; @@ -1726,7 +1681,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); - bpf_prog_uncharge_memlock(aux->prog); + free_uid(aux->user); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } @@ -2164,7 +2119,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) dst_prog = bpf_prog_get(attr->attach_prog_fd); if (IS_ERR(dst_prog)) { err = PTR_ERR(dst_prog); - goto free_prog_nouncharge; + goto free_prog; } prog->aux->dst_prog = dst_prog; } @@ -2174,18 +2129,15 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) err = security_bpf_prog_alloc(prog->aux); if (err) - goto free_prog_nouncharge; - - err = bpf_prog_charge_memlock(prog); - if (err) - goto free_prog_sec; + goto free_prog; + prog->aux->user = get_current_user(); prog->len = attr->insn_cnt; err = -EFAULT; if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), bpf_prog_insn_size(prog)) != 0) - goto free_prog; + goto free_prog_sec; prog->orig_prog = NULL; prog->jited = 0; @@ -2196,19 +2148,19 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) - goto free_prog; + goto free_prog_sec; } /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) - goto free_prog; + goto free_prog_sec; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) - goto free_prog; + goto free_prog_sec; /* run eBPF verifier */ err = bpf_check(&prog, attr, uattr); @@ -2253,11 +2205,10 @@ free_used_maps: */ __bpf_prog_put_noref(prog, prog->aux->func_cnt); return err; -free_prog: - bpf_prog_uncharge_memlock(prog); free_prog_sec: + free_uid(prog->aux->user); security_bpf_prog_free(prog->aux); -free_prog_nouncharge: +free_prog: bpf_prog_free(prog); return err; } -- cgit From 608af703519a58f5a7da4273809211cac27edfb2 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 19 Nov 2020 06:09:02 +0000 Subject: libfs: Add generic function for setting dentry_ops This adds a function to set dentry operations at lookup time that will work for both encrypted filenames and casefolded filenames. A filesystem that supports both features simultaneously can use this function during lookup preparations to set up its dentry operations once fscrypt no longer does that itself. Currently the casefolding dentry operation are always set if the filesystem defines an encoding because the features is toggleable on empty directories. 
Unlike in the encryption case, the dentry operations used come from the parent. Since we don't know what set of functions we'll eventually need, and cannot change them later, we enable the casefolding operations if the filesystem supports them at all. By splitting out the various cases, we support as few dentry operations as we can get away with, maximizing compatibility with overlayfs, which will not function if a filesystem supports certain dentry_operations. Signed-off-by: Daniel Rosenberg Reviewed-by: Theodore Ts'o Reviewed-by: Eric Biggers Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jaegeuk Kim --- fs/libfs.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 71 insertions(+) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index fc34361c1489..bac918699022 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1449,4 +1449,74 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return 0; } EXPORT_SYMBOL(generic_ci_d_hash); + +static const struct dentry_operations generic_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, +}; +#endif + +#ifdef CONFIG_FS_ENCRYPTION +static const struct dentry_operations generic_encrypted_dentry_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; +#endif + +#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +static const struct dentry_operations generic_encrypted_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, + .d_revalidate = fscrypt_d_revalidate, +}; +#endif + +/** + * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry + * @dentry: dentry to set ops on + * + * Casefolded directories need d_hash and d_compare set, so that the dentries + * contained in them are handled case-insensitively. Note that these operations + * are needed on the parent directory rather than on the dentries in it, and + * while the casefolding flag can be toggled on and off on an empty directory, + * dentry_operations can't be changed later. As a result, if the filesystem has + * casefolding support enabled at all, we have to give all dentries the + * casefolding operations even if their inode doesn't have the casefolding flag + * currently (and thus the casefolding ops would be no-ops for now). + * + * Encryption works differently in that the only dentry operation it needs is + * d_revalidate, which it only needs on dentries that have the no-key name flag. + * The no-key flag can't be set "later", so we don't have to worry about that. + * + * Finally, to maximize compatibility with overlayfs (which isn't compatible + * with certain dentry operations) and to avoid taking an unnecessary + * performance hit, we use custom dentry_operations for each possible + * combination rather than always installing all operations. 
+ */ +void generic_set_encrypted_ci_d_ops(struct dentry *dentry) +{ +#ifdef CONFIG_FS_ENCRYPTION + bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; +#endif +#ifdef CONFIG_UNICODE + bool needs_ci_ops = dentry->d_sb->s_encoding; +#endif +#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) + if (needs_encrypt_ops && needs_ci_ops) { + d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); + return; + } #endif +#ifdef CONFIG_FS_ENCRYPTION + if (needs_encrypt_ops) { + d_set_d_op(dentry, &generic_encrypted_dentry_ops); + return; + } +#endif +#ifdef CONFIG_UNICODE + if (needs_ci_ops) { + d_set_d_op(dentry, &generic_ci_dentry_ops); + return; + } +#endif +} +EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); diff --git a/include/linux/fs.h b/include/linux/fs.h index 21cc971fd960..4a25ab4dbd3e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3186,6 +3186,7 @@ extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #endif +extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, -- cgit From bb9cd9106b22b4fc5ff8d78a752be8a4ba2cbba5 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 19 Nov 2020 06:09:03 +0000 Subject: fscrypt: Have filesystems handle their d_ops This shifts the responsibility of setting up dentry operations from fscrypt to the individual filesystems, allowing them to have their own operations while still setting fscrypt's d_revalidate as appropriate. Most filesystems can just use generic_set_encrypted_ci_d_ops, unless they have their own specific dentry operations as well. That operation will set the minimal d_ops required under the circumstances. Since the fscrypt d_ops are set later on, we must set all d_ops there, since we cannot adjust those later on. This should not result in any change in behavior. 
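For a filesystem that cannot use generic_set_encrypted_ci_d_ops() because it also needs dentry operations of its own, the contract stated in the updated fscrypt_prepare_lookup() comment below is to call fscrypt_d_revalidate() from its own ->d_revalidate(). A hypothetical sketch of such a wrapper ("examplefs" and its extra check are invented for illustration):

#include <linux/dcache.h>
#include <linux/fscrypt.h>

static int examplefs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        int err;

        /* Let fscrypt invalidate no-key dentries once the key is added. */
        err = fscrypt_d_revalidate(dentry, flags);
        if (err <= 0)
                return err;

        /* Filesystem-specific validity checks would follow here. */
        return 1;
}

static const struct dentry_operations examplefs_dentry_ops = {
        .d_revalidate = examplefs_d_revalidate,
};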
Signed-off-by: Daniel Rosenberg Acked-by: Theodore Ts'o Acked-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/crypto/fname.c | 4 ---- fs/crypto/fscrypt_private.h | 1 - fs/crypto/hooks.c | 1 - fs/ext4/dir.c | 7 ------- fs/ext4/ext4.h | 4 ---- fs/ext4/namei.c | 1 + fs/ext4/super.c | 5 ----- fs/f2fs/dir.c | 7 ------- fs/f2fs/f2fs.h | 3 --- fs/f2fs/namei.c | 1 + fs/f2fs/super.c | 1 - fs/ubifs/dir.c | 1 + include/linux/fscrypt.h | 7 +++++-- 13 files changed, 8 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 1fbe6c24d705..cb3cfa6329ba 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -570,7 +570,3 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return valid; } EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); - -const struct dentry_operations fscrypt_d_ops = { - .d_revalidate = fscrypt_d_revalidate, -}; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 4f5806a3b73d..df9c48c1fbf7 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -294,7 +294,6 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); -extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 20b0df47fe6a..9006fa983335 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -117,7 +117,6 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); - d_set_d_op(dentry, &fscrypt_d_ops); } return err; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ca50c90adc4c..e757319a4472 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -667,10 +667,3 @@ const struct file_operations ext4_dir_operations = { .open = ext4_dir_open, .release = ext4_release_dir, }; - -#ifdef CONFIG_UNICODE -const struct dentry_operations ext4_dentry_ops = { - .d_hash = generic_ci_d_hash, - .d_compare = generic_ci_d_compare, -}; -#endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 45fcdbf538d1..983f2b970d6a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3354,10 +3354,6 @@ static inline void ext4_unlock_group(struct super_block *sb, /* dir.c */ extern const struct file_operations ext4_dir_operations; -#ifdef CONFIG_UNICODE -extern const struct dentry_operations ext4_dentry_ops; -#endif - /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f458d1d81d96..8e2398e5d0fe 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1614,6 +1614,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ef4734b40e2a..82b365acedf8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4974,11 +4974,6 @@ no_journal: goto failed_mount4; } -#ifdef CONFIG_UNICODE - if (sb->s_encoding) - sb->s_d_op = &ext4_dentry_ops; -#endif - sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4b9ef8bbfa4a..71fdf5076461 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1099,10 
+1099,3 @@ const struct file_operations f2fs_dir_operations = { .compat_ioctl = f2fs_compat_ioctl, #endif }; - -#ifdef CONFIG_UNICODE -const struct dentry_operations f2fs_dentry_ops = { - .d_hash = generic_ci_d_hash, - .d_compare = generic_ci_d_compare, -}; -#endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 99bcf4b44a9c..01fd42843e49 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3688,9 +3688,6 @@ static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; -#ifdef CONFIG_UNICODE -extern const struct dentry_operations f2fs_dentry_ops; -#endif extern const struct file_operations f2fs_file_operations; extern const struct inode_operations f2fs_file_inode_operations; extern const struct address_space_operations f2fs_dblock_aops; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 8fa37d1434de..6edb1ab579a1 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -497,6 +497,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } err = f2fs_prepare_lookup(dir, dentry, &fname); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) goto out_splice; if (err) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0ec292d7fcdb..08e63c8caa1e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3426,7 +3426,6 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) sbi->sb->s_encoding = encoding; sbi->sb->s_encoding_flags = encoding_flags; - sbi->sb->s_d_op = &f2fs_dentry_ops; } #else if (f2fs_sb_has_casefold(sbi)) { diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 155521e51ac5..7a920434d741 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -203,6 +203,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); err = fscrypt_prepare_lookup(dir, dentry, &nm); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return d_splice_alias(NULL, dentry); if (err) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index a8f7a43f031b..e72f80482671 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -741,8 +741,11 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir, * directory's encryption key is available, then the lookup is assumed to be by * plaintext name; otherwise, it is assumed to be by no-key name. * - * This also installs a custom ->d_revalidate() method which will invalidate the - * dentry if it was created without the key and the key is later added. + * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key + * name. In this case the filesystem must assign the dentry a dentry_operations + * which contains fscrypt_d_revalidate (or contains a d_revalidate method that + * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the + * directory's encryption key is later added. * * Return: 0 on success; -ENOENT if the directory's key is unavailable but the * filename isn't a valid no-key name, so a negative dentry should be created; -- cgit From b28f047b28c51d0b9864c34b097bb0b221ea7247 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Nov 2020 18:32:09 +0800 Subject: f2fs: compress: support chksum This patch supports to store chksum value with compressed data, and verify the integrality of compressed data while reading the data. The feature can be enabled through specifying mount option 'compress_chksum'. 
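The verify-on-read flow added below reduces to recomputing a checksum over the compressed bytes and comparing it with the copy stored next to them. A userspace model of that flow (the struct mirrors the compress_data layout from this patch; a toy hash stands in for f2fs_crc32() and endian conversion is elided):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define COMPRESS_DATA_RESERVED_SIZE 4

struct compress_data {                  /* mirrors the fs/f2fs/f2fs.h layout */
        uint32_t clen;                  /* compressed data size */
        uint32_t chksum;                /* compressed data chksum */
        uint32_t reserved[COMPRESS_DATA_RESERVED_SIZE];
        uint8_t cdata[64];              /* flexible array member in f2fs */
};

static uint32_t toy_hash(const uint8_t *p, uint32_t len)   /* f2fs_crc32() stand-in */
{
        uint32_t h = 5381;

        while (len--)
                h = (h << 5) + h + *p++;
        return h;
}

int main(void)
{
        struct compress_data cd = { 0 };

        /* Write side: store the checksum beside the compressed payload. */
        memcpy(cd.cdata, "compressed-bytes", 16);
        cd.clen = 16;
        cd.chksum = toy_hash(cd.cdata, cd.clen);

        /* Read side: recompute and compare before trusting the payload. */
        uint32_t provided = cd.chksum;
        uint32_t calculated = toy_hash(cd.cdata, cd.clen);

        if (provided != calculated) {
                fprintf(stderr, "checksum invalid, %x vs %x\n", provided, calculated);
                return 1;
        }
        puts("checksum ok");
        return 0;
}

Note that on a mismatch the kernel hunk below only warns (printk_ratelimited) and sets SBI_NEED_FSCK rather than turning the read into an I/O error.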
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/compress.c | 23 +++++++++++++++++++++++ fs/f2fs/f2fs.h | 16 ++++++++++++++-- fs/f2fs/inode.c | 3 +++ fs/f2fs/super.c | 9 +++++++++ include/linux/f2fs_fs.h | 2 +- 6 files changed, 51 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index b8ee761c9922..985ae7d35066 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -260,6 +260,7 @@ compress_extension=%s Support adding specified extension, so that f2fs can enab For other files, we can still enable compression via ioctl. Note that, there is one reserved special extension '*', it can be set to enable compression for all files. +compress_chksum Support verifying chksum of raw data in compressed cluster. inlinecrypt When possible, encrypt/decrypt the contents of encrypted files using the blk-crypto framework rather than filesystem-layer encryption. This allows the use of diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 14262e0f1cd6..7ec1592a0973 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -602,6 +602,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; struct page **new_cpages; + u32 chksum = 0; int i, ret; trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, @@ -655,6 +656,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->cbuf->clen = cpu_to_le32(cc->clen); + if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM) + chksum = f2fs_crc32(F2FS_I_SB(cc->inode), + cc->cbuf->cdata, cc->clen); + cc->cbuf->chksum = cpu_to_le32(chksum); + for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); @@ -790,6 +796,23 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) ret = cops->decompress_pages(dic); + if (!ret && fi->i_compress_flag & 1 << COMPRESS_CHKSUM) { + u32 provided = le32_to_cpu(dic->cbuf->chksum); + u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); + + if (provided != calculated) { + if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { + set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); + printk_ratelimited( + "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x", + KERN_INFO, sbi->sb->s_id, dic->inode->i_ino, + provided, calculated); + } + set_sbi_flag(sbi, SBI_NEED_FSCK); + WARN_ON_ONCE(1); + } + } + out_vunmap_cbuf: vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0d25f5ca5618..0b314b2034d8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -147,7 +147,8 @@ struct f2fs_mount_info { /* For compression */ unsigned char compress_algorithm; /* algorithm type */ - unsigned compress_log_size; /* cluster log size */ + unsigned char compress_log_size; /* cluster log size */ + bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; @@ -676,6 +677,7 @@ enum { FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ FI_MMAP_FILE, /* indicate file was mmapped */ FI_MAX, /* max flag, never be used */ }; @@ -733,6 +735,7 @@ struct 
f2fs_inode_info { atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ }; @@ -1272,9 +1275,15 @@ enum compress_algorithm_type { COMPRESS_MAX, }; -#define COMPRESS_DATA_RESERVED_SIZE 5 +enum compress_flag { + COMPRESS_CHKSUM, + COMPRESS_MAX_FLAG, +}; + +#define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ + __le32 chksum; /* compressed data chksum */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; @@ -3888,6 +3897,9 @@ static inline void set_compress_context(struct inode *inode) F2FS_OPTION(sbi).compress_algorithm; F2FS_I(inode)->i_log_cluster_size = F2FS_OPTION(sbi).compress_log_size; + F2FS_I(inode)->i_compress_flag = + F2FS_OPTION(sbi).compress_chksum ? + 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 657db2fb6739..349d9cb933ee 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -456,6 +456,7 @@ static int do_read_inode(struct inode *inode) le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; fi->i_log_cluster_size = ri->i_log_cluster_size; + fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag); fi->i_cluster_size = 1 << fi->i_log_cluster_size; set_inode_flag(inode, FI_COMPRESSED_FILE); } @@ -634,6 +635,8 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; + ri->i_compress_flag = + cpu_to_le16(F2FS_I(inode)->i_compress_flag); ri->i_log_cluster_size = F2FS_I(inode)->i_log_cluster_size; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 82baaa89c893..f3d919ee4dee 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -146,6 +146,7 @@ enum { Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_compress_chksum, Opt_atgc, Opt_err, }; @@ -214,6 +215,7 @@ static match_table_t f2fs_tokens = { {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_compress_chksum, "compress_chksum"}, {Opt_atgc, "atgc"}, {Opt_err, NULL}, }; @@ -934,10 +936,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; + case Opt_compress_chksum: + F2FS_OPTION(sbi).compress_chksum = true; + break; #else case Opt_compress_algorithm: case Opt_compress_log_size: case Opt_compress_extension: + case Opt_compress_chksum: f2fs_info(sbi, "compression options not supported"); break; #endif @@ -1523,6 +1529,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, seq_printf(seq, ",compress_extension=%s", F2FS_OPTION(sbi).extensions[i]); } + + if (F2FS_OPTION(sbi).compress_chksum) + seq_puts(seq, ",compress_chksum"); } static int f2fs_show_options(struct seq_file *seq, struct dentry *root) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index a5dbb57a687f..7dc2a06cf19a 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -273,7 +273,7 @@ struct f2fs_inode { __le64 i_compr_blocks; /* # of compressed blocks */ __u8 i_compress_algorithm; /* compress algorithm */ __u8 
i_log_cluster_size; /* log of cluster size */ - __le16 i_padding; /* padding */ + __le16 i_compress_flag; /* compress flag */ __le32 i_extra_end[0]; /* for attribute size calculation */ } __packed; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ -- cgit From 39be39ceffd572baddfeff8b50aba931d3d6d785 Mon Sep 17 00:00:00 2001 From: Andrzej Pietrasiewicz Date: Sun, 4 Oct 2020 21:15:46 -0700 Subject: Input: add input_device_enabled() A helper function for drivers to decide if the device is used or not. A lockdep check is introduced as inspecting ->users should be done under input device's mutex. Signed-off-by: Andrzej Pietrasiewicz Link: https://lore.kernel.org/r/20200608112211.12125-2-andrzej.p@collabora.com Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 8 ++++++++ include/linux/input.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/input/input.c b/drivers/input/input.c index 3cfd2c18eebd..41377bfa142d 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -2127,6 +2127,14 @@ void input_enable_softrepeat(struct input_dev *dev, int delay, int period) } EXPORT_SYMBOL(input_enable_softrepeat); +bool input_device_enabled(struct input_dev *dev) +{ + lockdep_assert_held(&dev->mutex); + + return dev->users > 0; +} +EXPORT_SYMBOL_GPL(input_device_enabled); + /** * input_register_device - register device with input core * @dev: device to be registered diff --git a/include/linux/input.h b/include/linux/input.h index 56f2fd32e609..eda4587dba67 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -502,6 +502,8 @@ bool input_match_device_id(const struct input_dev *dev, void input_enable_softrepeat(struct input_dev *dev, int delay, int period); +bool input_device_enabled(struct input_dev *dev); + extern struct class input_class; /** -- cgit From a181616487dbdbc953e476d1da15365f887859ed Mon Sep 17 00:00:00 2001 From: Patrik Fimml Date: Wed, 2 Dec 2020 14:42:04 -0800 Subject: Input: Add "inhibited" property Userspace might want to implement a policy to temporarily disregard input from certain devices, including not treating them as wakeup sources. An example use case is a laptop, whose keyboard can be folded under the screen to create tablet-like experience. The user then must hold the laptop in such a way that it is difficult to avoid pressing the keyboard keys. It is therefore desirable to temporarily disregard input from the keyboard, until it is folded back. This obviously is a policy which should be kept out of the kernel, but the kernel must provide suitable means to implement such a policy. This patch adds a sysfs interface for exactly this purpose. To implement the said interface it adds an "inhibited" property to struct input_dev, and effectively creates four states a device can be in: closed uninhibited, closed inhibited, open uninhibited, open inhibited. It also defers calling driver's ->open() and ->close() to until they are actually needed, e.g. it makes no sense to prepare the underlying device for generating events (->open()) if the device is inhibited. uninhibit closed <------------ closed uninhibited ------------> inhibited | ^ inhibit | ^ 1st | | 1st | | open | | open | | | | | | | | last | | last | | close | | close v | uninhibit v | open <------------ open uninhibited ------------> inhibited The top inhibit/uninhibit transition happens when users == 0. The bottom inhibit/uninhibit transition happens when users > 0. The left open/close transition happens when !inhibited. 
The right open/close transition happens when inhibited. Due to all transitions being serialized with dev->mutex, it is impossible to have "diagonal" transitions between closed uninhibited and open inhibited or between open uninhibited and closed inhibited. No new callbacks are added to drivers, because their open() and close() serve exactly the purpose to tell the driver to start/stop providing events to the input core. Consequently, open() and close() - if provided - are called in both inhibit and uninhibit paths. Signed-off-by: Patrik Fimml Co-developed-by: Andrzej Pietrasiewicz Signed-off-by: Andrzej Pietrasiewicz Link: https://lore.kernel.org/r/20200608112211.12125-8-andrzej.p@collabora.com Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++---- include/linux/input.h | 12 +++++- 2 files changed, 115 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/input.c b/drivers/input/input.c index 41377bfa142d..ccaeb2426385 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -367,8 +367,13 @@ static int input_get_disposition(struct input_dev *dev, static void input_handle_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { - int disposition = input_get_disposition(dev, type, code, &value); + int disposition; + /* filter-out events from inhibited devices */ + if (dev->inhibited) + return; + + disposition = input_get_disposition(dev, type, code, &value); if (disposition != INPUT_IGNORE_EVENT && type != EV_SYN) add_input_randomness(type, code, value); @@ -612,10 +617,10 @@ int input_open_device(struct input_handle *handle) handle->open++; - if (dev->users++) { + if (dev->users++ || dev->inhibited) { /* - * Device is already opened, so we can exit immediately and - * report success. + * Device is already opened and/or inhibited, + * so we can exit immediately and report success. 
*/ goto out; } @@ -675,10 +680,9 @@ void input_close_device(struct input_handle *handle) __input_release_device(handle); - if (!--dev->users) { + if (!dev->inhibited && !--dev->users) { if (dev->poller) input_dev_poller_stop(dev->poller); - if (dev->close) dev->close(dev); } @@ -1416,12 +1420,49 @@ static ssize_t input_dev_show_properties(struct device *dev, } static DEVICE_ATTR(properties, S_IRUGO, input_dev_show_properties, NULL); +static int input_inhibit_device(struct input_dev *dev); +static int input_uninhibit_device(struct input_dev *dev); + +static ssize_t inhibited_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct input_dev *input_dev = to_input_dev(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", input_dev->inhibited); +} + +static ssize_t inhibited_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t len) +{ + struct input_dev *input_dev = to_input_dev(dev); + ssize_t rv; + bool inhibited; + + if (strtobool(buf, &inhibited)) + return -EINVAL; + + if (inhibited) + rv = input_inhibit_device(input_dev); + else + rv = input_uninhibit_device(input_dev); + + if (rv != 0) + return rv; + + return len; +} + +static DEVICE_ATTR_RW(inhibited); + static struct attribute *input_dev_attrs[] = { &dev_attr_name.attr, &dev_attr_phys.attr, &dev_attr_uniq.attr, &dev_attr_modalias.attr, &dev_attr_properties.attr, + &dev_attr_inhibited.attr, NULL }; @@ -1703,6 +1744,63 @@ void input_reset_device(struct input_dev *dev) } EXPORT_SYMBOL(input_reset_device); +static int input_inhibit_device(struct input_dev *dev) +{ + int ret = 0; + + mutex_lock(&dev->mutex); + + if (dev->inhibited) + goto out; + + if (dev->users) { + if (dev->close) + dev->close(dev); + if (dev->poller) + input_dev_poller_stop(dev->poller); + } + + spin_lock_irq(&dev->event_lock); + input_dev_release_keys(dev); + input_dev_toggle(dev, false); + spin_unlock_irq(&dev->event_lock); + + dev->inhibited = true; + +out: + mutex_unlock(&dev->mutex); + return ret; +} + +static int input_uninhibit_device(struct input_dev *dev) +{ + int ret = 0; + + mutex_lock(&dev->mutex); + + if (!dev->inhibited) + goto out; + + if (dev->users) { + if (dev->open) { + ret = dev->open(dev); + if (ret) + goto out; + } + if (dev->poller) + input_dev_poller_start(dev->poller); + } + + dev->inhibited = false; + spin_lock_irq(&dev->event_lock); + input_dev_toggle(dev, true); + spin_unlock_irq(&dev->event_lock); + +out: + mutex_unlock(&dev->mutex); + return ret; +} + #ifdef CONFIG_PM_SLEEP static int input_dev_suspend(struct device *dev) { @@ -2131,7 +2229,7 @@ bool input_device_enabled(struct input_dev *dev) { lockdep_assert_held(&dev->mutex); - return dev->users > 0; + return !dev->inhibited && dev->users > 0; } EXPORT_SYMBOL_GPL(input_device_enabled); diff --git a/include/linux/input.h b/include/linux/input.h index eda4587dba67..0354b298d874 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -90,9 +90,11 @@ enum input_clock_type { * @open: this method is called when the very first user calls * input_open_device(). The driver must prepare the device * to start generating events (start polling thread, - * request an IRQ, submit URB, etc.) + * request an IRQ, submit URB, etc.). The meaning of open() is + * to start providing events to the input core. * @close: this method is called when the very last user calls - * input_close_device(). + * input_close_device(). The meaning of close() is to stop + * providing events to the input core. * @flush: purges the device. 
Most commonly used to get rid of force * feedback effects loaded into the device when disconnecting * from it @@ -127,6 +129,10 @@ enum input_clock_type { * and needs not be explicitly unregistered or freed. * @timestamp: storage for a timestamp set by input_set_timestamp called * by a driver + * @inhibited: indicates that the input device is inhibited. If that is + * the case then input core ignores any events generated by the device. + * Device's close() is called when it is being inhibited and its open() + * is called when it is being uninhibited. */ struct input_dev { const char *name; @@ -201,6 +207,8 @@ struct input_dev { bool devres_managed; ktime_t timestamp[INPUT_CLK_MAX]; + + bool inhibited; }; #define to_input_dev(d) container_of(d, struct input_dev, dev) -- cgit From 41a340941854c4606a9b71b9d68db412747e7c84 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 2 Dec 2020 15:13:26 +0100 Subject: media: coda: Convert the driver to DT-only Since 5.10-rc1 i.MX is a devicetree-only platform, so simplify the code by removing the unused non-DT support. Signed-off-by: Fabio Estevam Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/Kconfig | 2 +- drivers/media/platform/coda/coda-common.c | 27 ++------------------------- include/linux/platform_data/media/coda.h | 14 -------------- 3 files changed, 3 insertions(+), 40 deletions(-) delete mode 100644 include/linux/platform_data/media/coda.h (limited to 'include/linux') diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig index ffffef2267f4..9fdbfea06087 100644 --- a/drivers/media/platform/Kconfig +++ b/drivers/media/platform/Kconfig @@ -201,7 +201,7 @@ if V4L_MEM2MEM_DRIVERS config VIDEO_CODA tristate "Chips&Media Coda multi-standard codec IP" - depends on VIDEO_DEV && VIDEO_V4L2 && (ARCH_MXC || COMPILE_TEST) + depends on VIDEO_DEV && VIDEO_V4L2 && OF && (ARCH_MXC || COMPILE_TEST) select SRAM select VIDEOBUF2_DMA_CONTIG select VIDEOBUF2_VMALLOC diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c index d30eafea701d..995e95272e51 100644 --- a/drivers/media/platform/coda/coda-common.c +++ b/drivers/media/platform/coda/coda-common.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -3102,13 +3101,6 @@ static const struct coda_devtype coda_devdata[] = { }, }; -static const struct platform_device_id coda_platform_ids[] = { - { .name = "coda-imx27", .driver_data = CODA_IMX27 }, - { /* sentinel */ } -}; -MODULE_DEVICE_TABLE(platform, coda_platform_ids); - -#ifdef CONFIG_OF static const struct of_device_id coda_dt_ids[] = { { .compatible = "fsl,imx27-vpu", .data = &coda_devdata[CODA_IMX27] }, { .compatible = "fsl,imx51-vpu", .data = &coda_devdata[CODA_IMX51] }, @@ -3118,14 +3110,9 @@ static const struct of_device_id coda_dt_ids[] = { { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, coda_dt_ids); -#endif static int coda_probe(struct platform_device *pdev) { - const struct of_device_id *of_id = - of_match_device(of_match_ptr(coda_dt_ids), &pdev->dev); - const struct platform_device_id *pdev_id; - struct coda_platform_data *pdata = pdev->dev.platform_data; struct device_node *np = pdev->dev.of_node; struct gen_pool *pool; struct coda_dev *dev; @@ -3135,14 +3122,7 @@ static int coda_probe(struct platform_device *pdev) if (!dev) return -ENOMEM; - pdev_id = of_id ? 
of_id->data : platform_get_device_id(pdev); - - if (of_id) - dev->devtype = of_id->data; - else if (pdev_id) - dev->devtype = &coda_devdata[pdev_id->driver_data]; - else - return -EINVAL; + dev->devtype = of_device_get_match_data(&pdev->dev); dev->dev = &pdev->dev; dev->clk_per = devm_clk_get(&pdev->dev, "per"); @@ -3200,10 +3180,8 @@ static int coda_probe(struct platform_device *pdev) return ret; } - /* Get IRAM pool from device tree or platform data */ + /* Get IRAM pool from device tree */ pool = of_gen_pool_get(np, "iram", 0); - if (!pool && pdata) - pool = gen_pool_get(pdata->iram_dev, NULL); if (!pool) { dev_err(&pdev->dev, "iram pool not available\n"); return -ENOMEM; @@ -3342,7 +3320,6 @@ static struct platform_driver coda_driver = { .of_match_table = of_match_ptr(coda_dt_ids), .pm = &coda_pm_ops, }, - .id_table = coda_platform_ids, }; module_platform_driver(coda_driver); diff --git a/include/linux/platform_data/media/coda.h b/include/linux/platform_data/media/coda.h deleted file mode 100644 index 293b61b60c9d..000000000000 --- a/include/linux/platform_data/media/coda.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2013 Philipp Zabel, Pengutronix - */ -#ifndef PLATFORM_CODA_H -#define PLATFORM_CODA_H - -struct device; - -struct coda_platform_data { - struct device *iram_dev; -}; - -#endif -- cgit From 2a4a06da8a4b93dd189171eed7a99fffd38f42f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Nov 2020 11:41:40 +0100 Subject: mm/gup: Provide gup_get_pte() more generic In order to write another lockless page-table walker, we need gup_get_pte() exposed. While doing that, rename it to ptep_get_lockless() to match the existing ptep_get() naming. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201126121121.036370527@infradead.org --- include/linux/pgtable.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++ mm/gup.c | 58 +------------------------------------------------ 2 files changed, 56 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 71125a4676c4..ed9266cc115b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -258,6 +258,61 @@ static inline pte_t ptep_get(pte_t *ptep) } #endif +#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH +/* + * WARNING: only to be used in the get_user_pages_fast() implementation. + * + * With get_user_pages_fast(), we walk down the pagetables without taking any + * locks. For this we would like to load the pointers atomically, but sometimes + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What + * we do have is the guarantee that a PTE will only either go from not present + * to present, or present to not present or both -- it will not switch to a + * completely different present page without a TLB flush in between; something + * that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. + * We load pte_high *after* loading pte_low, which ensures we don't see an older + * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't + * picked up a changed pte high. 
We might have gotten rubbish values from + * pte_low and pte_high, but we are guaranteed that pte_low will not have the + * present bit set *unless* it is 'l'. Because get_user_pages_fast() only + * operates on present ptes we're safe. + */ +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + pte_t pte; + + do { + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + } while (unlikely(pte.pte_low != ptep->pte_low)); + + return pte; +} +#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ +/* + * We require that the PTE can be read atomically. + */ +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + return ptep_get(ptep); +} +#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, diff --git a/mm/gup.c b/mm/gup.c index 98eb8e6d2609..44b0c6b89602 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2085,62 +2085,6 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) put_page(page); } -#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH - -/* - * WARNING: only to be used in the get_user_pages_fast() implementation. - * - * With get_user_pages_fast(), we walk down the pagetables without taking any - * locks. For this we would like to load the pointers atomically, but sometimes - * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What - * we do have is the guarantee that a PTE will only either go from not present - * to present, or present to not present or both -- it will not switch to a - * completely different present page without a TLB flush in between; something - * that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. - * We load pte_high *after* loading pte_low, which ensures we don't see an older - * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't - * picked up a changed pte high. We might have gotten rubbish values from - * pte_low and pte_high, but we are guaranteed that pte_low will not have the - * present bit set *unless* it is 'l'. Because get_user_pages_fast() only - * operates on present ptes we're safe. - */ -static inline pte_t gup_get_pte(pte_t *ptep) -{ - pte_t pte; - - do { - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - } while (unlikely(pte.pte_low != ptep->pte_low)); - - return pte; -} -#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ -/* - * We require that the PTE can be read atomically. - */ -static inline pte_t gup_get_pte(pte_t *ptep) -{ - return ptep_get(ptep); -} -#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ - static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) @@ -2166,7 +2110,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, ptem = ptep = pte_offset_map(&pmd, addr); do { - pte_t pte = gup_get_pte(ptep); + pte_t pte = ptep_get_lockless(ptep); struct page *head, *page; /* -- cgit From 560dabbdf68bb15f9e241af8f828b1c8c38d6c6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Nov 2020 11:45:36 +0100 Subject: mm: Introduce pXX_leaf_size() A number of architectures have non-pagetable aligned huge/large pages. 
For such architectures a leaf can actually be part of a larger entry. Provide generic helpers to determine the size of a page-table leaf. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matthew Wilcox (Oracle) Link: https://lkml.kernel.org/r/20201126121121.102580109@infradead.org --- include/linux/pgtable.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index ed9266cc115b..fefbbdbbb3f3 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1536,4 +1536,20 @@ typedef unsigned int pgtbl_mod_mask; #define pmd_leaf(x) 0 #endif +#ifndef pgd_leaf_size +#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT) +#endif +#ifndef p4d_leaf_size +#define p4d_leaf_size(x) P4D_SIZE +#endif +#ifndef pud_leaf_size +#define pud_leaf_size(x) PUD_SIZE +#endif +#ifndef pmd_leaf_size +#define pmd_leaf_size(x) PMD_SIZE +#endif +#ifndef pte_leaf_size +#define pte_leaf_size(x) PAGE_SIZE +#endif + #endif /* _LINUX_PGTABLE_H */ -- cgit From a07c45312f06e288417049208c344ad76074627d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:50:38 +0100 Subject: seqlock: avoid -Wshadow warnings When building with W=2, there is a flood of warnings about the seqlock macros shadowing local variables: 19806 linux/seqlock.h:331:11: warning: declaration of 'seq' shadows a previous local [-Wshadow] 48 linux/seqlock.h:348:11: warning: declaration of 'seq' shadows a previous local [-Wshadow] 8 linux/seqlock.h:379:11: warning: declaration of 'seq' shadows a previous local [-Wshadow] Prefix the local variables to make the warning useful elsewhere again. Fixes: 52ac39e5db51 ("seqlock: seqcount_t: Implement all read APIs as statement expressions") Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201026165044.3722931-1-arnd@kernel.org --- include/linux/seqlock.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index cbfc78b92b65..8d8552474c64 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -328,13 +328,13 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu */ #define __read_seqcount_begin(s) \ ({ \ - unsigned seq; \ + unsigned __seq; \ \ - while ((seq = __seqcount_sequence(s)) & 1) \ + while ((__seq = __seqcount_sequence(s)) & 1) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ - seq; \ + __seq; \ }) /** @@ -345,10 +345,10 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu */ #define raw_read_seqcount_begin(s) \ ({ \ - unsigned seq = __read_seqcount_begin(s); \ + unsigned _seq = __read_seqcount_begin(s); \ \ smp_rmb(); \ - seq; \ + _seq; \ }) /** @@ -376,11 +376,11 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu */ #define raw_read_seqcount(s) \ ({ \ - unsigned seq = __seqcount_sequence(s); \ + unsigned __seq = __seqcount_sequence(s); \ \ smp_rmb(); \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ - seq; \ + __seq; \ }) /** -- cgit From ab440b2c604b60fe90885270fcfeb5c3dd5d6fae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Nov 2020 13:44:17 +0100 Subject: seqlock: Rename __seqprop() users More consistent naming should make it easier to untangle the _Generic token pasting maze called __seqprop(). 
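For reference, __seqprop() selects a per-lock-type helper from the static type of its argument using C11 _Generic plus token pasting; a simplified sketch of the pattern (cut down to two lock types, not the kernel's full macro):

#define __seqprop_case(s, lockname, prop)				\
	seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s))

#define __seqprop(s, prop) _Generic(*(s),				\
	seqcount_t:	__seqprop_##prop((void *)(s)),			\
	__seqprop_case((s), spinlock, prop),				\
	__seqprop_case((s), mutex, prop))

The patch below only renames the callers of this maze, e.g. __seqcount_ptr(s) becomes seqprop_ptr(s); the dispatch itself is unchanged.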
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201110115358.GE2594@hirez.programming.kicks-ass.net --- include/linux/seqlock.h | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 8d8552474c64..d89134c74fba 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -307,10 +307,10 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu __seqprop_case((s), mutex, prop), \ __seqprop_case((s), ww_mutex, prop)) -#define __seqcount_ptr(s) __seqprop(s, ptr) -#define __seqcount_sequence(s) __seqprop(s, sequence) -#define __seqcount_lock_preemptible(s) __seqprop(s, preemptible) -#define __seqcount_assert_lock_held(s) __seqprop(s, assert) +#define seqprop_ptr(s) __seqprop(s, ptr) +#define seqprop_sequence(s) __seqprop(s, sequence) +#define seqprop_preemptible(s) __seqprop(s, preemptible) +#define seqprop_assert(s) __seqprop(s, assert) /** * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier @@ -330,7 +330,7 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu ({ \ unsigned __seq; \ \ - while ((__seq = __seqcount_sequence(s)) & 1) \ + while ((__seq = seqprop_sequence(s)) & 1) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ @@ -359,7 +359,7 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu */ #define read_seqcount_begin(s) \ ({ \ - seqcount_lockdep_reader_access(__seqcount_ptr(s)); \ + seqcount_lockdep_reader_access(seqprop_ptr(s)); \ raw_read_seqcount_begin(s); \ }) @@ -376,7 +376,7 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu */ #define raw_read_seqcount(s) \ ({ \ - unsigned __seq = __seqcount_sequence(s); \ + unsigned __seq = seqprop_sequence(s); \ \ smp_rmb(); \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ @@ -425,7 +425,7 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ - __read_seqcount_t_retry(__seqcount_ptr(s), start) + __read_seqcount_t_retry(seqprop_ptr(s), start) static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) { @@ -445,7 +445,7 @@ static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ - read_seqcount_t_retry(__seqcount_ptr(s), start) + read_seqcount_t_retry(seqprop_ptr(s), start) static inline int read_seqcount_t_retry(const seqcount_t *s, unsigned start) { @@ -459,10 +459,10 @@ static inline int read_seqcount_t_retry(const seqcount_t *s, unsigned start) */ #define raw_write_seqcount_begin(s) \ do { \ - if (__seqcount_lock_preemptible(s)) \ + if (seqprop_preemptible(s)) \ preempt_disable(); \ \ - raw_write_seqcount_t_begin(__seqcount_ptr(s)); \ + raw_write_seqcount_t_begin(seqprop_ptr(s)); \ } while (0) static inline void raw_write_seqcount_t_begin(seqcount_t *s) @@ -478,9 +478,9 @@ static inline void raw_write_seqcount_t_begin(seqcount_t *s) */ #define raw_write_seqcount_end(s) \ do { \ - raw_write_seqcount_t_end(__seqcount_ptr(s)); \ + raw_write_seqcount_t_end(seqprop_ptr(s)); \ \ - if (__seqcount_lock_preemptible(s)) \ + if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) @@ -501,12 +501,12 @@ static inline void raw_write_seqcount_t_end(seqcount_t *s) */ #define 
write_seqcount_begin_nested(s, subclass)	\
do {						\
-	__seqcount_assert_lock_held(s);		\
+	seqprop_assert(s);			\
						\
-	if (__seqcount_lock_preemptible(s))	\
+	if (seqprop_preemptible(s))		\
		preempt_disable();		\
						\
-	write_seqcount_t_begin_nested(__seqcount_ptr(s), subclass);	\
+	write_seqcount_t_begin_nested(seqprop_ptr(s), subclass);	\
} while (0)

static inline void write_seqcount_t_begin_nested(seqcount_t *s, int subclass)
@@ -528,12 +528,12 @@ static inline void write_seqcount_t_begin_nested(seqcount_t *s, int subclass)
 */
#define write_seqcount_begin(s)			\
do {						\
-	__seqcount_assert_lock_held(s);		\
+	seqprop_assert(s);			\
						\
-	if (__seqcount_lock_preemptible(s))	\
+	if (seqprop_preemptible(s))		\
		preempt_disable();		\
						\
-	write_seqcount_t_begin(__seqcount_ptr(s));	\
+	write_seqcount_t_begin(seqprop_ptr(s));	\
} while (0)

static inline void write_seqcount_t_begin(seqcount_t *s)
@@ -549,9 +549,9 @@ static inline void write_seqcount_t_begin(seqcount_t *s)
 */
#define write_seqcount_end(s)			\
do {						\
-	write_seqcount_t_end(__seqcount_ptr(s));	\
+	write_seqcount_t_end(seqprop_ptr(s));	\
						\
-	if (__seqcount_lock_preemptible(s))	\
+	if (seqprop_preemptible(s))		\
		preempt_enable();		\
} while (0)

@@ -603,7 +603,7 @@ static inline void write_seqcount_t_end(seqcount_t *s)
 * }
 */
#define raw_write_seqcount_barrier(s)		\
-	raw_write_seqcount_t_barrier(__seqcount_ptr(s))
+	raw_write_seqcount_t_barrier(seqprop_ptr(s))

static inline void raw_write_seqcount_t_barrier(seqcount_t *s)
{
@@ -623,7 +623,7 @@ static inline void raw_write_seqcount_t_barrier(seqcount_t *s)
 * will complete successfully and see data older than this.
 */
#define write_seqcount_invalidate(s)		\
-	write_seqcount_t_invalidate(__seqcount_ptr(s))
+	write_seqcount_t_invalidate(seqprop_ptr(s))

static inline void write_seqcount_t_invalidate(seqcount_t *s)
{
-- cgit
From b6498aad59b091e5618a9f05e7636e2ad2c6732d Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab
Date: Tue, 1 Dec 2020 13:09:00 +0100
Subject: completion: Drop init_completion define

Changeset cd8084f91c02 ("locking/lockdep: Apply crossrelease to completions") added CONFIG_LOCKDEP_COMPLETE (later renamed to CONFIG_LOCKDEP_COMPLETIONS). That changeset renamed init_completion() and added a macro that would run either a modified version or the original code. However, the cross-release code reported too many false positives, so it was later dropped by changeset e966eaeeb623 ("locking/lockdep: Remove the cross-release locking checks"). Yet the define remained there as just:

#define init_completion(x) __init_completion(x)

Get rid of the define and return the __init_completion() function to its original name.
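Callers are unaffected by the rename and keep using init_completion() as before; a minimal usage sketch (hypothetical caller, for illustration only):

	struct completion done;

	init_completion(&done);	/* now a plain inline function, not a define */
	/* ... start asynchronous work that eventually calls complete(&done) ... */
	wait_for_completion(&done);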
Fixes: e966eaeeb623 ("locking/lockdep: Remove the cross-release locking checks") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/e657bfc533545c185b1c3c55926a449ead56a88b.1606823973.git.mchehab+huawei@kernel.org --- include/linux/completion.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/completion.h b/include/linux/completion.h index bf8e77001f18..51d9ab079629 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -28,8 +28,7 @@ struct completion { struct swait_queue_head wait; }; -#define init_completion_map(x, m) __init_completion(x) -#define init_completion(x) __init_completion(x) +#define init_completion_map(x, m) init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} @@ -82,7 +81,7 @@ static inline void complete_release(struct completion *x) {} * This inline function will initialize a dynamically created completion * structure. */ -static inline void __init_completion(struct completion *x) +static inline void init_completion(struct completion *x) { x->done = 0; init_swait_queue_head(&x->wait); -- cgit From 97d62caa32d6d79dadae3f8d19af5c92ea9a589a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 1 Dec 2020 13:09:08 +0100 Subject: refcount: Fix a kernel-doc markup The kernel-doc markup is wrong: it is asking the tool to document struct refcount_struct, instead of documenting typedef refcount_t. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Peter Zijlstra (Intel) Acked-by: Kees Cook Link: https://lkml.kernel.org/r/afb9bb1e675bf5f72a34a55d780779d7d5916b4c.1606823973.git.mchehab+huawei@kernel.org --- include/linux/refcount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 497990c69b0b..b8a6e387f8f9 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -101,7 +101,7 @@ struct mutex; /** - * struct refcount_t - variant of atomic_t specialized for reference counts + * typedef refcount_t - variant of atomic_t specialized for reference counts * @refs: atomic_t counter field * * The counter saturates at REFCOUNT_SATURATED and will not move once -- cgit From 950cc0d2bef078e1f6459900ca4d4b2a2e0e3c37 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 2 Dec 2020 14:07:07 +0200 Subject: fsnotify: generalize handle_inode_event() The handle_inode_event() interface was added as (quoting comment): "a simple variant of handle_event() for groups that only have inode marks and don't have ignore mask". In other words, all backends except fanotify. The inotify backend also falls under this category, but because it required extra arguments it was left out of the initial pass of backends conversion to the simple interface. This results in code duplication between the generic helper fsnotify_handle_event() and the inotify_handle_event() callback which also happen to be buggy code. Generalize the handle_inode_event() arguments and add the check for FS_EXCL_UNLINK flag to the generic helper, so inotify backend could be converted to use the simple interface. 
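With the generalized arguments, a backend that only uses inode marks implements the callback with the extra rename cookie; a minimal sketch (hypothetical backend, for illustration only):

static int example_handle_inode_event(struct fsnotify_mark *mark, u32 mask,
				      struct inode *inode, struct inode *dir,
				      const struct qstr *file_name, u32 cookie)
{
	/* @cookie pairs the MOVED_FROM/MOVED_TO halves of a rename */
	if ((mask & FS_CREATE) && file_name)
		pr_debug("created %s in dir %lu\n", file_name->name, dir->i_ino);
	return 0;
}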
Link: https://lore.kernel.org/r/20201202120713.702387-2-amir73il@gmail.com CC: stable@vger.kernel.org Fixes: b9a1b9772509 ("fsnotify: create method handle_inode_event() in fsnotify_operations") Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/nfsd/filecache.c | 2 +- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fsnotify.c | 31 ++++++++++++++++++++++++------- include/linux/fsnotify_backend.h | 3 ++- kernel/audit_fsnotify.c | 2 +- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 7 files changed, 31 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 3c6c2f7d1688..5849c1bd88f1 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -600,7 +600,7 @@ static struct notifier_block nfsd_file_lease_notifier = { static int nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *name) + const struct qstr *name, u32 cookie) { trace_nfsd_file_fsnotify_handle_event(inode, mask); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5dcda8f20c04..e45ca6ecba95 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -72,7 +72,7 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) */ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *name) + const struct qstr *name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 8d3ad5ef2925..c5c68bcbaadf 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -232,6 +232,26 @@ notify: } EXPORT_SYMBOL_GPL(__fsnotify_parent); +static int fsnotify_handle_inode_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + u32 mask, const void *data, int data_type, + struct inode *dir, const struct qstr *name, + u32 cookie) +{ + const struct path *path = fsnotify_data_path(data, data_type); + struct inode *inode = fsnotify_data_inode(data, data_type); + const struct fsnotify_ops *ops = group->ops; + + if (WARN_ON_ONCE(!ops->handle_inode_event)) + return 0; + + if ((inode_mark->mask & FS_EXCL_UNLINK) && + path && d_unlinked(path->dentry)) + return 0; + + return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); +} + static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, @@ -239,13 +259,8 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info); - struct inode *inode = fsnotify_data_inode(data, data_type); - const struct fsnotify_ops *ops = group->ops; int ret; - if (WARN_ON_ONCE(!ops->handle_inode_event)) - return 0; - if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; @@ -262,7 +277,8 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, name = NULL; } - ret = ops->handle_inode_event(inode_mark, mask, inode, dir, name); + ret = fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, + dir, name, cookie); if (ret || !child_mark) return ret; @@ -272,7 +288,8 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, * report the event once to parent dir with name 
and once to child
 * without name.
 */
-	return ops->handle_inode_event(child_mark, mask, inode, NULL, NULL);
+	return fsnotify_handle_inode_event(group, child_mark, mask, data, data_type,
+					   NULL, NULL, 0);
}

static int send_to_group(__u32 mask, const void *data, int data_type,
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index f8529a3a2923..4ee3044eedd0 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -137,6 +137,7 @@ struct mem_cgroup;
 *		if @file_name is not NULL, this is the directory that
 *		@file_name is relative to.
 * @file_name:	optional file name associated with event
+ * @cookie:	inotify rename cookie
 *
 * free_group_priv - called when a group refcnt hits 0 to clean up the private union
 * freeing_mark - called when a mark is being destroyed for some reason. The group
@@ -151,7 +152,7 @@ struct fsnotify_ops {
	int (*handle_event)(struct fsnotify_group *group, u32 mask,
			    const void *data, int data_type, struct inode *dir,
			    const struct qstr *file_name, u32 cookie,
			    struct fsnotify_iter_info *iter_info);
	int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask,
				  struct inode *inode, struct inode *dir,
-				  const struct qstr *file_name);
+				  const struct qstr *file_name, u32 cookie);
	void (*free_group_priv)(struct fsnotify_group *group);
	void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
	void (*free_event)(struct fsnotify_event *event);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index bfcfcd61adb6..5b3f01da172b 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -154,7 +154,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
 /* Update mark data in audit rules based on fsnotify events. */
 static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
				   struct inode *inode, struct inode *dir,
-				   const struct qstr *dname)
+				   const struct qstr *dname, u32 cookie)
 {
	struct audit_fsnotify_mark *audit_mark;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 83e1c07fc99e..6c91902f4f45 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1037,7 +1037,7 @@ static void evict_chunk(struct audit_chunk *chunk)
 static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask,
				   struct inode *inode, struct inode *dir,
-				   const struct qstr *file_name)
+				   const struct qstr *file_name, u32 cookie)
 {
	return 0;
 }
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 246e5ba704c0..2acf7ca49154 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -466,7 +466,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 /* Update watch data in audit rules based on fsnotify events. */
 static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
				    struct inode *inode, struct inode *dir,
-				    const struct qstr *dname)
+				    const struct qstr *dname, u32 cookie)
 {
	struct audit_parent *parent;
-- cgit
From 50a4952fd67b7f7f551e82ac07c51c1a7a74d474 Mon Sep 17 00:00:00 2001
From: Alexander Lochmann
Date: Thu, 15 Oct 2020 15:24:52 +0200
Subject: Updated locking documentation for transaction_t

We used LockDoc to derive locking rules for each member of struct transaction_t. Based on those results, we extended the documentation to cover more members of struct transaction_t and corrected the existing annotations.
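The convention used throughout the header is a bracketed lock name in a member's comment, naming the lock that guards that field; condensed from the hunk below:

struct transaction_s
{
	/*
	 * When commit was requested [j_state_lock]
	 */
	unsigned long		t_requested;

	/*
	 * Checkpointing stats [j_list_lock]
	 */
	struct transaction_chp_stats_s t_chp_stats;
};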
Link: https://lore.kernel.org/r/10cfbef1-994c-c604-f8a6-b1042fcc622f@tu-dortmund.de
Signed-off-by: Alexander Lochmann
Signed-off-by: Horst Schirmeier
Signed-off-by: Theodore Ts'o
---
 include/linux/jbd2.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 578ff196b3ce..d2a4860feb72 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -538,6 +538,7 @@ struct transaction_chp_stats_s {
 * The transaction keeps track of all of the buffers modified by a
 * running transaction, and all of the buffers committed but not yet
 * flushed to home for finished transactions.
+ * (Locking Documentation improved by LockDoc)
 */

/*
@@ -658,12 +659,12 @@ struct transaction_s
	unsigned long t_start;

	/*
-	 * When commit was requested
+	 * When commit was requested [j_state_lock]
	 */
	unsigned long t_requested;

	/*
-	 * Checkpointing stats [j_checkpoint_sem]
+	 * Checkpointing stats [j_list_lock]
	 */
	struct transaction_chp_stats_s t_chp_stats;
-- cgit
From 14026b94ccfe626e512bc9fa01e0e72ee75c7a98 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 18 Aug 2020 17:19:32 +0000
Subject: signal: Add unsafe_put_compat_sigset()

Implement an 'unsafe' version of put_compat_sigset(). For big-endian, use unsafe_put_user() directly to avoid an intermediate copy through the stack. For little-endian, use a straight unsafe_copy_to_user().

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/537c7082ee309a0bb9c67a50c5d9dd929aedb82d.1597770847.git.christophe.leroy@csgroup.eu
---
 include/linux/compat.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 14d514233e1d..400c0941c8af 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -442,6 +442,38 @@ put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
 #endif
 }

+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define unsafe_put_compat_sigset(compat, set, label) do {		\
+	compat_sigset_t __user *__c = compat;				\
+	const sigset_t *__s = set;					\
+									\
+	switch (_NSIG_WORDS) {						\
+	case 4:								\
+		unsafe_put_user(__s->sig[3] >> 32, &__c->sig[7], label);	\
+		unsafe_put_user(__s->sig[3], &__c->sig[6], label);	\
+		fallthrough;						\
+	case 3:								\
+		unsafe_put_user(__s->sig[2] >> 32, &__c->sig[5], label);	\
+		unsafe_put_user(__s->sig[2], &__c->sig[4], label);	\
+		fallthrough;						\
+	case 2:								\
+		unsafe_put_user(__s->sig[1] >> 32, &__c->sig[3], label);	\
+		unsafe_put_user(__s->sig[1], &__c->sig[2], label);	\
+		fallthrough;						\
+	case 1:								\
+		unsafe_put_user(__s->sig[0] >> 32, &__c->sig[1], label);	\
+		unsafe_put_user(__s->sig[0], &__c->sig[0], label);	\
+	}								\
+} while (0)
+#else
+#define unsafe_put_compat_sigset(compat, set, label) do {		\
+	compat_sigset_t __user *__c = compat;				\
+	const sigset_t *__s = set;					\
+									\
+	unsafe_copy_to_user(__c, __s, sizeof(*__c), label);		\
+} while (0)
+#endif
+
 extern int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data);
-- cgit
From d4bff72c8401e6f56194ecf455db70ebc22929e2 Mon Sep 17 00:00:00 2001
From: Thomas Karlsson
Date: Wed, 2 Dec 2020 19:49:58 +0100
Subject: macvlan: Support for high multicast packet rate

Background: Broadcast and multicast packets are enqueued for later processing. This queue was previously hardcoded to 1000 entries, which proved insufficient for very high packet rates and resulted in packet drops for multicast,
while unicast worked fine at the same time.

The change: this patch makes the queue length adjustable to accommodate environments with very high multicast packet rates, while keeping the default value of 1000 unless otherwise specified. The queue length is specified as a request per macvlan using the IFLA_MACVLAN_BC_QUEUE_LEN parameter. The queue length actually used is then the maximum requested by any macvlan connected to the same port. The queue length in use for the port can be retrieved (read-only) via the IFLA_MACVLAN_BC_QUEUE_LEN_USED parameter for verification. This will be followed up by a patch to iproute2 in order to adjust the parameter from userspace.

Signed-off-by: Thomas Karlsson
Link: https://lore.kernel.org/r/dd4673b2-7eab-edda-6815-85c67ce87f63@paneda.se
Signed-off-by: Jakub Kicinski
---
 drivers/net/macvlan.c | 40 ++++++++++++++++++++++++++++++++++++--
 include/linux/if_macvlan.h | 1 +
 include/uapi/linux/if_link.h | 2 ++
 tools/include/uapi/linux/if_link.h | 2 ++
 4 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index d9b6c44a5911..fb51329f8964 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -35,7 +35,7 @@
 #define MACVLAN_HASH_BITS	8
 #define MACVLAN_HASH_SIZE	(1<<MACVLAN_HASH_BITS)
-#define MACVLAN_BC_QUEUE_LEN	1000
+#define MACVLAN_DEFAULT_BC_QUEUE_LEN	1000
...
 struct macvlan_port {
...
+	u32			bc_queue_len_used;
...
 };
...
 #define MACVLAN_SKB_CB(__skb)	((struct macvlan_skb_cb *)&((__skb)->cb[0]))

 static void macvlan_port_destroy(struct net_device *dev);
+static void update_port_bc_queue_len(struct macvlan_port *port);

 static inline bool macvlan_passthru(const struct macvlan_port *port)
 {
@@ -354,7 +356,7 @@ static void macvlan_broadcast_enqueue(struct macvlan_port *port,
	MACVLAN_SKB_CB(nskb)->src = src;

	spin_lock(&port->bc_queue.lock);
-	if (skb_queue_len(&port->bc_queue) < MACVLAN_BC_QUEUE_LEN) {
+	if (skb_queue_len(&port->bc_queue) < port->bc_queue_len_used) {
		if (src)
			dev_hold(src->dev);
		__skb_queue_tail(&port->bc_queue, nskb);
@@ -1218,6 +1220,7 @@ static int macvlan_port_create(struct net_device *dev)
	for (i = 0; i < MACVLAN_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&port->vlan_source_hash[i]);

+	port->bc_queue_len_used = 0;
	skb_queue_head_init(&port->bc_queue);
	INIT_WORK(&port->bc_work, macvlan_process_broadcast);

@@ -1486,6 +1489,10 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
		goto destroy_macvlan_port;
	}

+	vlan->bc_queue_len_req = MACVLAN_DEFAULT_BC_QUEUE_LEN;
+	if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN])
+		vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]);
+
	err = register_netdevice(dev);
	if (err < 0)
		goto destroy_macvlan_port;
@@ -1496,6 +1503,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
		goto unregister_netdev;

	list_add_tail_rcu(&vlan->list, &port->vlans);
+	update_port_bc_queue_len(vlan->port);
	netif_stacked_transfer_operstate(lowerdev, dev);
	linkwatch_fire_event(dev);

@@ -1529,6 +1537,7 @@ void macvlan_dellink(struct net_device *dev, struct list_head *head)
	if (vlan->mode == MACVLAN_MODE_SOURCE)
		macvlan_flush_sources(vlan->port, vlan);
	list_del_rcu(&vlan->list);
+	update_port_bc_queue_len(vlan->port);
	unregister_netdevice_queue(dev, head);
	netdev_upper_dev_unlink(vlan->lowerdev, dev);
}
@@ -1572,6 +1581,12 @@ static int macvlan_changelink(struct net_device *dev,
		}
		vlan->flags = flags;
	}
+
+	if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) {
+		vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]);
+		update_port_bc_queue_len(vlan->port);
+	}
+
	if (set_mode)
		vlan->mode = mode;
	if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
@@ -1602,6 +1617,8 @@ static size_t macvlan_get_size(const struct net_device *dev)
	+
nla_total_size(2) /* IFLA_MACVLAN_FLAGS */
	+ nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */
	+ macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */
+	+ nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN */
+	+ nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN_USED */
	);
}

@@ -1625,6 +1642,7 @@ static int macvlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvlan_port *port = vlan->port;
	int i;
	struct nlattr *nest;

@@ -1645,6 +1663,10 @@ static int macvlan_fill_info(struct sk_buff *skb,
		}
		nla_nest_end(skb, nest);
	}
+	if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN, vlan->bc_queue_len_req))
+		goto nla_put_failure;
+	if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN_USED, port->bc_queue_len_used))
+		goto nla_put_failure;
	return 0;

nla_put_failure:
@@ -1658,6 +1680,8 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = {
	[IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
	[IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED },
	[IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 },
+	[IFLA_MACVLAN_BC_QUEUE_LEN] = { .type = NLA_U32 },
+	[IFLA_MACVLAN_BC_QUEUE_LEN_USED] = { .type = NLA_REJECT },
};

int macvlan_link_register(struct rtnl_link_ops *ops)
@@ -1688,6 +1712,18 @@ static struct rtnl_link_ops macvlan_link_ops = {
	.priv_size = sizeof(struct macvlan_dev),
};

+static void update_port_bc_queue_len(struct macvlan_port *port)
+{
+	u32 max_bc_queue_len_req = 0;
+	struct macvlan_dev *vlan;
+
+	list_for_each_entry(vlan, &port->vlans, list) {
+		if (vlan->bc_queue_len_req > max_bc_queue_len_req)
+			max_bc_queue_len_req = vlan->bc_queue_len_req;
+	}
+	port->bc_queue_len_used = max_bc_queue_len_req;
+}
+
 static int macvlan_device_event(struct notifier_block *unused,
				unsigned long event, void *ptr)
{
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index a367ead4bf4b..96556c64c95d 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -30,6 +30,7 @@ struct macvlan_dev {
	enum macvlan_mode mode;
	u16 flags;
	unsigned int macaddr_count;
+	u32 bc_queue_len_req;
 #ifdef CONFIG_NET_POLL_CONTROLLER
	struct netpoll *netpoll;
 #endif
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index c4b23f06f69e..874cc12a34d9 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -588,6 +588,8 @@ enum {
	IFLA_MACVLAN_MACADDR,
	IFLA_MACVLAN_MACADDR_DATA,
	IFLA_MACVLAN_MACADDR_COUNT,
+	IFLA_MACVLAN_BC_QUEUE_LEN,
+	IFLA_MACVLAN_BC_QUEUE_LEN_USED,
	__IFLA_MACVLAN_MAX,
};

diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 781e482dc499..d208b2af697f 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -409,6 +409,8 @@ enum {
	IFLA_MACVLAN_MACADDR,
	IFLA_MACVLAN_MACADDR_DATA,
	IFLA_MACVLAN_MACADDR_COUNT,
+	IFLA_MACVLAN_BC_QUEUE_LEN,
+	IFLA_MACVLAN_BC_QUEUE_LEN_USED,
	__IFLA_MACVLAN_MAX,
};
-- cgit
From d421e466c2373095f165ddd25cbabd6c5b077928 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik
Date: Wed, 2 Dec 2020 20:39:46 -0800
Subject: net/mlx5: DR, Proper handling of unsupported Connect-X6DX SW steering

The STE format differs between Connect-X5 and Connect-X6DX. Currently, on Connect-X6DX, SW steering would break at some point when building STEs, without giving a proper error message.
Fix this by checking the STE format of the current device when initializing domain: add mlx5_ifc definitions for Connect-X6DX SW steering, read FW capability to get the current format version, and check this version when domain is being created. Fixes: 26d688e33f88 ("net/mlx5: DR, Add Steering entry (STE) utilities") Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c | 5 +++++ drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h | 1 + include/linux/mlx5/mlx5_ifc.h | 9 ++++++++- 4 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c index 6bd34b293007..51bbd88ff021 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c @@ -92,6 +92,7 @@ int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev, caps->eswitch_manager = MLX5_CAP_GEN(mdev, eswitch_manager); caps->gvmi = MLX5_CAP_GEN(mdev, vhca_id); caps->flex_protocols = MLX5_CAP_GEN(mdev, flex_parser_protocols); + caps->sw_format_ver = MLX5_CAP_GEN(mdev, steering_format_version); if (mlx5dr_matcher_supp_flex_parser_icmp_v4(caps)) { caps->flex_parser_id_icmp_dw0 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw0); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c index 890767a2a7cb..aa2c2d6c44e6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c @@ -223,6 +223,11 @@ static int dr_domain_caps_init(struct mlx5_core_dev *mdev, if (ret) return ret; + if (dmn->info.caps.sw_format_ver != MLX5_STEERING_FORMAT_CONNECTX_5) { + mlx5dr_err(dmn, "SW steering is not supported on this device\n"); + return -EOPNOTSUPP; + } + ret = dr_domain_query_fdb_caps(mdev, dmn); if (ret) return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index f50f3b107aa3..cf62ea4f882e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -625,6 +625,7 @@ struct mlx5dr_cmd_caps { u8 max_ft_level; u16 roce_min_src_udp; u8 num_esw_ports; + u8 sw_format_ver; bool eswitch_manager; bool rx_sw_owner; bool tx_sw_owner; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a092346c7b2d..233352447b1a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1223,6 +1223,11 @@ enum mlx5_fc_bulk_alloc_bitmask { #define MLX5_FC_BULK_NUM_FCS(fc_enum) (MLX5_FC_BULK_SIZE_FACTOR * (fc_enum)) +enum { + MLX5_STEERING_FORMAT_CONNECTX_5 = 0, + MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, +}; + struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x30]; u8 vhca_id[0x10]; @@ -1521,7 +1526,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 general_obj_types[0x40]; - u8 reserved_at_440[0x20]; + u8 reserved_at_440[0x4]; + u8 steering_format_version[0x4]; + u8 create_qp_start_hint[0x18]; u8 reserved_at_460[0x3]; u8 log_max_uctx[0x5]; -- cgit From 0fb6ee8d0b5e90b72f870f76debc8bd31a742014 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Tue, 24 Nov 2020 14:38:07 +0200 Subject: iio: ad_sigma_delta: Don't put SPI transfer 
buffer on the stack Use a heap allocated memory for the SPI transfer buffer. Using stack memory can corrupt stack memory when using DMA on some systems. This change moves the buffer from the stack of the trigger handler call to the heap of the buffer of the state struct. The size increases takes into account the alignment for the timestamp, which is 8 bytes. The 'data' buffer is split into 'tx_buf' and 'rx_buf', to make a clearer separation of which part of the buffer should be used for TX & RX. Fixes: af3008485ea03 ("iio:adc: Add common code for ADI Sigma Delta devices") Signed-off-by: Lars-Peter Clausen Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20201124123807.19717-1-alexandru.ardelean@analog.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad_sigma_delta.c | 18 ++++++++---------- include/linux/iio/adc/ad_sigma_delta.h | 6 +++++- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c index 86039e9ecaca..3a6f239d4acc 100644 --- a/drivers/iio/adc/ad_sigma_delta.c +++ b/drivers/iio/adc/ad_sigma_delta.c @@ -57,7 +57,7 @@ EXPORT_SYMBOL_GPL(ad_sd_set_comm); int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, unsigned int size, unsigned int val) { - uint8_t *data = sigma_delta->data; + uint8_t *data = sigma_delta->tx_buf; struct spi_transfer t = { .tx_buf = data, .len = size + 1, @@ -99,7 +99,7 @@ EXPORT_SYMBOL_GPL(ad_sd_write_reg); static int ad_sd_read_reg_raw(struct ad_sigma_delta *sigma_delta, unsigned int reg, unsigned int size, uint8_t *val) { - uint8_t *data = sigma_delta->data; + uint8_t *data = sigma_delta->tx_buf; int ret; struct spi_transfer t[] = { { @@ -146,22 +146,22 @@ int ad_sd_read_reg(struct ad_sigma_delta *sigma_delta, { int ret; - ret = ad_sd_read_reg_raw(sigma_delta, reg, size, sigma_delta->data); + ret = ad_sd_read_reg_raw(sigma_delta, reg, size, sigma_delta->rx_buf); if (ret < 0) goto out; switch (size) { case 4: - *val = get_unaligned_be32(sigma_delta->data); + *val = get_unaligned_be32(sigma_delta->rx_buf); break; case 3: - *val = get_unaligned_be24(&sigma_delta->data[0]); + *val = get_unaligned_be24(sigma_delta->rx_buf); break; case 2: - *val = get_unaligned_be16(sigma_delta->data); + *val = get_unaligned_be16(sigma_delta->rx_buf); break; case 1: - *val = sigma_delta->data[0]; + *val = sigma_delta->rx_buf[0]; break; default: ret = -EINVAL; @@ -395,11 +395,9 @@ static irqreturn_t ad_sd_trigger_handler(int irq, void *p) struct iio_poll_func *pf = p; struct iio_dev *indio_dev = pf->indio_dev; struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev); + uint8_t *data = sigma_delta->rx_buf; unsigned int reg_size; unsigned int data_reg; - uint8_t data[16]; - - memset(data, 0x00, 16); reg_size = indio_dev->channels[0].scan_type.realbits + indio_dev->channels[0].scan_type.shift; diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index a3a838dcf8e4..7199280d89ca 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -79,8 +79,12 @@ struct ad_sigma_delta { /* * DMA (thus cache coherency maintenance) requires the * transfer buffers to live in their own cache lines. + * 'tx_buf' is up to 32 bits. + * 'rx_buf' is up to 32 bits per sample + 64 bit timestamp, + * rounded to 16 bytes to take into account padding. 
*/ - uint8_t data[4] ____cacheline_aligned; + uint8_t tx_buf[4] ____cacheline_aligned; + uint8_t rx_buf[16] __aligned(8); }; static inline int ad_sigma_delta_set_channel(struct ad_sigma_delta *sd, -- cgit From eca8523a388f65cc0f515e3db4f3b027d7546532 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 20 Sep 2020 14:25:48 +0100 Subject: iio:trigger: rename try_reenable() to reenable() plus return void As we no longer support a try again if we cannot reenable the trigger rename the function to reflect this. Also we don't do anything with the value returned so stop it returning anything. For the few drivers that didn't already print an error message in this patch, add such a print. Signed-off-by: Jonathan Cameron Reviewed-by: Lars-Peter Clausen Acked-by: Linus Walleij Acked-by: Srinivas Pandruvada Cc: Linus Walleij Cc: Srinivas Pandruvada Cc: Christian Oder Cc: Eugen Hristev Cc: Nishant Malpani Cc: Daniel Baluta Link: https://lore.kernel.org/r/20200920132548.196452-3-jic23@kernel.org --- drivers/iio/accel/bma180.c | 9 ++++++--- drivers/iio/accel/bmc150-accel-core.c | 12 ++++-------- drivers/iio/accel/kxcjk-1013.c | 10 +++------- drivers/iio/accel/mxc4005.c | 16 ++++++---------- drivers/iio/adc/at91-sama5d2_adc.c | 8 +++----- drivers/iio/gyro/adxrs290.c | 6 ++---- drivers/iio/gyro/bmg160_core.c | 12 ++++-------- drivers/iio/imu/kmx61.c | 10 +++------- drivers/iio/industrialio-trigger.c | 4 ++-- drivers/iio/magnetometer/bmc150_magn.c | 10 +++++----- include/linux/iio/trigger.h | 4 ++-- 11 files changed, 40 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/accel/bma180.c b/drivers/iio/accel/bma180.c index 6b74c2b04c15..71f85a3e525b 100644 --- a/drivers/iio/accel/bma180.c +++ b/drivers/iio/accel/bma180.c @@ -959,17 +959,20 @@ static int bma180_data_rdy_trigger_set_state(struct iio_trigger *trig, return bma180_set_new_data_intr_state(data, state); } -static int bma180_trig_try_reen(struct iio_trigger *trig) +static void bma180_trig_reen(struct iio_trigger *trig) { struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); struct bma180_data *data = iio_priv(indio_dev); + int ret; - return bma180_reset_intr(data); + ret = bma180_reset_intr(data); + if (ret) + dev_err(&data->client->dev, "failed to reset interrupt\n"); } static const struct iio_trigger_ops bma180_trigger_ops = { .set_trigger_state = bma180_data_rdy_trigger_set_state, - .try_reenable = bma180_trig_try_reen, + .reenable = bma180_trig_reen, }; static int bma180_probe(struct i2c_client *client, diff --git a/drivers/iio/accel/bmc150-accel-core.c b/drivers/iio/accel/bmc150-accel-core.c index 20dc7bc291f8..9a552d11ea27 100644 --- a/drivers/iio/accel/bmc150-accel-core.c +++ b/drivers/iio/accel/bmc150-accel-core.c @@ -1151,7 +1151,7 @@ err_read: return IRQ_HANDLED; } -static int bmc150_accel_trig_try_reen(struct iio_trigger *trig) +static void bmc150_accel_trig_reen(struct iio_trigger *trig) { struct bmc150_accel_trigger *t = iio_trigger_get_drvdata(trig); struct bmc150_accel_data *data = t->data; @@ -1160,7 +1160,7 @@ static int bmc150_accel_trig_try_reen(struct iio_trigger *trig) /* new data interrupts don't need ack */ if (t == &t->data->triggers[BMC150_ACCEL_TRIGGER_DATA_READY]) - return 0; + return; mutex_lock(&data->mutex); /* clear any latched interrupt */ @@ -1168,12 +1168,8 @@ static int bmc150_accel_trig_try_reen(struct iio_trigger *trig) BMC150_ACCEL_INT_MODE_LATCH_INT | BMC150_ACCEL_INT_MODE_LATCH_RESET); mutex_unlock(&data->mutex); - if (ret < 0) { + if (ret < 0) dev_err(dev, "Error 
writing reg_int_rst_latch\n"); - return ret; - } - - return 0; } static int bmc150_accel_trigger_set_state(struct iio_trigger *trig, @@ -1213,7 +1209,7 @@ static int bmc150_accel_trigger_set_state(struct iio_trigger *trig, static const struct iio_trigger_ops bmc150_accel_trigger_ops = { .set_trigger_state = bmc150_accel_trigger_set_state, - .try_reenable = bmc150_accel_trig_try_reen, + .reenable = bmc150_accel_trig_reen, }; static int bmc150_accel_handle_roc_event(struct iio_dev *indio_dev) diff --git a/drivers/iio/accel/kxcjk-1013.c b/drivers/iio/accel/kxcjk-1013.c index 560a3373ff20..e92c7e6766e1 100644 --- a/drivers/iio/accel/kxcjk-1013.c +++ b/drivers/iio/accel/kxcjk-1013.c @@ -1105,19 +1105,15 @@ err: return IRQ_HANDLED; } -static int kxcjk1013_trig_try_reen(struct iio_trigger *trig) +static void kxcjk1013_trig_reen(struct iio_trigger *trig) { struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); struct kxcjk1013_data *data = iio_priv(indio_dev); int ret; ret = i2c_smbus_read_byte_data(data->client, KXCJK1013_REG_INT_REL); - if (ret < 0) { + if (ret < 0) dev_err(&data->client->dev, "Error reading reg_int_rel\n"); - return ret; - } - - return 0; } static int kxcjk1013_data_rdy_trigger_set_state(struct iio_trigger *trig, @@ -1161,7 +1157,7 @@ static int kxcjk1013_data_rdy_trigger_set_state(struct iio_trigger *trig, static const struct iio_trigger_ops kxcjk1013_trigger_ops = { .set_trigger_state = kxcjk1013_data_rdy_trigger_set_state, - .try_reenable = kxcjk1013_trig_try_reen, + .reenable = kxcjk1013_trig_reen, }; static void kxcjk1013_report_motion_event(struct iio_dev *indio_dev) diff --git a/drivers/iio/accel/mxc4005.c b/drivers/iio/accel/mxc4005.c index f877263dc6ef..0f8fd687866d 100644 --- a/drivers/iio/accel/mxc4005.c +++ b/drivers/iio/accel/mxc4005.c @@ -310,19 +310,15 @@ err: return IRQ_HANDLED; } -static int mxc4005_clr_intr(struct mxc4005_data *data) +static void mxc4005_clr_intr(struct mxc4005_data *data) { int ret; /* clear interrupt */ ret = regmap_write(data->regmap, MXC4005_REG_INT_CLR1, MXC4005_REG_INT_CLR1_BIT_DRDYC); - if (ret < 0) { + if (ret < 0) dev_err(data->dev, "failed to write to reg_int_clr1\n"); - return ret; - } - - return 0; } static int mxc4005_set_trigger_state(struct iio_trigger *trig, @@ -353,20 +349,20 @@ static int mxc4005_set_trigger_state(struct iio_trigger *trig, return 0; } -static int mxc4005_trigger_try_reen(struct iio_trigger *trig) +static void mxc4005_trigger_reen(struct iio_trigger *trig) { struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); struct mxc4005_data *data = iio_priv(indio_dev); if (!data->dready_trig) - return 0; + return; - return mxc4005_clr_intr(data); + mxc4005_clr_intr(data); } static const struct iio_trigger_ops mxc4005_trigger_ops = { .set_trigger_state = mxc4005_set_trigger_state, - .try_reenable = mxc4005_trigger_try_reen, + .reenable = mxc4005_trigger_reen, }; static int mxc4005_chip_init(struct mxc4005_data *data) diff --git a/drivers/iio/adc/at91-sama5d2_adc.c b/drivers/iio/adc/at91-sama5d2_adc.c index 6edcc99009d1..a7826f097b95 100644 --- a/drivers/iio/adc/at91-sama5d2_adc.c +++ b/drivers/iio/adc/at91-sama5d2_adc.c @@ -742,26 +742,24 @@ static int at91_adc_configure_trigger(struct iio_trigger *trig, bool state) return 0; } -static int at91_adc_reenable_trigger(struct iio_trigger *trig) +static void at91_adc_reenable_trigger(struct iio_trigger *trig) { struct iio_dev *indio = iio_trigger_get_drvdata(trig); struct at91_adc_state *st = iio_priv(indio); /* if we are using DMA, we must not reenable irq after 
each trigger */ if (st->dma_st.dma_chan) - return 0; + return; enable_irq(st->irq); /* Needed to ACK the DRDY interruption */ at91_adc_readl(st, AT91_SAMA5D2_LCDR); - - return 0; } static const struct iio_trigger_ops at91_adc_trigger_ops = { .set_trigger_state = &at91_adc_configure_trigger, - .try_reenable = &at91_adc_reenable_trigger, + .reenable = &at91_adc_reenable_trigger, .validate_device = iio_trigger_validate_own_device, }; diff --git a/drivers/iio/gyro/adxrs290.c b/drivers/iio/gyro/adxrs290.c index ca6fc234076e..c45d8226cc2b 100644 --- a/drivers/iio/gyro/adxrs290.c +++ b/drivers/iio/gyro/adxrs290.c @@ -478,7 +478,7 @@ static int adxrs290_data_rdy_trigger_set_state(struct iio_trigger *trig, return ret; } -static int adxrs290_reset_trig(struct iio_trigger *trig) +static void adxrs290_reset_trig(struct iio_trigger *trig) { struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); int val; @@ -491,14 +491,12 @@ static int adxrs290_reset_trig(struct iio_trigger *trig) */ adxrs290_get_rate_data(indio_dev, ADXRS290_READ_REG(ADXRS290_REG_DATAY0), &val); - - return 0; } static const struct iio_trigger_ops adxrs290_trigger_ops = { .set_trigger_state = &adxrs290_data_rdy_trigger_set_state, .validate_device = &iio_trigger_validate_own_device, - .try_reenable = &adxrs290_reset_trig, + .reenable = &adxrs290_reset_trig, }; static irqreturn_t adxrs290_trigger_handler(int irq, void *p) diff --git a/drivers/iio/gyro/bmg160_core.c b/drivers/iio/gyro/bmg160_core.c index 8ddda96455fc..2d5015801a75 100644 --- a/drivers/iio/gyro/bmg160_core.c +++ b/drivers/iio/gyro/bmg160_core.c @@ -893,7 +893,7 @@ err: return IRQ_HANDLED; } -static int bmg160_trig_try_reen(struct iio_trigger *trig) +static void bmg160_trig_reen(struct iio_trigger *trig) { struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); struct bmg160_data *data = iio_priv(indio_dev); @@ -902,18 +902,14 @@ static int bmg160_trig_try_reen(struct iio_trigger *trig) /* new data interrupts don't need ack */ if (data->dready_trigger_on) - return 0; + return; /* Set latched mode interrupt and clear any latched interrupt */ ret = regmap_write(data->regmap, BMG160_REG_INT_RST_LATCH, BMG160_INT_MODE_LATCH_INT | BMG160_INT_MODE_LATCH_RESET); - if (ret < 0) { + if (ret < 0) dev_err(dev, "Error writing reg_rst_latch\n"); - return ret; - } - - return 0; } static int bmg160_data_rdy_trigger_set_state(struct iio_trigger *trig, @@ -961,7 +957,7 @@ static int bmg160_data_rdy_trigger_set_state(struct iio_trigger *trig, static const struct iio_trigger_ops bmg160_trigger_ops = { .set_trigger_state = bmg160_data_rdy_trigger_set_state, - .try_reenable = bmg160_trig_try_reen, + .reenable = bmg160_trig_reen, }; static irqreturn_t bmg160_event_handler(int irq, void *private) diff --git a/drivers/iio/imu/kmx61.c b/drivers/iio/imu/kmx61.c index 61885e99d3fc..4377047d503a 100644 --- a/drivers/iio/imu/kmx61.c +++ b/drivers/iio/imu/kmx61.c @@ -1063,24 +1063,20 @@ err_unlock: return ret; } -static int kmx61_trig_try_reenable(struct iio_trigger *trig) +static void kmx61_trig_reenable(struct iio_trigger *trig) { struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); struct kmx61_data *data = kmx61_get_data(indio_dev); int ret; ret = i2c_smbus_read_byte_data(data->client, KMX61_REG_INL); - if (ret < 0) { + if (ret < 0) dev_err(&data->client->dev, "Error reading reg_inl\n"); - return ret; - } - - return 0; } static const struct iio_trigger_ops kmx61_trigger_ops = { .set_trigger_state = kmx61_data_rdy_trigger_set_state, - .try_reenable = kmx61_trig_try_reenable, + .reenable 
= kmx61_trig_reenable, }; static irqreturn_t kmx61_event_handler(int irq, void *private) diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c index f902be90980b..ea3c9859b258 100644 --- a/drivers/iio/industrialio-trigger.c +++ b/drivers/iio/industrialio-trigger.c @@ -203,8 +203,8 @@ EXPORT_SYMBOL(iio_trigger_poll_chained); void iio_trigger_notify_done(struct iio_trigger *trig) { if (atomic_dec_and_test(&trig->use_count) && trig->ops && - trig->ops->try_reenable) - trig->ops->try_reenable(trig); + trig->ops->reenable) + trig->ops->reenable(trig); } EXPORT_SYMBOL(iio_trigger_notify_done); diff --git a/drivers/iio/magnetometer/bmc150_magn.c b/drivers/iio/magnetometer/bmc150_magn.c index 73e55ec815ec..fa09fcab620a 100644 --- a/drivers/iio/magnetometer/bmc150_magn.c +++ b/drivers/iio/magnetometer/bmc150_magn.c @@ -766,20 +766,20 @@ static int bmc150_magn_reset_intr(struct bmc150_magn_data *data) return regmap_read(data->regmap, BMC150_MAGN_REG_X_L, &tmp); } -static int bmc150_magn_trig_try_reen(struct iio_trigger *trig) +static void bmc150_magn_trig_reen(struct iio_trigger *trig) { struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); struct bmc150_magn_data *data = iio_priv(indio_dev); int ret; if (!data->dready_trigger_on) - return 0; + return; mutex_lock(&data->mutex); ret = bmc150_magn_reset_intr(data); mutex_unlock(&data->mutex); - - return ret; + if (ret) + dev_err(data->dev, "Failed to reset interrupt\n"); } static int bmc150_magn_data_rdy_trigger_set_state(struct iio_trigger *trig, @@ -817,7 +817,7 @@ err_unlock: static const struct iio_trigger_ops bmc150_magn_trigger_ops = { .set_trigger_state = bmc150_magn_data_rdy_trigger_set_state, - .try_reenable = bmc150_magn_trig_try_reen, + .reenable = bmc150_magn_trig_reen, }; static int bmc150_magn_buffer_preenable(struct iio_dev *indio_dev) diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h index f930c4ccf378..055890b6ffcf 100644 --- a/include/linux/iio/trigger.h +++ b/include/linux/iio/trigger.h @@ -21,7 +21,7 @@ struct iio_trigger; /** * struct iio_trigger_ops - operations structure for an iio_trigger. * @set_trigger_state: switch on/off the trigger on demand - * @try_reenable: function to reenable the trigger when the + * @reenable: function to reenable the trigger when the * use count is zero (may be NULL) * @validate_device: function to validate the device when the * current trigger gets changed. @@ -31,7 +31,7 @@ struct iio_trigger; **/ struct iio_trigger_ops { int (*set_trigger_state)(struct iio_trigger *trig, bool state); - int (*try_reenable)(struct iio_trigger *trig); + void (*reenable)(struct iio_trigger *trig); int (*validate_device)(struct iio_trigger *trig, struct iio_dev *indio_dev); }; -- cgit From 41dd9596d6b239a125c3d19f9d0ca90bdbfbf876 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 30 Nov 2020 16:36:29 +0100 Subject: security: add const qualifier to struct sock in various places A follow-up change to tcp_request_sock_op would have to drop the 'const' qualifier from the 'route_req' function as the 'security_inet_conn_request' call is moved there - and that function expects a 'struct sock *'. However, it turns out it's also possible to add a const qualifier to security_inet_conn_request instead. 
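A minimal sketch of what the const qualifier now enforces (the hook implementation and its name are hypothetical, not part of this patch):

static int example_inet_conn_request(const struct sock *sk,
				     struct sk_buff *skb,
				     struct request_sock *req)
{
	/* Reading listener state is fine... */
	u16 family = sk->sk_family;

	/* ...but a write such as "sk->sk_mark = 0;" would no longer
	 * compile, which is exactly what the const qualifier buys us.
	 */
	return (family == AF_INET || family == AF_INET6) ? 0 : -EINVAL;
}
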
Signed-off-by: Florian Westphal Acked-by: James Morris Signed-off-by: Jakub Kicinski --- include/linux/lsm_audit.h | 2 +- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 4 ++-- security/apparmor/include/net.h | 2 +- security/apparmor/lsm.c | 2 +- security/apparmor/net.c | 6 +++--- security/lsm_audit.c | 4 ++-- security/security.c | 2 +- security/selinux/hooks.c | 2 +- security/smack/smack_lsm.c | 4 ++-- 10 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 28f23b341c1c..cd23355d2271 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -26,7 +26,7 @@ struct lsm_network_audit { int netif; - struct sock *sk; + const struct sock *sk; u16 family; __be16 dport; __be16 sport; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 32a940117e7a..acc0494cceba 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -301,7 +301,7 @@ LSM_HOOK(void, LSM_RET_VOID, sk_clone_security, const struct sock *sk, struct sock *newsk) LSM_HOOK(void, LSM_RET_VOID, sk_getsecid, struct sock *sk, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, sock_graft, struct sock *sk, struct socket *parent) -LSM_HOOK(int, 0, inet_conn_request, struct sock *sk, struct sk_buff *skb, +LSM_HOOK(int, 0, inet_conn_request, const struct sock *sk, struct sk_buff *skb, struct request_sock *req) LSM_HOOK(void, LSM_RET_VOID, inet_csk_clone, struct sock *newsk, const struct request_sock *req) diff --git a/include/linux/security.h b/include/linux/security.h index bc2725491560..0df62735651b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1358,7 +1358,7 @@ void security_sk_clone(const struct sock *sk, struct sock *newsk); void security_sk_classify_flow(struct sock *sk, struct flowi *fl); void security_req_classify_flow(const struct request_sock *req, struct flowi *fl); void security_sock_graft(struct sock*sk, struct socket *parent); -int security_inet_conn_request(struct sock *sk, +int security_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void security_inet_csk_clone(struct sock *newsk, const struct request_sock *req); @@ -1519,7 +1519,7 @@ static inline void security_sock_graft(struct sock *sk, struct socket *parent) { } -static inline int security_inet_conn_request(struct sock *sk, +static inline int security_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { return 0; diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h index 2431c011800d..aadb4b29fb66 100644 --- a/security/apparmor/include/net.h +++ b/security/apparmor/include/net.h @@ -107,6 +107,6 @@ int aa_sock_file_perm(struct aa_label *label, const char *op, u32 request, struct socket *sock); int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, - u32 secid, struct sock *sk); + u32 secid, const struct sock *sk); #endif /* __AA_NET_H */ diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index ffeaee5ed968..1b0aba8eb723 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1147,7 +1147,7 @@ static void apparmor_sock_graft(struct sock *sk, struct socket *parent) } #ifdef CONFIG_NETWORK_SECMARK -static int apparmor_inet_conn_request(struct sock *sk, struct sk_buff *skb, +static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct aa_sk_ctx *ctx = SK_CTX(sk); diff --git 
a/security/apparmor/net.c b/security/apparmor/net.c index fa0e85568450..e0c1b50d6edd 100644 --- a/security/apparmor/net.c +++ b/security/apparmor/net.c @@ -211,7 +211,7 @@ static int apparmor_secmark_init(struct aa_secmark *secmark) } static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid, - struct common_audit_data *sa, struct sock *sk) + struct common_audit_data *sa) { int i, ret; struct aa_perms perms = { }; @@ -244,13 +244,13 @@ static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid, } int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, - u32 secid, struct sock *sk) + u32 secid, const struct sock *sk) { struct aa_profile *profile; DEFINE_AUDIT_SK(sa, op, sk); return fn_for_each_confined(label, profile, aa_secmark_perm(profile, request, secid, - &sa, sk)); + &sa)); } #endif diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 53d0d183db8f..078f9cdcd7f5 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -183,7 +183,7 @@ int ipv6_skb_to_auditdata(struct sk_buff *skb, static inline void print_ipv6_addr(struct audit_buffer *ab, - struct in6_addr *addr, __be16 port, + const struct in6_addr *addr, __be16 port, char *name1, char *name2) { if (!ipv6_addr_any(addr)) @@ -322,7 +322,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, } case LSM_AUDIT_DATA_NET: if (a->u.net->sk) { - struct sock *sk = a->u.net->sk; + const struct sock *sk = a->u.net->sk; struct unix_sock *u; struct unix_address *addr; int len = 0; diff --git a/security/security.c b/security/security.c index a28045dc9e7f..6509f95d203f 100644 --- a/security/security.c +++ b/security/security.c @@ -2225,7 +2225,7 @@ void security_sock_graft(struct sock *sk, struct socket *parent) } EXPORT_SYMBOL(security_sock_graft); -int security_inet_conn_request(struct sock *sk, +int security_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { return call_int_hook(inet_conn_request, 0, sk, skb, req); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 6b1826fc3658..6fa593006802 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5355,7 +5355,7 @@ static void selinux_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, selinux_netlbl_sctp_sk_clone(sk, newsk); } -static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb, +static int selinux_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct sk_security_struct *sksec = sk->sk_security; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 5c90b9fa4d40..3a62d6aa74a6 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3864,7 +3864,7 @@ static inline struct smack_known *smack_from_skb(struct sk_buff *skb) * * Returns smack_known of the IP options or NULL if that won't work. 
*/ -static struct smack_known *smack_from_netlbl(struct sock *sk, u16 family, +static struct smack_known *smack_from_netlbl(const struct sock *sk, u16 family, struct sk_buff *skb) { struct netlbl_lsm_secattr secattr; @@ -4114,7 +4114,7 @@ static void smack_sock_graft(struct sock *sk, struct socket *parent) * Returns 0 if a task with the packet label could write to * the socket, otherwise an error code */ -static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, +static int smack_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { u16 family = sk->sk_family; -- cgit From a15ac665b9e9c90b1557499f2a46c1e89d29154a Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 3 Dec 2020 22:35:11 +0100 Subject: vfio-mdev: Wire in a request handler for mdev parent While performing some destructive tests with vfio-ccw, where the paths to a device are forcibly removed and thus the device itself is unreachable, it is rather easy to end up in an endless loop in vfio_del_group_dev() due to the lack of a request callback for the associated device. In this example, one MDEV (77c) is used by a guest, while another (77b) is not. The symptom is that the iommu is detached from the mdev for 77b, but not 77c, until that guest is shut down: [ 238.794867] vfio_ccw 0.0.077b: MDEV: Unregistering [ 238.794996] vfio_mdev 11f2d2bc-4083-431d-a023-eff72715c4f0: Removing from iommu group 2 [ 238.795001] vfio_mdev 11f2d2bc-4083-431d-a023-eff72715c4f0: MDEV: detaching iommu [ 238.795036] vfio_ccw 0.0.077c: MDEV: Unregistering ...silence... Let's wire in the request callback to the mdev device, so that a device being physically removed from the host can be (gracefully?) handled by the parent device at the time the device is removed. Add a message when registering the device if a driver doesn't provide this callback, so a clue is given that this same loop may be encountered in a similar situation, and a message when this occurs instead of the awkward silence noted above. 
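A minimal sketch of a parent driver supplying the new callback (the my_* names are hypothetical and the remaining mdev_parent_ops callbacks are elided):

static void my_mdev_request(struct mdev_device *mdev, unsigned int count)
{
	/* Nudge whoever holds the device open; count increments on
	 * each retry, so a vendor driver could escalate here.
	 */
	dev_warn(mdev_dev(mdev), "device removal requested (attempt %u)\n",
		 count);
}

static const struct mdev_parent_ops my_parent_ops = {
	.owner   = THIS_MODULE,
	.request = my_mdev_request,
	/* .create, .remove, .open, .mmap, ... as before */
};
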
Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- drivers/vfio/mdev/mdev_core.c | 4 ++++ drivers/vfio/mdev/vfio_mdev.c | 13 +++++++++++++ include/linux/mdev.h | 4 ++++ 3 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index b558d4cfd082..6de97d25a3f8 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -154,6 +154,10 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops) if (!dev) return -EINVAL; + /* Not mandatory, but its absence could be a problem */ + if (!ops->request) + dev_info(dev, "Driver cannot be asked to release device\n"); + mutex_lock(&parent_list_lock); /* Check for duplicate */ diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 30964a4e0a28..b52eea128549 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -98,6 +98,18 @@ static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) return parent->ops->mmap(mdev, vma); } +static void vfio_mdev_request(void *device_data, unsigned int count) +{ + struct mdev_device *mdev = device_data; + struct mdev_parent *parent = mdev->parent; + + if (parent->ops->request) + parent->ops->request(mdev, count); + else if (count == 0) + dev_notice(mdev_dev(mdev), + "No mdev vendor driver request callback support, blocked until released by user\n"); +} + static const struct vfio_device_ops vfio_mdev_dev_ops = { .name = "vfio-mdev", .open = vfio_mdev_open, @@ -106,6 +118,7 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = { .read = vfio_mdev_read, .write = vfio_mdev_write, .mmap = vfio_mdev_mmap, + .request = vfio_mdev_request, }; static int vfio_mdev_probe(struct device *dev) diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 0ce30ca78db0..9004375c462e 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -72,6 +72,9 @@ struct device *mdev_get_iommu_device(struct device *dev); * @mmap: mmap callback * @mdev: mediated device structure * @vma: vma structure + * @request: request callback to release device + * @mdev: mediated device structure + * @count: request sequence number * Parent device that support mediated device should be registered with mdev * module with mdev_parent_ops structure. **/ @@ -92,6 +95,7 @@ struct mdev_parent_ops { long (*ioctl)(struct mdev_device *mdev, unsigned int cmd, unsigned long arg); int (*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma); + void (*request)(struct mdev_device *mdev, unsigned int count); }; /* interface for exporting mdev supported type attributes */ -- cgit From 22dc4a0f5ed11b6dc8fd73a0892fa0ea1a4c3cdf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:29 -0800 Subject: bpf: Remove hard-coded btf_vmlinux assumption from BPF verifier Remove a permeating assumption throughout the BPF verifier of vmlinux BTF. Instead, wherever BTF type IDs are involved, also track the instance of struct btf that goes along with the type ID. This allows us to gradually add support for kernel module BTFs and using/tracking module types across BPF helper calls and registers. This patch also renames the btf_id() function to btf_obj_id() to minimize the naming clash with using btf_id to denote a BTF *type* ID rather than a BTF *object*'s ID. 
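A sketch of the resulting pattern (mirroring the check_ptr_to_btf_access() hunk below): wherever the verifier used to resolve a type ID against an implicit btf_vmlinux, it now resolves it against the BTF object carried next to the ID:

/* the (btf, btf_id) pair travels together through the verifier */
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
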
Also, although btf_vmlinux can't be destroyed and thus doesn't need refcounting, module BTFs need that, so apply BTF refcounting universally when a BPF program is using BTF-powered attachment (tp_btf, fentry/fexit, etc). This makes for simpler cleanup code. Now that a BTF type ID is not enough to uniquely identify a BTF type, extend the BPF trampoline key to include the BTF object ID. To differentiate that from a target program BPF ID, set the 31st bit of the type ID. BTF type IDs (at least currently) are not allowed to take the full 32 bits, so there is no danger of confusing that bit with a valid BTF type ID. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-10-andrii@kernel.org --- include/linux/bpf.h | 13 +++++--- include/linux/bpf_verifier.h | 28 ++++++++++++---- include/linux/btf.h | 5 ++- kernel/bpf/btf.c | 65 +++++++++++++++++++++++++------------ kernel/bpf/syscall.c | 24 ++++++++++++-- kernel/bpf/verifier.c | 77 +++++++++++++++++++++++++++----------------- net/ipv4/bpf_tcp_ca.c | 3 +- 7 files changed, 148 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9de5711b23f..d05e75ed8c1b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -421,7 +421,10 @@ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; union { int ctx_field_size; - u32 btf_id; + struct { + struct btf *btf; + u32 btf_id; + }; }; struct bpf_verifier_log *log; /* for verbose logs */ }; @@ -458,6 +461,7 @@ struct bpf_verifier_ops { struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, + const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); @@ -771,6 +775,7 @@ struct bpf_prog_aux { u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; + struct btf *attach_btf; const struct bpf_ctx_arg_aux *ctx_arg_info; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; @@ -1005,7 +1010,6 @@ struct bpf_event_entry { bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); -const char *kernel_type_name(u32 btf_type_id); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); @@ -1450,12 +1454,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); -int btf_struct_access(struct bpf_verifier_log *log, +int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); bool btf_struct_ids_match(struct bpf_verifier_log *log, - int off, u32 id, u32 need_type_id); + const struct btf *btf, u32 id, int off, + const struct btf *need_btf, u32 need_type_id); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 306869d4743b..e941fe1484e5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -5,6 +5,7 @@ #define _LINUX_BPF_VERIFIER_H 1 #include /* for enum bpf_reg_type */ +#include /* for struct btf and btf_id() */ #include /* for MAX_BPF_STACK */ #include @@ -43,6 +44,8 @@ enum bpf_reg_liveness { struct bpf_reg_state { /* Ordering of fields matters. 
See states_equal() */ enum bpf_reg_type type; + /* Fixed part of pointer offset, pointer types only */ + s32 off; union { /* valid when type == PTR_TO_PACKET */ int range; @@ -52,15 +55,20 @@ struct bpf_reg_state { */ struct bpf_map *map_ptr; - u32 btf_id; /* for PTR_TO_BTF_ID */ + /* for PTR_TO_BTF_ID */ + struct { + struct btf *btf; + u32 btf_id; + }; u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ /* Max size from any of the above. */ - unsigned long raw; + struct { + unsigned long raw1; + unsigned long raw2; + } raw; }; - /* Fixed part of pointer offset, pointer types only */ - s32 off; /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we @@ -311,7 +319,10 @@ struct bpf_insn_aux_data { struct { enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ union { - u32 btf_id; /* btf_id for struct typed var */ + struct { + struct btf *btf; + u32 btf_id; /* btf_id for struct typed var */ + }; u32 mem_size; /* mem_size for non-struct typed var */ }; } btf_var; @@ -459,9 +470,12 @@ int check_ctx_reg(struct bpf_verifier_env *env, /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, - u32 btf_id) + struct btf *btf, u32 btf_id) { - return tgt_prog ? (((u64)tgt_prog->aux->id) << 32 | btf_id) : btf_id; + if (tgt_prog) + return ((u64)tgt_prog->aux->id << 32) | btf_id; + else + return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } int bpf_check_attach_target(struct bpf_verifier_log *log, diff --git a/include/linux/btf.h b/include/linux/btf.h index 2bf641829664..fb608e4de076 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -18,6 +18,7 @@ struct btf_show; extern const struct file_operations btf_fops; +void btf_get(struct btf *btf); void btf_put(struct btf *btf); int btf_new_fd(const union bpf_attr *attr); struct btf *btf_get_by_fd(int fd); @@ -88,7 +89,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, char *buf, int len, u64 flags); int btf_get_fd_by_id(u32 id); -u32 btf_id(const struct btf *btf); +u32 btf_obj_id(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); @@ -206,6 +207,8 @@ static inline const struct btf_var_secinfo *btf_type_var_secinfo( } #ifdef CONFIG_BPF_SYSCALL +struct bpf_prog; + const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b2d508b33d4..7a19bf5bfe97 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1524,6 +1524,11 @@ static void btf_free_rcu(struct rcu_head *rcu) btf_free(btf); } +void btf_get(struct btf *btf) +{ + refcount_inc(&btf->refcnt); +} + void btf_put(struct btf *btf) { if (btf && refcount_dec_and_test(&btf->refcnt)) { @@ -4555,11 +4560,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; - if (tgt_prog) { + if (tgt_prog) return tgt_prog->aux->btf; - } else { - return btf_vmlinux; - } + else + return prog->aux->attach_btf; } static bool is_string_ptr(struct btf *btf, const struct btf_type *t) @@ -4700,6 +4704,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (ctx_arg_info->offset == 
off) { info->reg_type = ctx_arg_info->reg_type; + info->btf = btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; return true; } @@ -4716,6 +4721,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); if (ret > 0) { + info->btf = btf_vmlinux; info->btf_id = ret; return true; } else { @@ -4723,6 +4729,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } + info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); /* skip modifiers */ @@ -4749,7 +4756,7 @@ enum bpf_struct_walk_result { WALK_STRUCT, }; -static int btf_struct_walk(struct bpf_verifier_log *log, +static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, u32 *next_btf_id) { @@ -4760,7 +4767,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, u32 vlen, elem_id, mid; again: - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; @@ -4777,7 +4784,7 @@ again: goto error; member = btf_type_member(t) + vlen - 1; - mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, + mtype = btf_type_skip_modifiers(btf, member->type, NULL); if (!btf_type_is_array(mtype)) goto error; @@ -4793,7 +4800,7 @@ again: /* Only allow structure for now, can be relaxed for * other types later. */ - t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type, + t = btf_type_skip_modifiers(btf, array_elem->type, NULL); if (!btf_type_is_struct(t)) goto error; @@ -4851,10 +4858,10 @@ error: /* type of the field */ mid = member->type; - mtype = btf_type_by_id(btf_vmlinux, member->type); - mname = __btf_name_by_offset(btf_vmlinux, member->name_off); + mtype = btf_type_by_id(btf, member->type); + mname = __btf_name_by_offset(btf, member->name_off); - mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, + mtype = __btf_resolve_size(btf, mtype, &msize, &elem_type, &elem_id, &total_nelems, &mid); if (IS_ERR(mtype)) { @@ -4949,7 +4956,7 @@ error: mname, moff, tname, off, size); return -EACCES; } - stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); + stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; return WALK_PTR; @@ -4975,7 +4982,7 @@ error: return -EINVAL; } -int btf_struct_access(struct bpf_verifier_log *log, +int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id) @@ -4984,7 +4991,7 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 id; do { - err = btf_struct_walk(log, t, off, size, &id); + err = btf_struct_walk(log, btf, t, off, size, &id); switch (err) { case WALK_PTR: @@ -5000,7 +5007,7 @@ int btf_struct_access(struct bpf_verifier_log *log, * by diving in it. At this point the offset is * aligned with the new type, so set it to 0. */ - t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); off = 0; break; default: @@ -5016,21 +5023,37 @@ int btf_struct_access(struct bpf_verifier_log *log, return -EINVAL; } +/* Check that two BTF types, each specified as an BTF object + id, are exactly + * the same. Trivial ID check is not enough due to module BTFs, because we can + * end up with two different module BTFs, but IDs point to the common type in + * vmlinux BTF. 
+ */ +static bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) +{ + if (id1 != id2) + return false; + if (btf1 == btf2) + return true; + return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2); +} + bool btf_struct_ids_match(struct bpf_verifier_log *log, - int off, u32 id, u32 need_type_id) + const struct btf *btf, u32 id, int off, + const struct btf *need_btf, u32 need_type_id) { const struct btf_type *type; int err; /* Are we already done? */ - if (need_type_id == id && off == 0) + if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; again: - type = btf_type_by_id(btf_vmlinux, id); + type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, type, off, 1, &id); + err = btf_struct_walk(log, btf, type, off, 1, &id); if (err != WALK_STRUCT) return false; @@ -5039,7 +5062,7 @@ again: * continue the search with offset 0 in the new * type. */ - if (need_type_id != id) { + if (!btf_types_are_same(btf, id, need_btf, need_type_id)) { off = 0; goto again; } @@ -5710,7 +5733,7 @@ int btf_get_fd_by_id(u32 id) return fd; } -u32 btf_id(const struct btf *btf) +u32 btf_obj_id(const struct btf *btf) { return btf->id; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d16dd4945100..184204169949 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1691,6 +1691,8 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); bpf_prog_free_linfo(prog); + if (prog->aux->attach_btf) + btf_put(prog->aux->attach_btf); if (deferred) { if (prog->aux->sleepable) @@ -2113,6 +2115,20 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; + + if (attr->attach_btf_id && !attr->attach_prog_fd) { + struct btf *btf; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (!btf) + return -EINVAL; + + btf_get(btf); + prog->aux->attach_btf = btf; + } + if (attr->attach_prog_fd) { struct bpf_prog *dst_prog; @@ -2209,6 +2225,8 @@ free_prog_sec: free_uid(prog->aux->user); security_bpf_prog_free(prog->aux); free_prog: + if (prog->aux->attach_btf) + btf_put(prog->aux->attach_btf); bpf_prog_free(prog); return err; } @@ -2566,7 +2584,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } - key = bpf_trampoline_compute_key(tgt_prog, btf_id); + key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } link = kzalloc(sizeof(*link), GFP_USER); @@ -3543,7 +3561,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, } if (prog->aux->btf) - info.btf_id = btf_id(prog->aux->btf); + info.btf_id = btf_obj_id(prog->aux->btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; @@ -3646,7 +3664,7 @@ static int bpf_map_get_info_by_fd(struct file *file, memcpy(info.name, map->name, sizeof(map->name)); if (map->btf) { - info.btf_id = btf_id(map->btf); + info.btf_id = btf_obj_id(map->btf); info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e333ce43f281..2f3950839b85 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,7 +238,9 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int func_id; + struct btf *btf; u32 btf_id; + struct btf *ret_btf; u32 ret_btf_id; }; @@ -556,10 +558,9 @@ static struct 
bpf_func_state *func(struct bpf_verifier_env *env, return cur->frame[reg->frameno]; } -const char *kernel_type_name(u32 id) +static const char *kernel_type_name(const struct btf* btf, u32 id) { - return btf_name_by_offset(btf_vmlinux, - btf_type_by_id(btf_vmlinux, id)->name_off); + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } static void print_verifier_state(struct bpf_verifier_env *env, @@ -589,7 +590,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL || t == PTR_TO_PERCPU_BTF_ID) - verbose(env, "%s", kernel_type_name(reg->btf_id)); + verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); @@ -1383,7 +1384,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, static void mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, - enum bpf_reg_type reg_type, u32 btf_id) + enum bpf_reg_type reg_type, + struct btf *btf, u32 btf_id) { if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, regno); @@ -1391,6 +1393,7 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, } mark_reg_known_zero(env, regs, regno); regs[regno].type = PTR_TO_BTF_ID; + regs[regno].btf = btf; regs[regno].btf_id = btf_id; } @@ -2764,7 +2767,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type, - u32 *btf_id) + struct btf **btf, u32 *btf_id) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, @@ -2782,10 +2785,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) { + *btf = info.btf; *btf_id = info.btf_id; - else + } else { env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + } /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -3297,8 +3302,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, int value_regno) { struct bpf_reg_state *reg = regs + regno; - const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id); - const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off); + const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); + const char *tname = btf_name_by_offset(reg->btf, t->name_off); u32 btf_id; int ret; @@ -3319,23 +3324,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (env->ops->btf_struct_access) { - ret = env->ops->btf_struct_access(&env->log, t, off, size, - atype, &btf_id); + ret = env->ops->btf_struct_access(&env->log, reg->btf, t, + off, size, atype, &btf_id); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, - &btf_id); + ret = btf_struct_access(&env->log, reg->btf, t, off, size, + atype, &btf_id); } if (ret < 0) return ret; if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id); 
return 0; } @@ -3385,12 +3390,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id); if (ret < 0) return ret; if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id); return 0; } @@ -3466,6 +3471,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; + struct btf *btf = NULL; u32 btf_id = 0; if (t == BPF_WRITE && value_regno >= 0 && @@ -3478,7 +3484,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf_id); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, &btf_id); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -3500,8 +3506,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) + reg_type == PTR_TO_BTF_ID_OR_NULL) { + regs[value_regno].btf = btf; regs[value_regno].btf_id = btf_id; + } } regs[value_regno].type = reg_type; } @@ -4118,11 +4126,11 @@ found: arg_btf_id = compatible->btf_id; } - if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, - *arg_btf_id)) { + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id)) { verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf_id), - kernel_type_name(*arg_btf_id)); + regno, kernel_type_name(reg->btf, reg->btf_id), + kernel_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } @@ -4244,6 +4252,7 @@ skip_type_check: verbose(env, "Helper has invalid btf_id in R%d\n", regno); return -EACCES; } + meta->ret_btf = reg->btf; meta->ret_btf_id = reg->btf_id; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { @@ -5190,16 +5199,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); - t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL); if (!btf_type_is_struct(t)) { u32 tsize; const struct btf_type *ret; const char *tname; /* resolve the type size of ksym. */ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(meta.ret_btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(meta.ret_btf, t->name_off); verbose(env, "unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); return -EINVAL; @@ -5212,6 +5221,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? 
PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || @@ -5228,6 +5238,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } + /* current BPF helper definitions are only coming from + * built-in code with type IDs from vmlinux BTF + */ + regs[BPF_REG_0].btf = btf_vmlinux; regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %d of func %s#%d\n", @@ -5627,7 +5641,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - dst_reg->raw = 0; + memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; case BPF_SUB: @@ -5692,7 +5706,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) - dst_reg->raw = 0; + memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; case BPF_AND: @@ -7744,6 +7758,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) break; case PTR_TO_BTF_ID: case PTR_TO_PERCPU_BTF_ID: + dst_reg->btf = aux->btf_var.btf; dst_reg->btf_id = aux->btf_var.btf_id; break; default: @@ -9739,6 +9754,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf = btf_vmlinux; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@ -9757,6 +9773,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf = btf_vmlinux; aux->btf_var.btf_id = type; } return 0; @@ -11609,7 +11626,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Tracing programs must provide btf_id\n"); return -EINVAL; } - btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux; + btf = tgt_prog ? 
tgt_prog->aux->btf : prog->aux->attach_btf; if (!btf) { bpf_log(log, "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); @@ -11885,7 +11902,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; } - key = bpf_trampoline_compute_key(tgt_prog, btf_id); + key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) return -ENOMEM; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 618954f82764..d520e61649c8 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -95,6 +95,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, } static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id) @@ -102,7 +103,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, size_t end; if (atype == BPF_READ) - return btf_struct_access(log, t, off, size, atype, next_btf_id); + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); -- cgit From 290248a5b7d829871b3ea3c62578613a580a1744 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:30 -0800 Subject: bpf: Allow to specify kernel module BTFs when attaching BPF programs Add ability for user-space programs to specify non-vmlinux BTF when attaching BTF-powered BPF programs: raw_tp, fentry/fexit/fmod_ret, LSM, etc. For this, attach_prog_fd (now with the alias name attach_btf_obj_fd) should specify FD of a module or vmlinux BTF object. For backwards compatibility reasons, 0 denotes vmlinux BTF. Only kernel BTF (vmlinux or module) can be specified. 
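A hedged user-space sketch of the new interface (variable names are illustrative; error handling and the rest of the load attributes are elided):

/* module_btf_fd is assumed to come from BPF_BTF_GET_FD_BY_ID on the
 * module's BTF object; target_type_id is the attach target's BTF type
 * ID inside that module BTF.
 */
union bpf_attr attr = {};

attr.prog_type = BPF_PROG_TYPE_TRACING;
attr.expected_attach_type = BPF_TRACE_FENTRY;
attr.attach_btf_id = target_type_id;
attr.attach_btf_obj_fd = module_btf_fd;	/* 0 would mean vmlinux BTF */
/* insns, insn_cnt, license, etc. are filled in as usual before
 * invoking the BPF_PROG_LOAD command.
 */
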
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-11-andrii@kernel.org --- include/linux/btf.h | 1 + include/uapi/linux/bpf.h | 7 +++- kernel/bpf/btf.c | 5 +++ kernel/bpf/syscall.c | 82 ++++++++++++++++++++++++++---------------- tools/include/uapi/linux/bpf.h | 7 +++- 5 files changed, 69 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index fb608e4de076..4c200f5d242b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -90,6 +90,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, int btf_get_fd_by_id(u32 id); u32 btf_obj_id(const struct btf *btf); +bool btf_is_kernel(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c3458ec1f30a..1233f14f659f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -557,7 +557,12 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ - __u32 attach_prog_fd; /* 0 to attach to vmlinux */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7a19bf5bfe97..8d6bdb4f4d61 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5738,6 +5738,11 @@ u32 btf_obj_id(const struct btf *btf) return btf->id; } +bool btf_is_kernel(const struct btf *btf) +{ + return btf->kernel_btf; +} + static int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 184204169949..0cd3cc2af9c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1926,12 +1926,16 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, - u32 btf_id, u32 prog_fd) + struct btf *attach_btf, u32 btf_id, + struct bpf_prog *dst_prog) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; + if (!attach_btf && !dst_prog) + return -EINVAL; + switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: @@ -1943,7 +1947,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } - if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING && + if (attach_btf && (!btf_id || dst_prog)) + return -EINVAL; + + if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && prog_type != BPF_PROG_TYPE_EXT) return -EINVAL; @@ -2060,7 +2067,8 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { enum bpf_prog_type type = attr->prog_type; - struct bpf_prog *prog; + struct bpf_prog *prog, *dst_prog = NULL; + struct btf *attach_btf = NULL; int err; char license[128]; bool is_gpl; @@ -2102,44 +2110,56 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; + /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog + * or btf, we need to check which one it is + */ + if 
(attr->attach_prog_fd) { + dst_prog = bpf_prog_get(attr->attach_prog_fd); + if (IS_ERR(dst_prog)) { + dst_prog = NULL; + attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); + if (IS_ERR(attach_btf)) + return -EINVAL; + if (!btf_is_kernel(attach_btf)) { + btf_put(attach_btf); + return -EINVAL; + } + } + } else if (attr->attach_btf_id) { + /* fall back to vmlinux BTF, if BTF type ID is specified */ + attach_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(attach_btf)) + return PTR_ERR(attach_btf); + if (!attach_btf) + return -EINVAL; + btf_get(attach_btf); + } + bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, - attr->attach_btf_id, - attr->attach_prog_fd)) + attach_btf, attr->attach_btf_id, + dst_prog)) { + if (dst_prog) + bpf_prog_put(dst_prog); + if (attach_btf) + btf_put(attach_btf); return -EINVAL; + } /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); - if (!prog) + if (!prog) { + if (dst_prog) + bpf_prog_put(dst_prog); + if (attach_btf) + btf_put(attach_btf); return -ENOMEM; + } prog->expected_attach_type = attr->expected_attach_type; + prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; - - if (attr->attach_btf_id && !attr->attach_prog_fd) { - struct btf *btf; - - btf = bpf_get_btf_vmlinux(); - if (IS_ERR(btf)) - return PTR_ERR(btf); - if (!btf) - return -EINVAL; - - btf_get(btf); - prog->aux->attach_btf = btf; - } - - if (attr->attach_prog_fd) { - struct bpf_prog *dst_prog; - - dst_prog = bpf_prog_get(attr->attach_prog_fd); - if (IS_ERR(dst_prog)) { - err = PTR_ERR(dst_prog); - goto free_prog; - } - prog->aux->dst_prog = dst_prog; - } - + prog->aux->dst_prog = dst_prog; prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c3458ec1f30a..1233f14f659f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -557,7 +557,12 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ - __u32 attach_prog_fd; /* 0 to attach to vmlinux */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit From 6df3e14436f6ee254b1a4952d90ee8988be59c89 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:02 +0000 Subject: psci: Add accessor for psci_0_1_function_ids Make it possible to retrieve a copy of the psci_0_1_function_ids struct. This is useful for KVM if it is configured to intercept host's PSCI SMCs. 
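A minimal consumer sketch (the KVM-side caller is not part of this patch; the names below are illustrative):

/* snapshot the PSCI 0.1 function IDs so intercepted SMCs can be
 * recognized later
 */
static struct psci_0_1_function_ids kvm_host_psci_0_1_function_ids;

static void kvm_init_psci_relay(void)
{
	kvm_host_psci_0_1_function_ids = get_psci_0_1_function_ids();
}
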
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20201202184122.26046-7-dbrazdil@google.com --- drivers/firmware/psci/psci.c | 12 +++++------- include/linux/psci.h | 9 +++++++++ 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 593fdd0e09a2..f5fc429cae3f 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -58,15 +58,13 @@ typedef unsigned long (psci_fn)(unsigned long, unsigned long, unsigned long, unsigned long); static psci_fn *invoke_psci_fn; -struct psci_0_1_function_ids { - u32 cpu_suspend; - u32 cpu_on; - u32 cpu_off; - u32 migrate; -}; - static struct psci_0_1_function_ids psci_0_1_function_ids; +struct psci_0_1_function_ids get_psci_0_1_function_ids(void) +{ + return psci_0_1_function_ids; +} + #define PSCI_0_2_POWER_STATE_MASK \ (PSCI_0_2_POWER_STATE_ID_MASK | \ PSCI_0_2_POWER_STATE_TYPE_MASK | \ diff --git a/include/linux/psci.h b/include/linux/psci.h index 2a1bfb890e58..4ca0060a3fc4 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -34,6 +34,15 @@ struct psci_operations { extern struct psci_operations psci_ops; +struct psci_0_1_function_ids { + u32 cpu_suspend; + u32 cpu_on; + u32 cpu_off; + u32 migrate; +}; + +struct psci_0_1_function_ids get_psci_0_1_function_ids(void); + #if defined(CONFIG_ARM_PSCI_FW) int __init psci_dt_init(void); #else -- cgit From 7de3697e9cbd4bd3d62bafa249d57990e1b8f294 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Wed, 2 Dec 2020 16:54:24 -0800 Subject: Add auxiliary bus support Add support for the Auxiliary Bus, auxiliary_device and auxiliary_driver. It enables drivers to create an auxiliary_device and bind an auxiliary_driver to it. The bus supports probe/remove, shutdown, and suspend/resume callbacks. Each auxiliary_device has a unique string-based id; a driver binds to an auxiliary_device based on this id through the bus. Co-developed-by: Kiran Patil Co-developed-by: Ranjani Sridharan Co-developed-by: Fred Oh Co-developed-by: Leon Romanovsky Signed-off-by: Kiran Patil Signed-off-by: Ranjani Sridharan Signed-off-by: Fred Oh Signed-off-by: Leon Romanovsky Signed-off-by: Dave Ertman Reviewed-by: Pierre-Louis Bossart Reviewed-by: Shiraz Saleem Reviewed-by: Parav Pandit Reviewed-by: Dan Williams Reviewed-by: Martin Habets Link: https://lore.kernel.org/r/20201113161859.1775473-2-david.m.ertman@intel.com Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/160695681289.505290.8978295443574440604.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/auxiliary_bus.rst | 234 +++++++++++++++++++++++++ Documentation/driver-api/index.rst | 1 + drivers/base/Kconfig | 3 + drivers/base/Makefile | 1 + drivers/base/auxiliary.c | 268 +++++++++++++++++++++++++++++ include/linux/auxiliary_bus.h | 78 +++++++++ include/linux/mod_devicetable.h | 8 + scripts/mod/devicetable-offsets.c | 3 + scripts/mod/file2alias.c | 8 + 9 files changed, 604 insertions(+) create mode 100644 Documentation/driver-api/auxiliary_bus.rst create mode 100644 drivers/base/auxiliary.c create mode 100644 include/linux/auxiliary_bus.h (limited to 'include/linux') diff --git a/Documentation/driver-api/auxiliary_bus.rst b/Documentation/driver-api/auxiliary_bus.rst new file mode 100644 index 000000000000..5dd7804631ef --- /dev/null +++ b/Documentation/driver-api/auxiliary_bus.rst @@ -0,0 +1,234 @@ +.. 
SPDX-License-Identifier: GPL-2.0-only + +============= +Auxiliary Bus +============= + +In some subsystems, the functionality of the core device (PCI/ACPI/other) is +too complex for a single device to be managed by a monolithic driver +(e.g. Sound Open Firmware), multiple devices might implement a common +intersection of functionality (e.g. NICs + RDMA), or a driver may want to +export an interface for another subsystem to drive (e.g. SIOV Physical Function +export Virtual Function management). A split of the functionality into child- +devices representing sub-domains of functionality makes it possible to +compartmentalize, layer, and distribute domain-specific concerns via a Linux +device-driver model. + +An example of this kind of requirement is the audio subsystem where a single +IP is handling multiple entities such as HDMI, Soundwire, local devices such as +mics/speakers etc. The split for the core's functionality can be arbitrary or +be defined by the DSP firmware topology and include hooks for test/debug. This +allows for the audio core device to be minimal and focused on hardware-specific +control and communication. + +Each auxiliary_device represents a part of its parent functionality. The +generic behavior can be extended and specialized as needed by encapsulating an +auxiliary_device within other domain-specific structures and the use of .ops +callbacks. Devices on the auxiliary bus do not share any structures and the use +of a communication channel with the parent is domain-specific. + +Note that ops are intended as a way to augment instance behavior within a class +of auxiliary devices; it is not the mechanism for exporting common +infrastructure from the parent. Consider EXPORT_SYMBOL_NS() to convey +infrastructure from the parent module to the auxiliary module(s). + + +When Should the Auxiliary Bus Be Used +===================================== + +The auxiliary bus is to be used when a driver and one or more kernel modules, +which share a common header file with the driver, need a mechanism to connect and +provide access to a shared object allocated by the auxiliary_device's +registering driver. The registering driver for the auxiliary_device(s) and the +kernel module(s) registering auxiliary_drivers can be from the same subsystem, +or from multiple subsystems. + +The emphasis here is on a common generic interface that keeps subsystem +customization out of the bus infrastructure. + +One example is a PCI network device that is RDMA-capable and exports a child +device to be driven by an auxiliary_driver in the RDMA subsystem. The PCI +driver allocates and registers an auxiliary_device for each physical +function on the NIC. The RDMA driver registers an auxiliary_driver that claims +each of these auxiliary_devices. This conveys data/ops published by the parent +PCI device/driver to the RDMA auxiliary_driver. + +Another use case is for the PCI device to be split out into multiple sub +functions. For each sub function an auxiliary_device is created. A PCI sub +function driver binds to such devices and creates one or more class +devices of its own. A PCI sub function auxiliary device is likely to be contained in a +struct with additional attributes such as user defined sub function number and +optional attributes such as resources and a link to the parent device. These +attributes could be used by systemd/udev, and hence should be initialized +before a driver binds to an auxiliary_device. 
+
+A key requirement for utilizing the auxiliary bus is that there is no
+dependency on a physical bus, device, register accesses, or regmap support.
+These individual devices split from the core cannot live on the platform bus as
+they are not physical devices that are controlled by DT/ACPI. The same
+argument applies to not using MFD in this scenario, as MFD relies on individual
+function devices being physical devices.
+
+Auxiliary Device
+================
+
+An auxiliary_device represents a part of its parent device's functionality. It
+is given a name that, combined with the registering driver's KBUILD_MODNAME,
+creates a match_name that is used for driver binding, and an id that, combined
+with the match_name, provides a unique name to register with the bus subsystem.
+
+Registering an auxiliary_device is a two-step process. First call
+auxiliary_device_init(), which checks several aspects of the auxiliary_device
+struct and performs a device_initialize(). After this step completes, any
+error state must have a call to auxiliary_device_uninit() in its resolution path.
+The second step in registering an auxiliary_device is to perform a call to
+auxiliary_device_add(), which sets the name of the device and adds the device to
+the bus.
+
+Unregistering an auxiliary_device is also a two-step process to mirror the
+register process. First call auxiliary_device_delete(), then call
+auxiliary_device_uninit().
+
+.. code-block:: c
+
+    struct auxiliary_device {
+        struct device dev;
+        const char *name;
+        u32 id;
+    };
+
+If two auxiliary_devices, both with the match_name "mod.foo", are registered onto
+the bus, they must have unique id values (e.g. "x" and "y") so that the
+registered devices' names are "mod.foo.x" and "mod.foo.y". If match_name + id
+are not unique, then the device_add fails and generates an error message.
+
+The auxiliary_device.dev.type.release or auxiliary_device.dev.release must be
+populated with a non-NULL pointer to successfully register the auxiliary_device.
+
+The auxiliary_device.dev.parent must also be populated.
+
+Auxiliary Device Memory Model and Lifespan
+------------------------------------------
+
+The registering driver is the entity that allocates memory for the
+auxiliary_device and registers it on the auxiliary bus. It is important to note
+that, as opposed to the platform bus, the registering driver is wholly
+responsible for the management of the memory used for the driver object.
+
+A parent object, defined in the shared header file, contains the
+auxiliary_device. It also contains a pointer to the shared object(s), which
+are also defined in the shared header. Both the parent object and the shared
+object(s) are allocated by the registering driver. This layout allows the
+auxiliary_driver's registering module to perform a container_of() call to go
+from the pointer to the auxiliary_device, which is passed during the call to the
+auxiliary_driver's probe function, up to the parent object, and then have
+access to the shared object(s).
+
+The memory for the auxiliary_device is freed only in its release() callback
+flow as defined by its registering driver.
+
+The memory for the shared object(s) must have a lifespan equal to, or greater
+than, the lifespan of the memory for the auxiliary_device. The auxiliary_driver
+should only consider that this shared object is valid as long as the
+auxiliary_device is still registered on the auxiliary bus. It is up to the
+registering driver to manage (e.g.
free or keep available) the memory for the
+shared object beyond the life of the auxiliary_device.
+
+The registering driver must unregister all auxiliary devices before its own
+driver.remove() is completed.
+
+Auxiliary Drivers
+=================
+
+Auxiliary drivers follow the standard driver model convention, where
+discovery/enumeration is handled by the core, and drivers
+provide probe() and remove() methods. They support power management
+and shutdown notifications using the standard conventions.
+
+.. code-block:: c
+
+    struct auxiliary_driver {
+        int (*probe)(struct auxiliary_device *,
+                     const struct auxiliary_device_id *id);
+        int (*remove)(struct auxiliary_device *);
+        void (*shutdown)(struct auxiliary_device *);
+        int (*suspend)(struct auxiliary_device *, pm_message_t);
+        int (*resume)(struct auxiliary_device *);
+        struct device_driver driver;
+        const struct auxiliary_device_id *id_table;
+    };
+
+Auxiliary drivers register themselves with the bus by calling
+auxiliary_driver_register(). The id_table contains the match_names of auxiliary
+devices that a driver can bind with.
+
+Example Usage
+=============
+
+Auxiliary devices are created and registered by a subsystem-level core device
+that needs to break up its functionality into smaller fragments. One way to
+extend the scope of an auxiliary_device is to encapsulate it within a domain-
+specific structure defined by the parent device. This structure contains the
+auxiliary_device and any associated shared data/callbacks needed to establish
+the connection with the parent.
+
+An example is:
+
+.. code-block:: c
+
+    struct foo {
+        struct auxiliary_device auxdev;
+        void (*connect)(struct auxiliary_device *auxdev);
+        void (*disconnect)(struct auxiliary_device *auxdev);
+        void *data;
+    };
+
+The parent device then registers the auxiliary_device by calling
+auxiliary_device_init(), and then auxiliary_device_add(), with the pointer to
+the auxdev member of the above structure. The parent provides a name for the
+auxiliary_device that, combined with the parent's KBUILD_MODNAME, creates a
+match_name that is used for matching and binding with a driver.
+
+Whenever an auxiliary_driver is registered, based on the match_name, the
+auxiliary_driver's probe() is invoked for the matching devices. The
+auxiliary_driver can also be encapsulated inside custom drivers that make the
+core device's functionality extensible by adding additional domain-specific ops
+as follows:
+
+.. code-block:: c
+
+    struct my_ops {
+        void (*send)(struct auxiliary_device *auxdev);
+        void (*receive)(struct auxiliary_device *auxdev);
+    };
+
+
+    struct my_driver {
+        struct auxiliary_driver auxiliary_drv;
+        const struct my_ops ops;
+    };
+
+An example of this type of usage is:
+
+.. code-block:: c
+
+    const struct auxiliary_device_id my_auxiliary_id_table[] = {
+        { .name = "foo_mod.foo_dev" },
+        { },
+    };
+
+    const struct my_ops my_custom_ops = {
+        .send = my_tx,
+        .receive = my_rx,
+    };
+
+    const struct my_driver my_drv = {
+        .auxiliary_drv = {
+            .name = "myauxiliarydrv",
+            .id_table = my_auxiliary_id_table,
+            .probe = my_probe,
+            .remove = my_remove,
+            .shutdown = my_shutdown,
+        },
+        .ops = my_custom_ops,
+    };
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index f357f3eb400c..86759a74b7f1 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -72,6 +72,7 @@ available subsections can be seen below.
thermal/index fpga/index acpi/index + auxiliary_bus backlight/lp855x-driver.rst connector console diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 8d7001712062..040be48ce046 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -1,6 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 menu "Generic Driver Options" +config AUXILIARY_BUS + bool + config UEVENT_HELPER bool "Support for uevent helper" help diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 41369fc7004f..5e7bf9669a81 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -7,6 +7,7 @@ obj-y := component.o core.o bus.o dd.o syscore.o \ attribute_container.o transport_class.o \ topology.o container.o property.o cacheinfo.o \ swnode.o +obj-$(CONFIG_AUXILIARY_BUS) += auxiliary.o obj-$(CONFIG_DEVTMPFS) += devtmpfs.o obj-y += power/ obj-$(CONFIG_ISA_BUS_API) += isa.o diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c new file mode 100644 index 000000000000..ef2af417438b --- /dev/null +++ b/drivers/base/auxiliary.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2019-2020 Intel Corporation + * + * Please see Documentation/driver-api/auxiliary_bus.rst for more information. + */ + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include + +static const struct auxiliary_device_id *auxiliary_match_id(const struct auxiliary_device_id *id, + const struct auxiliary_device *auxdev) +{ + for (; id->name[0]; id++) { + const char *p = strrchr(dev_name(&auxdev->dev), '.'); + int match_size; + + if (!p) + continue; + match_size = p - dev_name(&auxdev->dev); + + /* use dev_name(&auxdev->dev) prefix before last '.' char to match to */ + if (strlen(id->name) == match_size && + !strncmp(dev_name(&auxdev->dev), id->name, match_size)) + return id; + } + return NULL; +} + +static int auxiliary_match(struct device *dev, struct device_driver *drv) +{ + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + struct auxiliary_driver *auxdrv = to_auxiliary_drv(drv); + + return !!auxiliary_match_id(auxdrv->id_table, auxdev); +} + +static int auxiliary_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + const char *name, *p; + + name = dev_name(dev); + p = strrchr(name, '.'); + + return add_uevent_var(env, "MODALIAS=%s%.*s", AUXILIARY_MODULE_PREFIX, (int)(p - name), + name); +} + +static const struct dev_pm_ops auxiliary_dev_pm_ops = { + SET_RUNTIME_PM_OPS(pm_generic_runtime_suspend, pm_generic_runtime_resume, NULL) + SET_SYSTEM_SLEEP_PM_OPS(pm_generic_suspend, pm_generic_resume) +}; + +static int auxiliary_bus_probe(struct device *dev) +{ + struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + int ret; + + ret = dev_pm_domain_attach(dev, true); + if (ret) { + dev_warn(dev, "Failed to attach to PM Domain : %d\n", ret); + return ret; + } + + ret = auxdrv->probe(auxdev, auxiliary_match_id(auxdrv->id_table, auxdev)); + if (ret) + dev_pm_domain_detach(dev, true); + + return ret; +} + +static int auxiliary_bus_remove(struct device *dev) +{ + struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + int ret = 0; + + if (auxdrv->remove) + ret = auxdrv->remove(auxdev); + dev_pm_domain_detach(dev, true); + + return ret; +} + +static void auxiliary_bus_shutdown(struct device *dev) +{ + struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); + struct 
auxiliary_device *auxdev = to_auxiliary_dev(dev); + + if (auxdrv->shutdown) + auxdrv->shutdown(auxdev); +} + +static struct bus_type auxiliary_bus_type = { + .name = "auxiliary", + .probe = auxiliary_bus_probe, + .remove = auxiliary_bus_remove, + .shutdown = auxiliary_bus_shutdown, + .match = auxiliary_match, + .uevent = auxiliary_uevent, + .pm = &auxiliary_dev_pm_ops, +}; + +/** + * auxiliary_device_init - check auxiliary_device and initialize + * @auxdev: auxiliary device struct + * + * This is the first step in the two-step process to register an auxiliary_device. + * + * When this function returns an error code, then the device_initialize will *not* have + * been performed, and the caller will be responsible to free any memory allocated for the + * auxiliary_device in the error path directly. + * + * It returns 0 on success. On success, the device_initialize has been performed. After this + * point any error unwinding will need to include a call to auxiliary_device_uninit(). + * In this post-initialize error scenario, a call to the device's .release callback will be + * triggered, and all memory clean-up is expected to be handled there. + */ +int auxiliary_device_init(struct auxiliary_device *auxdev) +{ + struct device *dev = &auxdev->dev; + + if (!dev->parent) { + pr_err("auxiliary_device has a NULL dev->parent\n"); + return -EINVAL; + } + + if (!auxdev->name) { + pr_err("auxiliary_device has a NULL name\n"); + return -EINVAL; + } + + dev->bus = &auxiliary_bus_type; + device_initialize(&auxdev->dev); + return 0; +} +EXPORT_SYMBOL_GPL(auxiliary_device_init); + +/** + * __auxiliary_device_add - add an auxiliary bus device + * @auxdev: auxiliary bus device to add to the bus + * @modname: name of the parent device's driver module + * + * This is the second step in the two-step process to register an auxiliary_device. + * + * This function must be called after a successful call to auxiliary_device_init(), which + * will perform the device_initialize. This means that if this returns an error code, then a + * call to auxiliary_device_uninit() must be performed so that the .release callback will + * be triggered to free the memory associated with the auxiliary_device. + * + * The expectation is that users will call the "auxiliary_device_add" macro so that the caller's + * KBUILD_MODNAME is automatically inserted for the modname parameter. Only if a user requires + * a custom name would this version be called directly. + */ +int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname) +{ + struct device *dev = &auxdev->dev; + int ret; + + if (!modname) { + pr_err("auxiliary device modname is NULL\n"); + return -EINVAL; + } + + ret = dev_set_name(dev, "%s.%s.%d", modname, auxdev->name, auxdev->id); + if (ret) { + pr_err("auxiliary device dev_set_name failed: %d\n", ret); + return ret; + } + + ret = device_add(dev); + if (ret) + dev_err(dev, "adding auxiliary device failed!: %d\n", ret); + + return ret; +} +EXPORT_SYMBOL_GPL(__auxiliary_device_add); + +/** + * auxiliary_find_device - auxiliary device iterator for locating a particular device. + * @start: Device to begin with + * @data: Data to pass to match function + * @match: Callback function to check device + * + * This function returns a reference to a device that is 'found' + * for later use, as determined by the @match callback. + * + * The callback should return 0 if the device doesn't match and non-zero + * if it does. 
If the callback returns non-zero, this function will + * return to the caller and not iterate over any more devices. + */ +struct auxiliary_device * +auxiliary_find_device(struct device *start, const void *data, + int (*match)(struct device *dev, const void *data)) +{ + struct device *dev; + + dev = bus_find_device(&auxiliary_bus_type, start, data, match); + if (!dev) + return NULL; + + return to_auxiliary_dev(dev); +} +EXPORT_SYMBOL_GPL(auxiliary_find_device); + +/** + * __auxiliary_driver_register - register a driver for auxiliary bus devices + * @auxdrv: auxiliary_driver structure + * @owner: owning module/driver + * @modname: KBUILD_MODNAME for parent driver + */ +int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, struct module *owner, + const char *modname) +{ + if (WARN_ON(!auxdrv->probe) || WARN_ON(!auxdrv->id_table)) + return -EINVAL; + + if (auxdrv->name) + auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s.%s", modname, auxdrv->name); + else + auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s", modname); + if (!auxdrv->driver.name) + return -ENOMEM; + + auxdrv->driver.owner = owner; + auxdrv->driver.bus = &auxiliary_bus_type; + auxdrv->driver.mod_name = modname; + + return driver_register(&auxdrv->driver); +} +EXPORT_SYMBOL_GPL(__auxiliary_driver_register); + +/** + * auxiliary_driver_unregister - unregister a driver + * @auxdrv: auxiliary_driver structure + */ +void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv) +{ + driver_unregister(&auxdrv->driver); + kfree(auxdrv->driver.name); +} +EXPORT_SYMBOL_GPL(auxiliary_driver_unregister); + +static int __init auxiliary_bus_init(void) +{ + return bus_register(&auxiliary_bus_type); +} + +static void __exit auxiliary_bus_exit(void) +{ + bus_unregister(&auxiliary_bus_type); +} + +module_init(auxiliary_bus_init); +module_exit(auxiliary_bus_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Auxiliary Bus"); +MODULE_AUTHOR("David Ertman "); +MODULE_AUTHOR("Kiran Patil "); diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h new file mode 100644 index 000000000000..282fbf7bf9af --- /dev/null +++ b/include/linux/auxiliary_bus.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2019-2020 Intel Corporation + * + * Please see Documentation/driver-api/auxiliary_bus.rst for more information. 
+ */ + +#ifndef _AUXILIARY_BUS_H_ +#define _AUXILIARY_BUS_H_ + +#include +#include +#include + +struct auxiliary_device { + struct device dev; + const char *name; + u32 id; +}; + +struct auxiliary_driver { + int (*probe)(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id); + int (*remove)(struct auxiliary_device *auxdev); + void (*shutdown)(struct auxiliary_device *auxdev); + int (*suspend)(struct auxiliary_device *auxdev, pm_message_t state); + int (*resume)(struct auxiliary_device *auxdev); + const char *name; + struct device_driver driver; + const struct auxiliary_device_id *id_table; +}; + +static inline struct auxiliary_device *to_auxiliary_dev(struct device *dev) +{ + return container_of(dev, struct auxiliary_device, dev); +} + +static inline struct auxiliary_driver *to_auxiliary_drv(struct device_driver *drv) +{ + return container_of(drv, struct auxiliary_driver, driver); +} + +int auxiliary_device_init(struct auxiliary_device *auxdev); +int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname); +#define auxiliary_device_add(auxdev) __auxiliary_device_add(auxdev, KBUILD_MODNAME) + +static inline void auxiliary_device_uninit(struct auxiliary_device *auxdev) +{ + put_device(&auxdev->dev); +} + +static inline void auxiliary_device_delete(struct auxiliary_device *auxdev) +{ + device_del(&auxdev->dev); +} + +int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, struct module *owner, + const char *modname); +#define auxiliary_driver_register(auxdrv) \ + __auxiliary_driver_register(auxdrv, THIS_MODULE, KBUILD_MODNAME) + +void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv); + +/** + * module_auxiliary_driver() - Helper macro for registering an auxiliary driver + * @__auxiliary_driver: auxiliary driver struct + * + * Helper macro for auxiliary drivers which do not do anything special in + * module init/exit. This eliminates a lot of boilerplate. 
Each module may only + * use this macro once, and calling it replaces module_init() and module_exit() + */ +#define module_auxiliary_driver(__auxiliary_driver) \ + module_driver(__auxiliary_driver, auxiliary_driver_register, auxiliary_driver_unregister) + +struct auxiliary_device * +auxiliary_find_device(struct device *start, const void *data, + int (*match)(struct device *dev, const void *data)); + +#endif /* _AUXILIARY_BUS_H_ */ diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 5b08a473cdba..c425290b21e2 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -838,4 +838,12 @@ struct mhi_device_id { kernel_ulong_t driver_data; }; +#define AUXILIARY_NAME_SIZE 32 +#define AUXILIARY_MODULE_PREFIX "auxiliary:" + +struct auxiliary_device_id { + char name[AUXILIARY_NAME_SIZE]; + kernel_ulong_t driver_data; +}; + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index 27007c18e754..e377f52dbfa3 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -243,5 +243,8 @@ int main(void) DEVID(mhi_device_id); DEVID_FIELD(mhi_device_id, chan); + DEVID(auxiliary_device_id); + DEVID_FIELD(auxiliary_device_id, name); + return 0; } diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 2417dd1dee33..fb4827027536 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1364,6 +1364,13 @@ static int do_mhi_entry(const char *filename, void *symval, char *alias) { DEF_FIELD_ADDR(symval, mhi_device_id, chan); sprintf(alias, MHI_DEVICE_MODALIAS_FMT, *chan); + return 1; +} + +static int do_auxiliary_entry(const char *filename, void *symval, char *alias) +{ + DEF_FIELD_ADDR(symval, auxiliary_device_id, name); + sprintf(alias, AUXILIARY_MODULE_PREFIX "%s", *name); return 1; } @@ -1442,6 +1449,7 @@ static const struct devtable devtable[] = { {"tee", SIZE_tee_client_device_id, do_tee_entry}, {"wmi", SIZE_wmi_device_id, do_wmi_entry}, {"mhi", SIZE_mhi_device_id, do_mhi_entry}, + {"auxiliary", SIZE_auxiliary_device_id, do_auxiliary_entry}, }; /* Create MODULE_ALIAS() statements. -- cgit From 30ae3e13caeaa47884c222ebf5711ce27ed25f19 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 25 Nov 2020 22:29:59 +0100 Subject: mmc: tmio: set max_busy_timeout Set max_busy_timeouts for variants known to support the TOPxx bits in the SD_OPTION register. The timeout mechanism was running in the background but not yet properly handled in the driver. So, let the MMC core know when to not use R1B to avoid unhandled timeouts. My datasheets for older variants (tmio_mmc.c) suggest that they support it, too. However, actual bit descriptions are lacking, so I chose an opt-in approach. 
Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20201125213001.15003-2-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 3 +++ drivers/mmc/host/tmio_mmc.h | 2 ++ drivers/mmc/host/tmio_mmc_core.c | 15 +++++++++++++++ drivers/mmc/host/uniphier-sd.c | 1 + include/linux/mfd/tmio.h | 7 ++++++- 5 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index bb937411c2ec..153767054c05 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -1041,6 +1041,9 @@ int renesas_sdhi_probe(struct platform_device *pdev, /* All SDHI have SDIO status bits which must be 1 */ mmc_data->flags |= TMIO_MMC_SDIO_STATUS_SETBITS; + /* All SDHI support HW busy detection */ + mmc_data->flags |= TMIO_MMC_USE_BUSY_TIMEOUT; + dev_pm_domain_start(&pdev->dev); ret = renesas_sdhi_clk_enable(host); diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 7ff41185896a..819198af17f4 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -80,6 +80,8 @@ #define CLK_CTL_SCLKEN BIT(8) /* Definitions for values the CTL_SD_MEM_CARD_OPT register can take */ +#define CARD_OPT_TOP_MASK 0xf0 +#define CARD_OPT_TOP_SHIFT 4 #define CARD_OPT_WIDTH8 BIT(13) #define CARD_OPT_WIDTH BIT(15) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index a89547f5d733..7d2da5befe6b 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -887,6 +887,18 @@ static void tmio_mmc_set_bus_width(struct tmio_mmc_host *host, sd_ctrl_write16(host, CTL_SD_MEM_CARD_OPT, reg); } +static void tmio_mmc_max_busy_timeout(struct tmio_mmc_host *host) +{ + u16 val = sd_ctrl_read16(host, CTL_SD_MEM_CARD_OPT); + unsigned int clk_rate = host->mmc->actual_clock ?: host->mmc->f_max; + unsigned int cycles; + + val = (val & CARD_OPT_TOP_MASK) >> CARD_OPT_TOP_SHIFT; + cycles = 1 << (13 + val); + + host->mmc->max_busy_timeout = cycles / (clk_rate / MSEC_PER_SEC); +} + /* Set MMC clock / power. * Note: This controller uses a simple divider scheme therefore it cannot * run a MMC card at full speed (20MHz). The max clock is 24MHz on SD, but as @@ -945,6 +957,9 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) break; } + if (host->pdata->flags & TMIO_MMC_USE_BUSY_TIMEOUT) + tmio_mmc_max_busy_timeout(host); + /* Let things settle. delay taken from winCE driver */ usleep_range(140, 200); if (PTR_ERR(host->mrq) == -EINTR) diff --git a/drivers/mmc/host/uniphier-sd.c b/drivers/mmc/host/uniphier-sd.c index 3092466a99ab..a6cd16771d4e 100644 --- a/drivers/mmc/host/uniphier-sd.c +++ b/drivers/mmc/host/uniphier-sd.c @@ -586,6 +586,7 @@ static int uniphier_sd_probe(struct platform_device *pdev) tmio_data = &priv->tmio_data; tmio_data->flags |= TMIO_MMC_32BIT_DATA_PORT; + tmio_data->flags |= TMIO_MMC_USE_BUSY_TIMEOUT; host = tmio_mmc_host_alloc(pdev, tmio_data); if (IS_ERR(host)) diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 8ba042430d8e..27264fe4b3b9 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -55,7 +55,12 @@ */ #define TMIO_MMC_HAS_IDLE_WAIT BIT(4) -/* BIT(5) is unused */ +/* + * Use the busy timeout feature. Probably all TMIO versions support it. 
Yet, + * we don't have documentation for old variants, so we enable only known good + * variants with this flag. Can be removed once all variants are known good. + */ +#define TMIO_MMC_USE_BUSY_TIMEOUT BIT(5) /* * Some controllers have CMD12 automatically -- cgit From 7bbb79ff5f7499e0c5d65987458410e8099207d8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 4 Dec 2020 12:43:47 +0100 Subject: driver core: auxiliary bus: move slab.h from include file No need to include slab.h in include/linux/auxiliary_bus.h, as it is not needed there. Move it to drivers/base/auxiliary.c instead. Cc: Dan Williams Cc: Dave Ertman Cc: Fred Oh Cc: Kiran Patil Cc: Leon Romanovsky Cc: Martin Habets Cc: Parav Pandit Cc: Pierre-Louis Bossart Cc: Ranjani Sridharan Cc: Shiraz Saleem Link: https://lore.kernel.org/r/X8og8xi3WkoYXet9@kroah.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/auxiliary.c | 1 + include/linux/auxiliary_bus.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index ef2af417438b..eca36d6284d0 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 282fbf7bf9af..3580743d0e8d 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -10,7 +10,6 @@ #include #include -#include struct auxiliary_device { struct device dev; -- cgit From 8142a46c50d2dd8160c42284e1044eed3bec0d18 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 4 Dec 2020 12:44:07 +0100 Subject: driver core: auxiliary bus: make remove function return void There's an effort to move the remove() callback in the driver core to not return an int, as nothing can be done if this function fails. To make that effort easier, make the aux bus remove function void to start with so that no users have to be changed sometime in the future. Cc: Dan Williams Cc: Dave Ertman Cc: Fred Oh Cc: Kiran Patil Cc: Leon Romanovsky Cc: Martin Habets Cc: Parav Pandit Cc: Pierre-Louis Bossart Cc: Ranjani Sridharan Cc: Shiraz Saleem Link: https://lore.kernel.org/r/X8ohB1ks1NK7kPop@kroah.com Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/auxiliary_bus.rst | 2 +- drivers/base/auxiliary.c | 5 ++--- include/linux/auxiliary_bus.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/auxiliary_bus.rst b/Documentation/driver-api/auxiliary_bus.rst index 5dd7804631ef..2312506b0674 100644 --- a/Documentation/driver-api/auxiliary_bus.rst +++ b/Documentation/driver-api/auxiliary_bus.rst @@ -150,7 +150,7 @@ and shutdown notifications using the standard conventions. 
struct auxiliary_driver { int (*probe)(struct auxiliary_device *, const struct auxiliary_device_id *id); - int (*remove)(struct auxiliary_device *); + void (*remove)(struct auxiliary_device *); void (*shutdown)(struct auxiliary_device *); int (*suspend)(struct auxiliary_device *, pm_message_t); int (*resume)(struct auxiliary_device *); diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index eca36d6284d0..c44e85802b43 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -82,13 +82,12 @@ static int auxiliary_bus_remove(struct device *dev) { struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); struct auxiliary_device *auxdev = to_auxiliary_dev(dev); - int ret = 0; if (auxdrv->remove) - ret = auxdrv->remove(auxdev); + auxdrv->remove(auxdev); dev_pm_domain_detach(dev, true); - return ret; + return 0; } static void auxiliary_bus_shutdown(struct device *dev) diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 3580743d0e8d..d67b17606210 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -19,7 +19,7 @@ struct auxiliary_device { struct auxiliary_driver { int (*probe)(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id); - int (*remove)(struct auxiliary_device *auxdev); + void (*remove)(struct auxiliary_device *auxdev); void (*shutdown)(struct auxiliary_device *auxdev); int (*suspend)(struct auxiliary_device *auxdev, pm_message_t state); int (*resume)(struct auxiliary_device *auxdev); -- cgit From 0d2bf11a6b3e275a526b8d42d8d4a3a6067cf953 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 4 Dec 2020 12:49:28 +0100 Subject: driver core: auxiliary bus: minor coding style tweaks For some reason, the original aux bus patch had some really long lines in a few places, probably due to it being a very long-lived patch in development by many different people. Fix that up so that the two files all have the same length lines and function formatting styles. Cc: Dan Williams Cc: Dave Ertman Cc: Fred Oh Cc: Kiran Patil Cc: Leon Romanovsky Cc: Martin Habets Cc: Parav Pandit Cc: Pierre-Louis Bossart Cc: Ranjani Sridharan Cc: Shiraz Saleem Link: https://lore.kernel.org/r/X8oiSFTpYHw1xE/o@kroah.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/auxiliary.c | 58 ++++++++++++++++++++++++------------------- include/linux/auxiliary_bus.h | 6 ++--- 2 files changed, 35 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index c44e85802b43..f303daadf843 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -50,8 +50,8 @@ static int auxiliary_uevent(struct device *dev, struct kobj_uevent_env *env) name = dev_name(dev); p = strrchr(name, '.'); - return add_uevent_var(env, "MODALIAS=%s%.*s", AUXILIARY_MODULE_PREFIX, (int)(p - name), - name); + return add_uevent_var(env, "MODALIAS=%s%.*s", AUXILIARY_MODULE_PREFIX, + (int)(p - name), name); } static const struct dev_pm_ops auxiliary_dev_pm_ops = { @@ -113,16 +113,18 @@ static struct bus_type auxiliary_bus_type = { * auxiliary_device_init - check auxiliary_device and initialize * @auxdev: auxiliary device struct * - * This is the first step in the two-step process to register an auxiliary_device. + * This is the first step in the two-step process to register an + * auxiliary_device. 
* - * When this function returns an error code, then the device_initialize will *not* have - * been performed, and the caller will be responsible to free any memory allocated for the - * auxiliary_device in the error path directly. + * When this function returns an error code, then the device_initialize will + * *not* have been performed, and the caller will be responsible to free any + * memory allocated for the auxiliary_device in the error path directly. * - * It returns 0 on success. On success, the device_initialize has been performed. After this - * point any error unwinding will need to include a call to auxiliary_device_uninit(). - * In this post-initialize error scenario, a call to the device's .release callback will be - * triggered, and all memory clean-up is expected to be handled there. + * It returns 0 on success. On success, the device_initialize has been + * performed. After this point any error unwinding will need to include a call + * to auxiliary_device_uninit(). In this post-initialize error scenario, a call + * to the device's .release callback will be triggered, and all memory clean-up + * is expected to be handled there. */ int auxiliary_device_init(struct auxiliary_device *auxdev) { @@ -149,16 +151,19 @@ EXPORT_SYMBOL_GPL(auxiliary_device_init); * @auxdev: auxiliary bus device to add to the bus * @modname: name of the parent device's driver module * - * This is the second step in the two-step process to register an auxiliary_device. + * This is the second step in the two-step process to register an + * auxiliary_device. * - * This function must be called after a successful call to auxiliary_device_init(), which - * will perform the device_initialize. This means that if this returns an error code, then a - * call to auxiliary_device_uninit() must be performed so that the .release callback will - * be triggered to free the memory associated with the auxiliary_device. + * This function must be called after a successful call to + * auxiliary_device_init(), which will perform the device_initialize. This + * means that if this returns an error code, then a call to + * auxiliary_device_uninit() must be performed so that the .release callback + * will be triggered to free the memory associated with the auxiliary_device. * - * The expectation is that users will call the "auxiliary_device_add" macro so that the caller's - * KBUILD_MODNAME is automatically inserted for the modname parameter. Only if a user requires - * a custom name would this version be called directly. + * The expectation is that users will call the "auxiliary_device_add" macro so + * that the caller's KBUILD_MODNAME is automatically inserted for the modname + * parameter. Only if a user requires a custom name would this version be + * called directly. */ int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname) { @@ -166,13 +171,13 @@ int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname) int ret; if (!modname) { - pr_err("auxiliary device modname is NULL\n"); + dev_err(dev, "auxiliary device modname is NULL\n"); return -EINVAL; } ret = dev_set_name(dev, "%s.%s.%d", modname, auxdev->name, auxdev->id); if (ret) { - pr_err("auxiliary device dev_set_name failed: %d\n", ret); + dev_err(dev, "auxiliary device dev_set_name failed: %d\n", ret); return ret; } @@ -197,9 +202,9 @@ EXPORT_SYMBOL_GPL(__auxiliary_device_add); * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. 
*/ -struct auxiliary_device * -auxiliary_find_device(struct device *start, const void *data, - int (*match)(struct device *dev, const void *data)) +struct auxiliary_device *auxiliary_find_device(struct device *start, + const void *data, + int (*match)(struct device *dev, const void *data)) { struct device *dev; @@ -217,14 +222,15 @@ EXPORT_SYMBOL_GPL(auxiliary_find_device); * @owner: owning module/driver * @modname: KBUILD_MODNAME for parent driver */ -int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, struct module *owner, - const char *modname) +int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, + struct module *owner, const char *modname) { if (WARN_ON(!auxdrv->probe) || WARN_ON(!auxdrv->id_table)) return -EINVAL; if (auxdrv->name) - auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s.%s", modname, auxdrv->name); + auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s.%s", modname, + auxdrv->name); else auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s", modname); if (!auxdrv->driver.name) diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index d67b17606210..fc51d45f106b 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -70,8 +70,8 @@ void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv); #define module_auxiliary_driver(__auxiliary_driver) \ module_driver(__auxiliary_driver, auxiliary_driver_register, auxiliary_driver_unregister) -struct auxiliary_device * -auxiliary_find_device(struct device *start, const void *data, - int (*match)(struct device *dev, const void *data)); +struct auxiliary_device *auxiliary_find_device(struct device *start, + const void *data, + int (*match)(struct device *dev, const void *data)); #endif /* _AUXILIARY_BUS_H_ */ -- cgit From 17a7612b99e66d2539341ab4f888f970c2c7f76d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Oct 2020 14:30:58 +0300 Subject: net/mlx5_core: Clean driver version and name Remove exposed driver version as it was done in other drivers, so module version will work correctly by displaying the kernel version for which it is compiled. And move mlx5_core module name to general include, so auxiliary drivers will be able to use it as a basis for a name in their device ID tables. 
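As a hedged sketch of that intended use (the ethernet suffix and the table name
below are illustrative assumptions; the auxiliary devices themselves are only
created by a later patch in this series), an auxiliary driver could build its
device ID table on top of MLX5_ADEV_NAME like so:

.. code-block:: c

    /* Hypothetical auxiliary driver ID table keyed off the mlx5_core module
     * name. MLX5_ADEV_NAME is the macro this patch adds to
     * include/linux/mlx5/driver.h; the ".eth" suffix is an assumption here.
     */
    static const struct auxiliary_device_id mlx5e_id_table[] = {
        { .name = MLX5_ADEV_NAME ".eth" },
        {},
    };
    MODULE_DEVICE_TABLE(auxiliary, mlx5e_id_table);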
Reviewed-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 4 +--- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 10 ++++++---- drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 3 --- include/linux/mlx5/driver.h | 2 ++ 7 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index a28f95df2901..1a351e2f6ace 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -52,7 +52,7 @@ mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, u32 running_fw, stored_fw; int err; - err = devlink_info_driver_name_put(req, DRIVER_NAME); + err = devlink_info_driver_name_put(req, KBUILD_MODNAME); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index d25a56ec6876..bcff18a87bcd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -40,9 +40,7 @@ void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, { struct mlx5_core_dev *mdev = priv->mdev; - strlcpy(drvinfo->driver, DRIVER_NAME, sizeof(drvinfo->driver)); - strlcpy(drvinfo->version, DRIVER_VERSION, - sizeof(drvinfo->version)); + strlcpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver)); snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d.%d.%04d (%.16s)", fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev), diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 67247c33b9fd..ef2f8889ba0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -64,7 +64,6 @@ static void mlx5e_rep_get_drvinfo(struct net_device *dev, strlcpy(drvinfo->driver, mlx5e_rep_driver_name, sizeof(drvinfo->driver)); - strlcpy(drvinfo->version, UTS_RELEASE, sizeof(drvinfo->version)); snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d.%d.%04d (%.16s)", fw_rev_maj(mdev), fw_rev_min(mdev), diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c index cac8f085b16d..97d96fc38a65 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -39,7 +39,7 @@ static void mlx5i_get_drvinfo(struct net_device *dev, struct mlx5e_priv *priv = mlx5i_epriv(dev); mlx5e_ethtool_get_drvinfo(priv, drvinfo); - strlcpy(drvinfo->driver, DRIVER_NAME "[ib_ipoib]", + strlcpy(drvinfo->driver, KBUILD_MODNAME "[ib_ipoib]", sizeof(drvinfo->driver)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e455a2f31f07..daef141931e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -77,7 +77,6 @@ MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRIVER_VERSION); unsigned int mlx5_core_debug_mask; module_param_named(debug_mask, 
mlx5_core_debug_mask, uint, 0644); @@ -228,7 +227,7 @@ static void mlx5_set_driver_version(struct mlx5_core_dev *dev) strncat(string, ",", remaining_size); remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); - strncat(string, DRIVER_NAME, remaining_size); + strncat(string, KBUILD_MODNAME, remaining_size); remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); strncat(string, ",", remaining_size); @@ -313,7 +312,7 @@ static int request_bar(struct pci_dev *pdev) return -ENODEV; } - err = pci_request_regions(pdev, DRIVER_NAME); + err = pci_request_regions(pdev, KBUILD_MODNAME); if (err) dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); @@ -1617,7 +1616,7 @@ void mlx5_recover_device(struct mlx5_core_dev *dev) } static struct pci_driver mlx5_core_driver = { - .name = DRIVER_NAME, + .name = KBUILD_MODNAME, .id_table = mlx5_core_pci_table, .probe = init_one, .remove = remove_one, @@ -1643,6 +1642,9 @@ static int __init init(void) { int err; + WARN_ONCE(strcmp(MLX5_ADEV_NAME, KBUILD_MODNAME), + "mlx5_core name not in sync with kernel module name"); + get_random_bytes(&sw_owner_id, sizeof(sw_owner_id)); mlx5_core_verify_params(); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 8cec85ab419d..b285f1515e4e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -42,9 +42,6 @@ #include #include -#define DRIVER_NAME "mlx5_core" -#define DRIVER_VERSION "5.0-0" - extern uint mlx5_core_debug_mask; #define mlx5_core_dbg(__dev, format, ...) \ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0f23e1ed5e71..5f04495c33d7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -56,6 +56,8 @@ #include #include +#define MLX5_ADEV_NAME "mlx5_core" + enum { MLX5_BOARD_ID_LEN = 64, }; -- cgit From 0aae392bea4da1a2a9f2e22683c0fa751dc07333 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 6 Oct 2020 12:34:25 +0300 Subject: vdpa/mlx5: Make hardware definitions visible to all mlx5 devices Move mlx5_vdpa IFC header file to the general include folder, so mlx5_core will be able to reuse it to check if VDPA is supported prior to creating an auxiliary device. As part of this move, update the header file name to mlx5 general naming scheme. Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky --- drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h | 168 --------------------------------- drivers/vdpa/mlx5/net/main.c | 2 +- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- include/linux/mlx5/mlx5_ifc_vdpa.h | 166 ++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+), 170 deletions(-) delete mode 100644 drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h create mode 100644 include/linux/mlx5/mlx5_ifc_vdpa.h (limited to 'include/linux') diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h b/drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h deleted file mode 100644 index f6f57a29b38e..000000000000 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h +++ /dev/null @@ -1,168 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ -/* Copyright (c) 2020 Mellanox Technologies Ltd. */ - -#ifndef __MLX5_VDPA_IFC_H_ -#define __MLX5_VDPA_IFC_H_ - -#include - -enum { - MLX5_VIRTIO_Q_EVENT_MODE_NO_MSIX_MODE = 0x0, - MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE = 0x1, - MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE = 0x2, -}; - -enum { - MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT = 0x1, // do I check this caps? 
- MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED = 0x2, -}; - -enum { - MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT = 0, - MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED = 1, -}; - -struct mlx5_ifc_virtio_q_bits { - u8 virtio_q_type[0x8]; - u8 reserved_at_8[0x5]; - u8 event_mode[0x3]; - u8 queue_index[0x10]; - - u8 full_emulation[0x1]; - u8 virtio_version_1_0[0x1]; - u8 reserved_at_22[0x2]; - u8 offload_type[0x4]; - u8 event_qpn_or_msix[0x18]; - - u8 doorbell_stride_index[0x10]; - u8 queue_size[0x10]; - - u8 device_emulation_id[0x20]; - - u8 desc_addr[0x40]; - - u8 used_addr[0x40]; - - u8 available_addr[0x40]; - - u8 virtio_q_mkey[0x20]; - - u8 max_tunnel_desc[0x10]; - u8 reserved_at_170[0x8]; - u8 error_type[0x8]; - - u8 umem_1_id[0x20]; - - u8 umem_1_size[0x20]; - - u8 umem_1_offset[0x40]; - - u8 umem_2_id[0x20]; - - u8 umem_2_size[0x20]; - - u8 umem_2_offset[0x40]; - - u8 umem_3_id[0x20]; - - u8 umem_3_size[0x20]; - - u8 umem_3_offset[0x40]; - - u8 counter_set_id[0x20]; - - u8 reserved_at_320[0x8]; - u8 pd[0x18]; - - u8 reserved_at_340[0xc0]; -}; - -struct mlx5_ifc_virtio_net_q_object_bits { - u8 modify_field_select[0x40]; - - u8 reserved_at_40[0x20]; - - u8 vhca_id[0x10]; - u8 reserved_at_70[0x10]; - - u8 queue_feature_bit_mask_12_3[0xa]; - u8 dirty_bitmap_dump_enable[0x1]; - u8 vhost_log_page[0x5]; - u8 reserved_at_90[0xc]; - u8 state[0x4]; - - u8 reserved_at_a0[0x5]; - u8 queue_feature_bit_mask_2_0[0x3]; - u8 tisn_or_qpn[0x18]; - - u8 dirty_bitmap_mkey[0x20]; - - u8 dirty_bitmap_size[0x20]; - - u8 dirty_bitmap_addr[0x40]; - - u8 hw_available_index[0x10]; - u8 hw_used_index[0x10]; - - u8 reserved_at_160[0xa0]; - - struct mlx5_ifc_virtio_q_bits virtio_q_context; -}; - -struct mlx5_ifc_create_virtio_net_q_in_bits { - struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; - - struct mlx5_ifc_virtio_net_q_object_bits obj_context; -}; - -struct mlx5_ifc_create_virtio_net_q_out_bits { - struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; -}; - -struct mlx5_ifc_destroy_virtio_net_q_in_bits { - struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_out_cmd_hdr; -}; - -struct mlx5_ifc_destroy_virtio_net_q_out_bits { - struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; -}; - -struct mlx5_ifc_query_virtio_net_q_in_bits { - struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; -}; - -struct mlx5_ifc_query_virtio_net_q_out_bits { - struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; - - struct mlx5_ifc_virtio_net_q_object_bits obj_context; -}; - -enum { - MLX5_VIRTQ_MODIFY_MASK_STATE = (u64)1 << 0, - MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS = (u64)1 << 3, - MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4, -}; - -enum { - MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT = 0x0, - MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY = 0x1, - MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND = 0x2, - MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR = 0x3, -}; - -enum { - MLX5_RQTC_LIST_Q_TYPE_RQ = 0x0, - MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q = 0x1, -}; - -struct mlx5_ifc_modify_virtio_net_q_in_bits { - struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; - - struct mlx5_ifc_virtio_net_q_object_bits obj_context; -}; - -struct mlx5_ifc_modify_virtio_net_q_out_bits { - struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; -}; - -#endif /* __MLX5_VDPA_IFC_H_ */ diff --git a/drivers/vdpa/mlx5/net/main.c b/drivers/vdpa/mlx5/net/main.c index 838cd98386ff..4dd3f00f2306 100644 --- a/drivers/vdpa/mlx5/net/main.c +++ 
b/drivers/vdpa/mlx5/net/main.c @@ -4,7 +4,7 @@ #include #include #include -#include "mlx5_vdpa_ifc.h" +#include #include "mlx5_vnet.h" MODULE_AUTHOR("Eli Cohen "); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 1fa6fcac8299..6c218b47b9f1 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -9,8 +9,8 @@ #include #include #include +#include #include "mlx5_vnet.h" -#include "mlx5_vdpa_ifc.h" #include "mlx5_vdpa.h" #define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev) diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h new file mode 100644 index 000000000000..98b56b75c625 --- /dev/null +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd. */ + +#ifndef __MLX5_IFC_VDPA_H_ +#define __MLX5_IFC_VDPA_H_ + +enum { + MLX5_VIRTIO_Q_EVENT_MODE_NO_MSIX_MODE = 0x0, + MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE = 0x1, + MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE = 0x2, +}; + +enum { + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT = 0x1, // do I check this caps? + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED = 0x2, +}; + +enum { + MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT = 0, + MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED = 1, +}; + +struct mlx5_ifc_virtio_q_bits { + u8 virtio_q_type[0x8]; + u8 reserved_at_8[0x5]; + u8 event_mode[0x3]; + u8 queue_index[0x10]; + + u8 full_emulation[0x1]; + u8 virtio_version_1_0[0x1]; + u8 reserved_at_22[0x2]; + u8 offload_type[0x4]; + u8 event_qpn_or_msix[0x18]; + + u8 doorbell_stride_index[0x10]; + u8 queue_size[0x10]; + + u8 device_emulation_id[0x20]; + + u8 desc_addr[0x40]; + + u8 used_addr[0x40]; + + u8 available_addr[0x40]; + + u8 virtio_q_mkey[0x20]; + + u8 max_tunnel_desc[0x10]; + u8 reserved_at_170[0x8]; + u8 error_type[0x8]; + + u8 umem_1_id[0x20]; + + u8 umem_1_size[0x20]; + + u8 umem_1_offset[0x40]; + + u8 umem_2_id[0x20]; + + u8 umem_2_size[0x20]; + + u8 umem_2_offset[0x40]; + + u8 umem_3_id[0x20]; + + u8 umem_3_size[0x20]; + + u8 umem_3_offset[0x40]; + + u8 counter_set_id[0x20]; + + u8 reserved_at_320[0x8]; + u8 pd[0x18]; + + u8 reserved_at_340[0xc0]; +}; + +struct mlx5_ifc_virtio_net_q_object_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x20]; + + u8 vhca_id[0x10]; + u8 reserved_at_70[0x10]; + + u8 queue_feature_bit_mask_12_3[0xa]; + u8 dirty_bitmap_dump_enable[0x1]; + u8 vhost_log_page[0x5]; + u8 reserved_at_90[0xc]; + u8 state[0x4]; + + u8 reserved_at_a0[0x5]; + u8 queue_feature_bit_mask_2_0[0x3]; + u8 tisn_or_qpn[0x18]; + + u8 dirty_bitmap_mkey[0x20]; + + u8 dirty_bitmap_size[0x20]; + + u8 dirty_bitmap_addr[0x40]; + + u8 hw_available_index[0x10]; + u8 hw_used_index[0x10]; + + u8 reserved_at_160[0xa0]; + + struct mlx5_ifc_virtio_q_bits virtio_q_context; +}; + +struct mlx5_ifc_create_virtio_net_q_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + + struct mlx5_ifc_virtio_net_q_object_bits obj_context; +}; + +struct mlx5_ifc_create_virtio_net_q_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; +}; + +struct mlx5_ifc_destroy_virtio_net_q_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_out_cmd_hdr; +}; + +struct mlx5_ifc_destroy_virtio_net_q_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; +}; + +struct mlx5_ifc_query_virtio_net_q_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits 
general_obj_in_cmd_hdr; +}; + +struct mlx5_ifc_query_virtio_net_q_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; + + struct mlx5_ifc_virtio_net_q_object_bits obj_context; +}; + +enum { + MLX5_VIRTQ_MODIFY_MASK_STATE = (u64)1 << 0, + MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS = (u64)1 << 3, + MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4, +}; + +enum { + MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT = 0x0, + MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY = 0x1, + MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND = 0x2, + MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR = 0x3, +}; + +enum { + MLX5_RQTC_LIST_Q_TYPE_RQ = 0x0, + MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q = 0x1, +}; + +struct mlx5_ifc_modify_virtio_net_q_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + + struct mlx5_ifc_virtio_net_q_object_bits obj_context; +}; + +struct mlx5_ifc_modify_virtio_net_q_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; +}; + +#endif /* __MLX5_IFC_VDPA_H_ */ -- cgit From a925b5e309c9b998658a6a94dbb53154ea901299 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 8 Oct 2020 16:06:37 +0300 Subject: net/mlx5: Register mlx5 devices to auxiliary virtual bus Create auxiliary devices under new virtual bus. This will replace the custom-made mlx5 ->add()/->remove() interfaces and next patches will fill the missing callback and remove the old interface logic. The attachment of auxiliary drivers to the devices is possible in 1-to-1 manner only and it requires us to create device for every protocol, so that device (module) will be able to connect to it. System with 2 IB and 1 RoCE cards: [leonro@vm ~]$ lspci |grep nox 00:09.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] 00:0a.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 00:0b.0 Ethernet controller: Mellanox Technologies MT2910 Family [ConnectX-7] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.eth.2 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.rdma.2 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.vdpa.2 [leonro@vm ~]$ rdma dev 0: ibp0s9: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: ibp0s10: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3456 sys_image_guid 5254:00c0:fe12:3456 2: rdmap0s11: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3457 sys_image_guid 5254:00c0:fe12:3457 System with RoCE SR-IOV card with 4 VFs: [leonro@vm ~]$ lspci |grep nox 01:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 01:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.2 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.3 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.4 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.eth.0 mlx5_core.eth.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.eth.1 mlx5_core.eth.2 -> 
../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.eth.2 mlx5_core.eth.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.eth.3 mlx5_core.eth.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.eth.4 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.rdma.2 mlx5_core.rdma.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.rdma.3 mlx5_core.rdma.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.rdma.4 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.vdpa.2 mlx5_core.vdpa.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.vdpa.3 mlx5_core.vdpa.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.vdpa.4 [leonro@vm ~]$ rdma dev 0: rocep1s0f0: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: rocep1s0f0v0: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3456 2: rocep1s0f0v1: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3457 3: rocep1s0f0v2: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3458 4: rocep1s0f0v3: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3459 Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 + drivers/net/ethernet/mellanox/mlx5/core/dev.c | 265 ++++++++++++++++++++- drivers/net/ethernet/mellanox/mlx5/core/main.c | 24 +- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 20 +- include/linux/mlx5/driver.h | 26 +- 5 files changed, 325 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 99f1ec3b2575..485478979b1a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -6,6 +6,7 @@ config MLX5_CORE tristate "Mellanox 5th generation network adapters (ConnectX series) core driver" depends on PCI + select AUXILIARY_BUS select NET_DEVLINK depends on VXLAN || !VXLAN depends on MLXFW || !MLXFW diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index 1972ddd12704..8ddf469b2d05 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -37,6 +37,7 @@ static LIST_HEAD(intf_list); static LIST_HEAD(mlx5_dev_list); /* intf dev list mutex */ static DEFINE_MUTEX(mlx5_intf_mutex); +static DEFINE_IDA(mlx5_adev_ida); struct mlx5_device_context { struct list_head list; @@ -50,6 +51,39 @@ enum { MLX5_INTERFACE_ATTACHED, }; +static const struct mlx5_adev_device { + const char *suffix; + bool (*is_supported)(struct mlx5_core_dev *dev); +} mlx5_adev_devices[1] = {}; + +int mlx5_adev_idx_alloc(void) +{ + return ida_alloc(&mlx5_adev_ida, GFP_KERNEL); +} + +void mlx5_adev_idx_free(int idx) +{ + ida_free(&mlx5_adev_ida, idx); +} + +int mlx5_adev_init(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + priv->adev = kcalloc(ARRAY_SIZE(mlx5_adev_devices), + sizeof(struct mlx5_adev *), GFP_KERNEL); + if (!priv->adev) + 
return -ENOMEM; + + return 0; +} + +void mlx5_adev_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + kfree(priv->adev); +} void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) { @@ -135,15 +169,99 @@ static void mlx5_attach_interface(struct mlx5_interface *intf, struct mlx5_priv } } -void mlx5_attach_device(struct mlx5_core_dev *dev) +static void adev_release(struct device *dev) +{ + struct mlx5_adev *mlx5_adev = + container_of(dev, struct mlx5_adev, adev.dev); + struct mlx5_priv *priv = &mlx5_adev->mdev->priv; + int idx = mlx5_adev->idx; + + kfree(mlx5_adev); + priv->adev[idx] = NULL; +} + +static struct mlx5_adev *add_adev(struct mlx5_core_dev *dev, int idx) +{ + const char *suffix = mlx5_adev_devices[idx].suffix; + struct auxiliary_device *adev; + struct mlx5_adev *madev; + int ret; + + madev = kzalloc(sizeof(*madev), GFP_KERNEL); + if (!madev) + return ERR_PTR(-ENOMEM); + + adev = &madev->adev; + adev->id = dev->priv.adev_idx; + adev->name = suffix; + adev->dev.parent = dev->device; + adev->dev.release = adev_release; + madev->mdev = dev; + madev->idx = idx; + + ret = auxiliary_device_init(adev); + if (ret) { + kfree(madev); + return ERR_PTR(ret); + } + + ret = auxiliary_device_add(adev); + if (ret) { + auxiliary_device_uninit(adev); + return ERR_PTR(ret); + } + return madev; +} + +static void del_adev(struct auxiliary_device *adev) +{ + auxiliary_device_delete(adev); + auxiliary_device_uninit(adev); +} + +int mlx5_attach_device(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; + struct auxiliary_device *adev; + struct auxiliary_driver *adrv; struct mlx5_interface *intf; + int ret = 0, i; mutex_lock(&mlx5_intf_mutex); + for (i = 0; i < ARRAY_SIZE(mlx5_adev_devices); i++) { + if (!priv->adev[i]) { + bool is_supported = false; + + if (mlx5_adev_devices[i].is_supported) + is_supported = mlx5_adev_devices[i].is_supported(dev); + + if (!is_supported) + continue; + + priv->adev[i] = add_adev(dev, i); + if (IS_ERR(priv->adev[i])) { + ret = PTR_ERR(priv->adev[i]); + priv->adev[i] = NULL; + } + } else { + adev = &priv->adev[i]->adev; + adrv = to_auxiliary_drv(adev->dev.driver); + + if (adrv->resume) + ret = adrv->resume(adev); + } + if (ret) { + mlx5_core_warn(dev, "Device[%d] (%s) failed to load\n", + i, mlx5_adev_devices[i].suffix); + + break; + } + } + list_for_each_entry(intf, &intf_list, list) mlx5_attach_interface(intf, priv); mutex_unlock(&mlx5_intf_mutex); + return ret; } static void mlx5_detach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv) @@ -171,9 +289,29 @@ static void mlx5_detach_interface(struct mlx5_interface *intf, struct mlx5_priv void mlx5_detach_device(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; + struct auxiliary_device *adev; + struct auxiliary_driver *adrv; struct mlx5_interface *intf; + pm_message_t pm = {}; + int i; mutex_lock(&mlx5_intf_mutex); + for (i = ARRAY_SIZE(mlx5_adev_devices) - 1; i >= 0; i--) { + if (!priv->adev[i]) + continue; + + adev = &priv->adev[i]->adev; + adrv = to_auxiliary_drv(adev->dev.driver); + + if (adrv->suspend) { + adrv->suspend(adev, pm); + continue; + } + + del_adev(&priv->adev[i]->adev); + priv->adev[i] = NULL; + } + list_for_each_entry(intf, &intf_list, list) mlx5_detach_interface(intf, priv); mutex_unlock(&mlx5_intf_mutex); @@ -193,16 +331,30 @@ bool mlx5_device_registered(struct mlx5_core_dev *dev) return found; } -void mlx5_register_device(struct mlx5_core_dev *dev) +int mlx5_register_device(struct mlx5_core_dev *dev) { struct 
mlx5_priv *priv = &dev->priv; struct mlx5_interface *intf; + int ret; + + mutex_lock(&mlx5_intf_mutex); + dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV; + ret = mlx5_rescan_drivers_locked(dev); + mutex_unlock(&mlx5_intf_mutex); + if (ret) + goto add_err; mutex_lock(&mlx5_intf_mutex); list_add_tail(&priv->dev_list, &mlx5_dev_list); list_for_each_entry(intf, &intf_list, list) mlx5_add_device(intf, priv); mutex_unlock(&mlx5_intf_mutex); + + return 0; + +add_err: + mlx5_unregister_device(dev); + return ret; } void mlx5_unregister_device(struct mlx5_core_dev *dev) @@ -214,6 +366,9 @@ void mlx5_unregister_device(struct mlx5_core_dev *dev) list_for_each_entry_reverse(intf, &intf_list, list) mlx5_remove_device(intf, priv); list_del(&priv->dev_list); + + dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV; + mlx5_rescan_drivers_locked(dev); mutex_unlock(&mlx5_intf_mutex); } @@ -246,6 +401,77 @@ void mlx5_unregister_interface(struct mlx5_interface *intf) } EXPORT_SYMBOL(mlx5_unregister_interface); +static int add_drivers(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + int i, ret = 0; + + for (i = 0; i < ARRAY_SIZE(mlx5_adev_devices); i++) { + bool is_supported = false; + + if (priv->adev[i]) + continue; + + if (mlx5_adev_devices[i].is_supported) + is_supported = mlx5_adev_devices[i].is_supported(dev); + + if (!is_supported) + continue; + + priv->adev[i] = add_adev(dev, i); + if (IS_ERR(priv->adev[i])) { + mlx5_core_warn(dev, "Device[%d] (%s) failed to load\n", + i, mlx5_adev_devices[i].suffix); + /* We continue to rescan drivers and leave to the caller + * to make decision if to release everything or continue. + */ + ret = PTR_ERR(priv->adev[i]); + priv->adev[i] = NULL; + } + } + return ret; +} + +static void delete_drivers(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + bool delete_all; + int i; + + delete_all = priv->flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV; + + for (i = ARRAY_SIZE(mlx5_adev_devices) - 1; i >= 0; i--) { + bool is_supported = false; + + if (!priv->adev[i]) + continue; + + if (mlx5_adev_devices[i].is_supported && !delete_all) + is_supported = mlx5_adev_devices[i].is_supported(dev); + + if (is_supported) + continue; + + del_adev(&priv->adev[i]->adev); + priv->adev[i] = NULL; + } +} + +/* This function is used after mlx5_core_dev is reconfigured. 
+ */ +int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + lockdep_assert_held(&mlx5_intf_mutex); + + delete_drivers(dev); + if (priv->flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV) + return 0; + + return add_drivers(dev); +} + /* Must be called with intf_mutex held */ static bool mlx5_has_added_dev_by_protocol(struct mlx5_core_dev *mdev, int protocol) { @@ -299,24 +525,55 @@ void mlx5_remove_dev_by_protocol(struct mlx5_core_dev *dev, int protocol) } } -static u32 mlx5_gen_pci_id(struct mlx5_core_dev *dev) +static u32 mlx5_gen_pci_id(const struct mlx5_core_dev *dev) { return (u32)((pci_domain_nr(dev->pdev->bus) << 16) | (dev->pdev->bus->number << 8) | PCI_SLOT(dev->pdev->devfn)); } -/* Must be called with intf_mutex held */ +static int next_phys_dev(struct device *dev, const void *data) +{ + struct mlx5_adev *madev = container_of(dev, struct mlx5_adev, adev.dev); + struct mlx5_core_dev *mdev = madev->mdev; + const struct mlx5_core_dev *curr = data; + + if (!mlx5_core_is_pf(mdev)) + return 0; + + if (mdev == curr) + return 0; + + if (mlx5_gen_pci_id(mdev) != mlx5_gen_pci_id(curr)) + return 0; + + return 1; +} + +/* This function is called with two flows: + * 1. During initialization of mlx5_core_dev and we don't need to lock it. + * 2. During LAG configure stage and caller holds &mlx5_intf_mutex. + */ struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev) { struct mlx5_core_dev *res = NULL; struct mlx5_core_dev *tmp_dev; + struct auxiliary_device *adev; + struct mlx5_adev *madev; struct mlx5_priv *priv; u32 pci_id; if (!mlx5_core_is_pf(dev)) return NULL; + adev = auxiliary_find_device(NULL, dev, &next_phys_dev); + if (adev) { + madev = container_of(adev, struct mlx5_adev, adev); + + put_device(&adev->dev); + return madev->mdev; + } + pci_id = mlx5_gen_pci_id(dev); list_for_each_entry(priv, &mlx5_dev_list, dev_list) { tmp_dev = container_of(priv, struct mlx5_core_dev, priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index daef141931e7..ff68e5b17471 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1222,14 +1222,21 @@ int mlx5_load_one(struct mlx5_core_dev *dev, bool boot) err = mlx5_devlink_register(priv_to_devlink(dev), dev->device); if (err) goto err_devlink_reg; - mlx5_register_device(dev); + + err = mlx5_register_device(dev); } else { - mlx5_attach_device(dev); + err = mlx5_attach_device(dev); } + if (err) + goto err_register; + mutex_unlock(&dev->intf_state_mutex); return 0; +err_register: + if (boot) + mlx5_devlink_unregister(priv_to_devlink(dev)); err_devlink_reg: clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); mlx5_unload(dev); @@ -1306,8 +1313,14 @@ static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) if (err) goto err_pagealloc_init; + err = mlx5_adev_init(dev); + if (err) + goto err_adev_init; + return 0; +err_adev_init: + mlx5_pagealloc_cleanup(dev); err_pagealloc_init: mlx5_health_cleanup(dev); err_health_init: @@ -1324,6 +1337,7 @@ static void mlx5_mdev_uninit(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; + mlx5_adev_cleanup(dev); mlx5_pagealloc_cleanup(dev); mlx5_health_cleanup(dev); debugfs_remove_recursive(dev->priv.dbg_root); @@ -1354,6 +1368,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) dev->coredev_type = id->driver_data & MLX5_PCI_DEV_IS_VF ? 
MLX5_COREDEV_VF : MLX5_COREDEV_PF; + dev->priv.adev_idx = mlx5_adev_idx_alloc(); + if (dev->priv.adev_idx < 0) + return dev->priv.adev_idx; + err = mlx5_mdev_init(dev, prof_sel); if (err) goto mdev_init_err; @@ -1387,6 +1405,7 @@ err_load_one: pci_init_err: mlx5_mdev_uninit(dev); mdev_init_err: + mlx5_adev_idx_free(dev->priv.adev_idx); mlx5_devlink_free(devlink); return err; @@ -1403,6 +1422,7 @@ static void remove_one(struct pci_dev *pdev) mlx5_unload_one(dev, true); mlx5_pci_close(dev); mlx5_mdev_uninit(dev); + mlx5_adev_idx_free(dev->priv.adev_idx); mlx5_devlink_free(devlink); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index b285f1515e4e..11195381a870 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -178,12 +178,17 @@ void mlx5_events_cleanup(struct mlx5_core_dev *dev); void mlx5_events_start(struct mlx5_core_dev *dev); void mlx5_events_stop(struct mlx5_core_dev *dev); +int mlx5_adev_idx_alloc(void); +void mlx5_adev_idx_free(int idx); +void mlx5_adev_cleanup(struct mlx5_core_dev *dev); +int mlx5_adev_init(struct mlx5_core_dev *dev); + void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv); void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv); -void mlx5_attach_device(struct mlx5_core_dev *dev); +int mlx5_attach_device(struct mlx5_core_dev *dev); void mlx5_detach_device(struct mlx5_core_dev *dev); bool mlx5_device_registered(struct mlx5_core_dev *dev); -void mlx5_register_device(struct mlx5_core_dev *dev); +int mlx5_register_device(struct mlx5_core_dev *dev); void mlx5_unregister_device(struct mlx5_core_dev *dev); void mlx5_add_dev_by_protocol(struct mlx5_core_dev *dev, int protocol); void mlx5_remove_dev_by_protocol(struct mlx5_core_dev *dev, int protocol); @@ -232,6 +237,17 @@ static inline int mlx5_lag_is_lacp_owner(struct mlx5_core_dev *dev) MLX5_CAP_GEN(dev, lag_master); } +int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev); +static inline int mlx5_rescan_drivers(struct mlx5_core_dev *dev) +{ + int ret; + + mlx5_dev_list_lock(); + ret = mlx5_rescan_drivers_locked(dev); + mlx5_dev_list_unlock(); + return ret; +} + void mlx5_reload_interface(struct mlx5_core_dev *mdev, int protocol); void mlx5_lag_update(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5f04495c33d7..e31c72693bcf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -536,6 +537,17 @@ struct mlx5_core_roce { struct mlx5_flow_handle *allow_rule; }; +enum { + MLX5_PRIV_FLAGS_DISABLE_IB_ADEV = 1 << 0, + MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV = 1 << 1, +}; + +struct mlx5_adev { + struct auxiliary_device adev; + struct mlx5_core_dev *mdev; + int idx; +}; + struct mlx5_priv { /* IRQ table valid only for real pci devices PF or VF */ struct mlx5_irq_table *irq_table; @@ -573,6 +585,8 @@ struct mlx5_priv { struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; + struct mlx5_adev **adev; + int adev_idx; struct mlx5_events *events; struct mlx5_flow_steering *steering; @@ -580,6 +594,7 @@ struct mlx5_priv { struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; struct mlx5_lag *lag; + u32 flags; struct mlx5_devcom *devcom; struct mlx5_fw_reset *fw_reset; struct mlx5_core_roce roce; @@ -1062,9 +1077,14 @@ enum { }; enum { - MLX5_INTERFACE_PROTOCOL_IB = 0, - 
MLX5_INTERFACE_PROTOCOL_ETH = 1, - MLX5_INTERFACE_PROTOCOL_VDPA = 2, + MLX5_INTERFACE_PROTOCOL_ETH_REP, + MLX5_INTERFACE_PROTOCOL_ETH, + + MLX5_INTERFACE_PROTOCOL_IB_REP, + MLX5_INTERFACE_PROTOCOL_MPIB, + MLX5_INTERFACE_PROTOCOL_IB, + + MLX5_INTERFACE_PROTOCOL_VDPA, }; struct mlx5_interface { -- cgit From 62dcd9c59f324e484c1d655884e0101a988f6671 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:13 +0100 Subject: earlycon: simplify earlycon-table implementation Instead of using the array-of-pointers trick to avoid having gcc mess up the earlycon array stride, specify type alignment when declaring entries to prevent gcc from increasing alignment. This is essentially an alternative (one-line) fix to the problem addressed by commit dd709e72cb93 ("earlycon: Use a pointer table to fix __earlycon_table stride"). gcc can increase the alignment of larger objects with static extent as an optimisation, but this can be suppressed by using the aligned attribute when declaring variables. Note that we have been relying on this behaviour for kernel parameters for 16 years and it indeed hasn't changed since the introduction of the aligned attribute in gcc-3.1. Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20201123102319.8090-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/of/fdt.c | 7 ++----- drivers/tty/serial/earlycon.c | 6 ++---- include/linux/serial_core.h | 20 +++++++------------- 3 files changed, 11 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 4602e467ca8b..feb0f2d67fc5 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -906,7 +906,7 @@ int __init early_init_dt_scan_chosen_stdout(void) int offset; const char *p, *q, *options = NULL; int l; - const struct earlycon_id **p_match; + const struct earlycon_id *match; const void *fdt = initial_boot_params; offset = fdt_path_offset(fdt, "/chosen"); @@ -933,10 +933,7 @@ int __init early_init_dt_scan_chosen_stdout(void) return 0; } - for (p_match = __earlycon_table; p_match < __earlycon_table_end; - p_match++) { - const struct earlycon_id *match = *p_match; - + for (match = __earlycon_table; match < __earlycon_table_end; match++) { if (!match->compatible[0]) continue; diff --git a/drivers/tty/serial/earlycon.c b/drivers/tty/serial/earlycon.c index b70877932d47..57c70851f22a 100644 --- a/drivers/tty/serial/earlycon.c +++ b/drivers/tty/serial/earlycon.c @@ -175,7 +175,7 @@ static int __init register_earlycon(char *buf, const struct earlycon_id *match) */ int __init setup_earlycon(char *buf) { - const struct earlycon_id **p_match; + const struct earlycon_id *match; bool empty_compatible = true; if (!buf || !buf[0]) @@ -185,9 +185,7 @@ int __init setup_earlycon(char *buf) return -EALREADY; again: - for (p_match = __earlycon_table; p_match < __earlycon_table_end; - p_match++) { - const struct earlycon_id *match = *p_match; + for (match = __earlycon_table; match < __earlycon_table_end; match++) { size_t len = strlen(match->name); if (strncmp(buf, match->name, len)) diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index ff63c2963359..3e32b788c28d 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -357,8 +357,8 @@ struct earlycon_id { int (*setup)(struct earlycon_device *, const char *options); }; -extern const struct earlycon_id *__earlycon_table[]; -extern const struct earlycon_id *__earlycon_table_end[]; +extern const struct earlycon_id __earlycon_table[]; +extern const struct earlycon_id 
__earlycon_table_end[]; #if defined(CONFIG_SERIAL_EARLYCON) && !defined(MODULE) #define EARLYCON_USED_OR_UNUSED __used @@ -366,19 +366,13 @@ extern const struct earlycon_id *__earlycon_table_end[]; #define EARLYCON_USED_OR_UNUSED __maybe_unused #endif -#define _OF_EARLYCON_DECLARE(_name, compat, fn, unique_id) \ - static const struct earlycon_id unique_id \ - EARLYCON_USED_OR_UNUSED __initconst \ +#define OF_EARLYCON_DECLARE(_name, compat, fn) \ + static const struct earlycon_id __UNIQUE_ID(__earlycon_##_name) \ + EARLYCON_USED_OR_UNUSED __section("__earlycon_table") \ + __aligned(__alignof__(struct earlycon_id)) \ = { .name = __stringify(_name), \ .compatible = compat, \ - .setup = fn }; \ - static const struct earlycon_id EARLYCON_USED_OR_UNUSED \ - __section("__earlycon_table") \ - * const __PASTE(__p, unique_id) = &unique_id - -#define OF_EARLYCON_DECLARE(_name, compat, fn) \ - _OF_EARLYCON_DECLARE(_name, compat, fn, \ - __UNIQUE_ID(__earlycon_##_name)) + .setup = fn }; #define EARLYCON_DECLARE(_name, fn) OF_EARLYCON_DECLARE(_name, "", fn) -- cgit From 5812b32e01c6d86ba7a84110702b46d8a8531fe9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:12 +0100 Subject: of: fix linker-section match-table corruption Specify type alignment when declaring linker-section match-table entries to prevent gcc from increasing alignment and corrupting the various tables with padding (e.g. timers, irqchips, clocks, reserved memory). This is specifically needed on x86 where gcc (typically) aligns larger objects like struct of_device_id with static extent on 32-byte boundaries which at best prevents matching on anything but the first entry. Specifying alignment when declaring variables suppresses this optimisation. Here's a 64-bit example where all entries are corrupt as 16 bytes of padding has been inserted before the first entry: ffffffff8266b4b0 D __clk_of_table ffffffff8266b4c0 d __of_table_fixed_factor_clk ffffffff8266b5a0 d __of_table_fixed_clk ffffffff8266b680 d __clk_of_table_sentinel And here's a 32-bit example where the 8-byte-aligned table happens to be placed on a 32-byte boundary so that all but the first entry are corrupt due to the 28 bytes of padding inserted between entries: 812b3ec0 D __irqchip_of_table 812b3ec0 d __of_table_irqchip1 812b3fa0 d __of_table_irqchip2 812b4080 d __of_table_irqchip3 812b4160 d irqchip_of_match_end Verified on x86 using gcc-9.3 and gcc-4.9 (which uses 64-byte alignment), and on arm using gcc-7.2. Note that there are no in-tree users of these tables on x86 currently (even if they are included in the image). Fixes: 54196ccbe0ba ("of: consolidate linker section OF match table declarations") Fixes: f6e916b82022 ("irqchip: add basic infrastructure") Cc: stable # 3.9 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20201123102319.8090-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/of.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 5d51891cbf1a..af655d264f10 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1300,6 +1300,7 @@ static inline int of_get_available_child_count(const struct device_node *np) #define _OF_DECLARE(table, name, compat, fn, fn_type) \ static const struct of_device_id __of_table_##name \ __used __section("__" #table "_of_table") \ + __aligned(__alignof__(struct of_device_id)) \ = { .compatible = compat, \ .data = (fn == (fn_type)NULL) ? 
fn : fn } #else -- cgit From e0efb3168d34dc8c8c72718672b8902e40efff8f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 3 Dec 2020 03:03:31 +0100 Subject: tty: Remove dead termiox code set_termiox() and the TCGETX handler bail out with -EINVAL immediately if ->termiox is NULL, but there are no code paths that can set ->termiox to a non-NULL pointer; and no such code paths seem to have existed since the termiox mechanism was introduced back in commit 1d65b4a088de ("tty: Add termiox") in v2.6.28. Similarly, no driver actually implements .set_termiox; and it looks like no driver ever has. Delete this dead code; but leave the definition of struct termiox in the UAPI headers intact. Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20201203020331.2394754-1-jannh@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_ioctl.c | 61 ++-------------------------------------------- include/linux/tty.h | 1 - include/linux/tty_driver.h | 9 ------- 3 files changed, 2 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index e18f318586ab..4de1c6ddb8ff 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -443,51 +443,6 @@ static int get_termio(struct tty_struct *tty, struct termio __user *termio) return 0; } - -#ifdef TCGETX - -/** - * set_termiox - set termiox fields if possible - * @tty: terminal - * @arg: termiox structure from user - * @opt: option flags for ioctl type - * - * Implement the device calling points for the SYS5 termiox ioctl - * interface in Linux - */ - -static int set_termiox(struct tty_struct *tty, void __user *arg, int opt) -{ - struct termiox tnew; - struct tty_ldisc *ld; - - if (tty->termiox == NULL) - return -EINVAL; - if (copy_from_user(&tnew, arg, sizeof(struct termiox))) - return -EFAULT; - - ld = tty_ldisc_ref(tty); - if (ld != NULL) { - if ((opt & TERMIOS_FLUSH) && ld->ops->flush_buffer) - ld->ops->flush_buffer(tty); - tty_ldisc_deref(ld); - } - if (opt & TERMIOS_WAIT) { - tty_wait_until_sent(tty, 0); - if (signal_pending(current)) - return -ERESTARTSYS; - } - - down_write(&tty->termios_rwsem); - if (tty->ops->set_termiox) - tty->ops->set_termiox(tty, &tnew); - up_write(&tty->termios_rwsem); - return 0; -} - -#endif - - #ifdef TIOCGETP /* * These are deprecated, but there is limited support.. 
@@ -815,23 +770,11 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file, return ret; #endif #ifdef TCGETX - case TCGETX: { - struct termiox ktermx; - if (real_tty->termiox == NULL) - return -EINVAL; - down_read(&real_tty->termios_rwsem); - memcpy(&ktermx, real_tty->termiox, sizeof(struct termiox)); - up_read(&real_tty->termios_rwsem); - if (copy_to_user(p, &ktermx, sizeof(struct termiox))) - ret = -EFAULT; - return ret; - } + case TCGETX: case TCSETX: - return set_termiox(real_tty, p, 0); case TCSETXW: - return set_termiox(real_tty, p, TERMIOS_WAIT); case TCSETXF: - return set_termiox(real_tty, p, TERMIOS_FLUSH); + return -EINVAL; #endif case TIOCGSOFTCAR: copy_termios(real_tty, &kterm); diff --git a/include/linux/tty.h b/include/linux/tty.h index 10212c6e4345..67c7a07c8083 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -303,7 +303,6 @@ struct tty_struct { spinlock_t flow_lock; /* Termios values are protected by the termios rwsem */ struct ktermios termios, termios_locked; - struct termiox *termiox; /* May be NULL for unsupported */ char name[64]; struct pid *pgrp; /* Protected by ctrl lock */ struct pid *session; diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 358446247ccd..61c3372d3f32 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -224,14 +224,6 @@ * line). See tty_do_resize() if you need to wrap the standard method * in your own logic - the usual case. * - * void (*set_termiox)(struct tty_struct *tty, struct termiox *new); - * - * Called when the device receives a termiox based ioctl. Passes down - * the requested data from user space. This method will not be invoked - * unless the tty also has a valid tty->termiox pointer. - * - * Optional: Called under the termios lock - * * int (*get_icount)(struct tty_struct *tty, struct serial_icounter *icount); * * Called when the device receives a TIOCGICOUNT ioctl. Passed a kernel @@ -285,7 +277,6 @@ struct tty_operations { int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear); int (*resize)(struct tty_struct *tty, struct winsize *ws); - int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew); int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount); int (*get_serial)(struct tty_struct *tty, struct serial_struct *p); -- cgit From c8bcd9c5be24fb9e6132e97da5a35e55a83e36b9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 3 Dec 2020 02:25:05 +0100 Subject: tty: Fix ->session locking Currently, locking of ->session is very inconsistent; most places protect it using the legacy tty mutex, but disassociate_ctty(), __do_SAK(), tiocspgrp() and tiocgsid() don't. Two of the writers hold the ctrl_lock (because they already need it for ->pgrp), but __proc_set_tty() doesn't do that yet. On a PREEMPT=y system, an unprivileged user can theoretically abuse this broken locking to read 4 bytes of freed memory via TIOCGSID if tiocgsid() is preempted long enough at the right point. (Other things might also go wrong, especially if root-only ioctls are involved; I'm not sure about that.) Change the locking on ->session such that: - tty_lock() is held by all writers: By making disassociate_ctty() hold it. This should be fine because the same lock can already be taken through the call to tty_vhangup_session(). 
The tricky part is that we need to shorten the area covered by siglock to be able to take tty_lock() without ugly retry logic; as far as I can tell, this should be fine, since nothing in the signal_struct is touched in the `if (tty)` branch. - ctrl_lock is held by all writers: By changing __proc_set_tty() to hold the lock a little longer. - All readers that aren't holding tty_lock() hold ctrl_lock: By adding locking to tiocgsid() and __do_SAK(), and expanding the area covered by ctrl_lock in tiocspgrp(). Cc: stable@kernel.org Signed-off-by: Jann Horn Reviewed-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 7 ++++++- drivers/tty/tty_jobctrl.c | 44 +++++++++++++++++++++++++++++++------------- include/linux/tty.h | 4 ++++ 3 files changed, 41 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 9f8b9a567b35..56ade99ef99f 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2897,10 +2897,14 @@ void __do_SAK(struct tty_struct *tty) struct task_struct *g, *p; struct pid *session; int i; + unsigned long flags; if (!tty) return; - session = tty->session; + + spin_lock_irqsave(&tty->ctrl_lock, flags); + session = get_pid(tty->session); + spin_unlock_irqrestore(&tty->ctrl_lock, flags); tty_ldisc_flush(tty); @@ -2932,6 +2936,7 @@ void __do_SAK(struct tty_struct *tty) task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); + put_pid(session); #endif } diff --git a/drivers/tty/tty_jobctrl.c b/drivers/tty/tty_jobctrl.c index baadeea4a289..aa6d0537b379 100644 --- a/drivers/tty/tty_jobctrl.c +++ b/drivers/tty/tty_jobctrl.c @@ -103,8 +103,8 @@ static void __proc_set_tty(struct tty_struct *tty) put_pid(tty->session); put_pid(tty->pgrp); tty->pgrp = get_pid(task_pgrp(current)); - spin_unlock_irqrestore(&tty->ctrl_lock, flags); tty->session = get_pid(task_session(current)); + spin_unlock_irqrestore(&tty->ctrl_lock, flags); if (current->signal->tty) { tty_debug(tty, "current tty %s not NULL!!\n", current->signal->tty->name); @@ -293,20 +293,23 @@ void disassociate_ctty(int on_exit) spin_lock_irq(¤t->sighand->siglock); put_pid(current->signal->tty_old_pgrp); current->signal->tty_old_pgrp = NULL; - tty = tty_kref_get(current->signal->tty); + spin_unlock_irq(¤t->sighand->siglock); + if (tty) { unsigned long flags; + + tty_lock(tty); spin_lock_irqsave(&tty->ctrl_lock, flags); put_pid(tty->session); put_pid(tty->pgrp); tty->session = NULL; tty->pgrp = NULL; spin_unlock_irqrestore(&tty->ctrl_lock, flags); + tty_unlock(tty); tty_kref_put(tty); } - spin_unlock_irq(¤t->sighand->siglock); /* Now clear signal->tty under the lock */ read_lock(&tasklist_lock); session_clear_tty(task_session(current)); @@ -477,14 +480,19 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t return -ENOTTY; if (retval) return retval; - if (!current->signal->tty || - (current->signal->tty != real_tty) || - (real_tty->session != task_session(current))) - return -ENOTTY; + if (get_user(pgrp_nr, p)) return -EFAULT; if (pgrp_nr < 0) return -EINVAL; + + spin_lock_irq(&real_tty->ctrl_lock); + if (!current->signal->tty || + (current->signal->tty != real_tty) || + (real_tty->session != task_session(current))) { + retval = -ENOTTY; + goto out_unlock_ctrl; + } rcu_read_lock(); pgrp = find_vpid(pgrp_nr); retval = -ESRCH; @@ -494,12 +502,12 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t if (session_of_pgrp(pgrp) != task_session(current)) goto out_unlock; retval = 0; - 
spin_lock_irq(&real_tty->ctrl_lock); put_pid(real_tty->pgrp); real_tty->pgrp = get_pid(pgrp); - spin_unlock_irq(&real_tty->ctrl_lock); out_unlock: rcu_read_unlock(); +out_unlock_ctrl: + spin_unlock_irq(&real_tty->ctrl_lock); return retval; } @@ -511,20 +519,30 @@ out_unlock: * * Obtain the session id of the tty. If there is no session * return an error. - * - * Locking: none. Reference to current->signal->tty is safe. */ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { + unsigned long flags; + pid_t sid; + /* * (tty == real_tty) is a cheap way of * testing if the tty is NOT a master pty. */ if (tty == real_tty && current->signal->tty != real_tty) return -ENOTTY; + + spin_lock_irqsave(&real_tty->ctrl_lock, flags); if (!real_tty->session) - return -ENOTTY; - return put_user(pid_vnr(real_tty->session), p); + goto err; + sid = pid_vnr(real_tty->session); + spin_unlock_irqrestore(&real_tty->ctrl_lock, flags); + + return put_user(sid, p); + +err: + spin_unlock_irqrestore(&real_tty->ctrl_lock, flags); + return -ENOTTY; } /* diff --git a/include/linux/tty.h b/include/linux/tty.h index a99e9b8e4e31..eb33d948788c 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -306,6 +306,10 @@ struct tty_struct { struct termiox *termiox; /* May be NULL for unsupported */ char name[64]; struct pid *pgrp; /* Protected by ctrl lock */ + /* + * Writes protected by both ctrl lock and legacy mutex, readers must use + * at least one of them. + */ struct pid *session; unsigned long flags; int count; -- cgit From a54895fa057c67700270777f7661d8d3c7fda88a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:39 +0100 Subject: block: remove the request_queue argument from request based tracepoints The request_queue can trivially be derived from the request. 
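As an illustrative sketch of the resulting calling convention (the probe below is hypothetical, not part of this patch), a request-based tracepoint consumer now receives only the request and derives the queue from it:

static void example_rq_probe(void *ignore, struct request *rq)
{
	/* The queue is always reachable through the request itself. */
	struct request_queue *q = rq->q;

	pr_debug("rq of %u bytes on queue %p\n", blk_rq_bytes(rq), q);
}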
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- block/blk-mq-sched.c | 2 +- block/blk-mq.c | 8 ++++---- drivers/md/dm-rq.c | 2 +- drivers/s390/scsi/zfcp_fsf.c | 3 +-- include/linux/blktrace_api.h | 5 ++--- include/trace/events/block.h | 30 ++++++++++++------------------ kernel/trace/blktrace.c | 44 +++++++++++++++++--------------------------- 8 files changed, 39 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index 4071daa88a5e..7497d86fff38 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -799,7 +799,7 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge_request(next); - trace_block_rq_merge(q, next); + trace_block_rq_merge(next); /* * ownership of bio passed from next to req, return 'next' for diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index d1eafe2c045c..deff4e826e23 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); void blk_mq_sched_request_inserted(struct request *rq) { - trace_block_rq_insert(rq->q, rq); + trace_block_rq_insert(rq); } EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); diff --git a/block/blk-mq.c b/block/blk-mq.c index 21e2b4b6b742..cf3916e2852f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -733,7 +733,7 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - trace_block_rq_issue(q, rq); + trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); @@ -760,7 +760,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); - trace_block_rq_requeue(q, rq); + trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { @@ -1821,7 +1821,7 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, lockdep_assert_held(&ctx->lock); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_lists[type]); @@ -1878,7 +1878,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); } spin_lock(&ctx->lock); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 729a72ec30cc..13b4385f4d5a 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -397,7 +397,7 @@ static int map_request(struct dm_rq_target_io *tio) } /* The target has remapped the I/O so dispatch it */ - trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), + trace_block_rq_remap(clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); ret = dm_dispatch_clone_request(clone, rq); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 6cb963a06777..37d450f46952 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -2359,8 +2359,7 @@ static void zfcp_fsf_req_trace(struct zfcp_fsf_req *req, struct scsi_cmnd *scsi) } } - blk_add_driver_data(scsi->request->q, scsi->request, &blktrc, - sizeof(blktrc)); + blk_add_driver_data(scsi->request, &blktrc, sizeof(blktrc)); } /** diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 3b6ff5902edc..05556573b896 100644 
--- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -75,8 +75,7 @@ static inline bool blk_trace_note_message_enabled(struct request_queue *q) return ret; } -extern void blk_add_driver_data(struct request_queue *q, struct request *rq, - void *data, size_t len); +extern void blk_add_driver_data(struct request *rq, void *data, size_t len); extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg); @@ -90,7 +89,7 @@ extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ # define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) # define blk_trace_shutdown(q) do { } while (0) -# define blk_add_driver_data(q, rq, data, len) do {} while (0) +# define blk_add_driver_data(rq, data, len) do {} while (0) # define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 8fb89574d867..0d782663a005 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -64,7 +64,6 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, /** * block_rq_requeue - place block IO request back on a queue - * @q: queue holding operation * @rq: block IO operation request * * The block operation request @rq is being placed back into queue @@ -73,9 +72,9 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, */ TRACE_EVENT(block_rq_requeue, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq), + TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) @@ -147,9 +146,9 @@ TRACE_EVENT(block_rq_complete, DECLARE_EVENT_CLASS(block_rq, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq), + TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) @@ -181,7 +180,6 @@ DECLARE_EVENT_CLASS(block_rq, /** * block_rq_insert - insert block operation request into queue - * @q: target queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted @@ -191,14 +189,13 @@ DECLARE_EVENT_CLASS(block_rq, */ DEFINE_EVENT(block_rq, block_rq_insert, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device driver - * @q: queue holding operation * @rq: block IO operation operation request * * Called when block operation request @rq from queue @q is sent to a @@ -206,14 +203,13 @@ DEFINE_EVENT(block_rq, block_rq_insert, */ DEFINE_EVENT(block_rq, block_rq_issue, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator - * @q: queue holding operation * @rq: block IO operation operation request * * Called when block operation request @rq from queue @q is merged to another @@ -221,9 +217,9 @@ DEFINE_EVENT(block_rq, block_rq_issue, */ DEFINE_EVENT(block_rq, block_rq_merge, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** @@ -491,7 +487,6 @@ TRACE_EVENT(block_bio_remap, /** * block_rq_remap - map request for a block operation request - * @q: queue holding the operation * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation @@ -502,10 +497,9 
@@ TRACE_EVENT(block_bio_remap, */ TRACE_EVENT(block_rq_remap, - TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev, - sector_t from), + TP_PROTO(struct request *rq, dev_t dev, sector_t from), - TP_ARGS(q, rq, dev, from), + TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 405637144a03..7839a78205c2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -795,12 +795,12 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) #endif static u64 -blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +blk_trace_request_get_cgid(struct request *rq) { if (!rq->bio) return 0; /* Use the first bio */ - return blk_trace_bio_get_cgid(q, rq->bio); + return blk_trace_bio_get_cgid(rq->q, rq->bio); } /* @@ -841,40 +841,35 @@ static void blk_add_trace_rq(struct request *rq, int error, rcu_read_unlock(); } -static void blk_add_trace_rq_insert(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_insert(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_issue(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_issue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_merge(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_merge(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_requeue(void *ignore, - struct request_queue *q, - struct request *rq) +static void blk_add_trace_rq_requeue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, - blk_trace_request_get_cgid(rq->q, rq)); + blk_trace_request_get_cgid(rq)); } /** @@ -1037,16 +1032,14 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, * Add a trace for that action. 
* **/ -static void blk_add_trace_rq_remap(void *ignore, - struct request_queue *q, - struct request *rq, dev_t dev, +static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { struct blk_trace *bt; struct blk_io_trace_remap r; rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); + bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; @@ -1058,13 +1051,12 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } /** * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for * @rq: io request * @data: driver-specific data * @len: length of driver-specific data @@ -1073,14 +1065,12 @@ static void blk_add_trace_rq_remap(void *ignore, * Some drivers might want to write driver-specific data per request. * **/ -void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) +void blk_add_driver_data(struct request *rq, void *data, size_t len) { struct blk_trace *bt; rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); + bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; @@ -1088,7 +1078,7 @@ void blk_add_driver_data(struct request_queue *q, __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); -- cgit From c9d659b60770db94b898f94947192a94bbf95c5c Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 20 Nov 2020 16:10:23 -0800 Subject: PCI/ERR: Bind RCEC devices to the Root Port driver If a Root Complex Integrated Endpoint (RCiEP) is implemented, it may signal errors through a Root Complex Event Collector (RCEC). Each RCiEP must be associated with no more than one RCEC. For an RCEC (which is technically not a Bridge), error messages "received" from associated RCiEPs must be enabled for "transmission" in order to cause a System Error via the Root Control register or (when the Advanced Error Reporting Capability is present) reporting via the Root Error Command register and logging in the Root Error Status register and Error Source Identification register. Given the commonality with Root Ports and the need to also support AER and PME services for RCECs, extend the Root Port driver to support RCEC devices by adding the RCEC Class ID to the driver structure. 
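As a minimal sketch of the device-type test this relies on (the helper name is hypothetical, not from this patch), an RCEC announces itself through the device/port type field of the PCI Express Capability, so it can be recognized with the existing accessors:

static bool dev_is_rcec(struct pci_dev *dev)
{
	/* Root Complex Event Collectors report PCI_EXP_TYPE_RC_EC. */
	return pci_is_pcie(dev) &&
	       pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC;
}

For the class-based match in port_pci_ids[], PCI_CLASS_SYSTEM_RCEC (0x0807) shifted left by 8 and OR'd with the 0x00 programming interface gives the 24-bit class value 0x080700, which the ~0 mask requires to match exactly.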
Co-developed-by: Sean V Kelley Link: https://lore.kernel.org/r/20201121001036.8560-3-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Sean V Kelley Signed-off-by: Qiuxu Zhuo Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/pcie/portdrv_pci.c | 5 ++++- include/linux/pci_ids.h | 1 + include/uapi/linux/pci_regs.h | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 3a3ce40ae1ab..4d880679b9b1 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -106,7 +106,8 @@ static int pcie_portdrv_probe(struct pci_dev *dev, if (!pci_is_pcie(dev) || ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) && (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM) && - (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))) + (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM) && + (pci_pcie_type(dev) != PCI_EXP_TYPE_RC_EC))) return -ENODEV; status = pcie_port_device_register(dev); @@ -195,6 +196,8 @@ static const struct pci_device_id port_pci_ids[] = { { PCI_DEVICE_CLASS(((PCI_CLASS_BRIDGE_PCI << 8) | 0x00), ~0) }, /* subtractive decode PCI-to-PCI bridge, class type is 060401h */ { PCI_DEVICE_CLASS(((PCI_CLASS_BRIDGE_PCI << 8) | 0x01), ~0) }, + /* handle any Root Complex Event Collector */ + { PCI_DEVICE_CLASS(((PCI_CLASS_SYSTEM_RCEC << 8) | 0x00), ~0) }, { }, }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 1ab1e24bcbce..d8156a5dbee8 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -81,6 +81,7 @@ #define PCI_CLASS_SYSTEM_RTC 0x0803 #define PCI_CLASS_SYSTEM_PCI_HOTPLUG 0x0804 #define PCI_CLASS_SYSTEM_SDHCI 0x0805 +#define PCI_CLASS_SYSTEM_RCEC 0x0807 #define PCI_CLASS_SYSTEM_OTHER 0x0880 #define PCI_BASE_CLASS_INPUT 0x09 diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index a95d55f9f257..bccd3e35cb65 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -831,6 +831,13 @@ #define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */ #define PCI_EXT_CAP_PWR_SIZEOF 16 +/* Root Complex Event Collector Endpoint Association */ +#define PCI_RCEC_RCIEP_BITMAP 4 /* Associated Bitmap for RCiEPs */ +#define PCI_RCEC_BUSN 8 /* RCEC Associated Bus Numbers */ +#define PCI_RCEC_BUSN_REG_VER 0x02 /* Least version with BUSN present */ +#define PCI_RCEC_BUSN_NEXT(x) (((x) >> 8) & 0xff) +#define PCI_RCEC_BUSN_LAST(x) (((x) >> 16) & 0xff) + /* Vendor-Specific (VSEC, PCI_EXT_CAP_ID_VNDR) */ #define PCI_VNDR_HEADER 4 /* Vendor-Specific Header */ #define PCI_VNDR_HEADER_ID(x) ((x) & 0xffff) -- cgit From 90655631988f8f501529e6de5f13614389717ead Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:24 -0800 Subject: PCI/ERR: Cache RCEC EA Capability offset in pci_init_capabilities() Extend support for Root Complex Event Collectors by decoding and caching the RCEC Endpoint Association Extended Capabilities when enumerating. Use that cached information for later error source reporting. See PCIe r5.0, sec 7.9.10. 
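A brief sketch of how the cached association might later be consulted (hypothetical helper, covering only the same-bus case, where bit n of the Association Bitmap corresponds to device number n on the RCEC's own bus):

static bool rciep_assoc_sketch(struct pci_dev *rcec, struct pci_dev *rciep)
{
	/* Assumes @rciep sits on the same bus as @rcec. */
	if (!rcec->rcec_ea)
		return false;

	return rcec->rcec_ea->bitmap & (1 << PCI_SLOT(rciep->devfn));
}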
Co-developed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20201121001036.8560-4-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Qiuxu Zhuo Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron --- drivers/pci/pci.h | 17 ++++++++++++++ drivers/pci/pcie/Makefile | 2 +- drivers/pci/pcie/rcec.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/probe.c | 2 ++ include/linux/pci.h | 4 ++++ 5 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 drivers/pci/pcie/rcec.c (limited to 'include/linux') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index f86cae9aa1f4..12dcad8f3635 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -448,6 +448,15 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); #endif /* CONFIG_PCIEAER */ +#ifdef CONFIG_PCIEPORTBUS +/* Cached RCEC Endpoint Association */ +struct rcec_ea { + u8 nextbusn; + u8 lastbusn; + u32 bitmap; +}; +#endif + #ifdef CONFIG_PCIE_DPC void pci_save_dpc_state(struct pci_dev *dev); void pci_restore_dpc_state(struct pci_dev *dev); @@ -460,6 +469,14 @@ static inline void pci_restore_dpc_state(struct pci_dev *dev) {} static inline void pci_dpc_init(struct pci_dev *pdev) {} #endif +#ifdef CONFIG_PCIEPORTBUS +void pci_rcec_init(struct pci_dev *dev); +void pci_rcec_exit(struct pci_dev *dev); +#else +static inline void pci_rcec_init(struct pci_dev *dev) {} +static inline void pci_rcec_exit(struct pci_dev *dev) {} +#endif + #ifdef CONFIG_PCI_ATS /* Address Translation Service */ void pci_ats_init(struct pci_dev *dev); diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index 68da9280ff11..d9697892fa3e 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -2,7 +2,7 @@ # # Makefile for PCI Express features and port driver -pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o +pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o rcec.o obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c new file mode 100644 index 000000000000..038e9d706d5f --- /dev/null +++ b/drivers/pci/pcie/rcec.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Root Complex Event Collector Support + * + * Authors: + * Sean V Kelley + * Qiuxu Zhuo + * + * Copyright (C) 2020 Intel Corp. 
+ */ + +#include +#include +#include + +#include "../pci.h" + +void pci_rcec_init(struct pci_dev *dev) +{ + struct rcec_ea *rcec_ea; + u32 rcec, hdr, busn; + u8 ver; + + /* Only for Root Complex Event Collectors */ + if (pci_pcie_type(dev) != PCI_EXP_TYPE_RC_EC) + return; + + rcec = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_RCEC); + if (!rcec) + return; + + rcec_ea = kzalloc(sizeof(*rcec_ea), GFP_KERNEL); + if (!rcec_ea) + return; + + pci_read_config_dword(dev, rcec + PCI_RCEC_RCIEP_BITMAP, + &rcec_ea->bitmap); + + /* Check whether RCEC BUSN register is present */ + pci_read_config_dword(dev, rcec, &hdr); + ver = PCI_EXT_CAP_VER(hdr); + if (ver >= PCI_RCEC_BUSN_REG_VER) { + pci_read_config_dword(dev, rcec + PCI_RCEC_BUSN, &busn); + rcec_ea->nextbusn = PCI_RCEC_BUSN_NEXT(busn); + rcec_ea->lastbusn = PCI_RCEC_BUSN_LAST(busn); + } else { + /* Avoid later ver check by setting nextbusn */ + rcec_ea->nextbusn = 0xff; + rcec_ea->lastbusn = 0x00; + } + + dev->rcec_ea = rcec_ea; +} + +void pci_rcec_exit(struct pci_dev *dev) +{ + kfree(dev->rcec_ea); + dev->rcec_ea = NULL; +} diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 4289030b0fff..f22e1451d65d 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2216,6 +2216,7 @@ static void pci_configure_device(struct pci_dev *dev) static void pci_release_capabilities(struct pci_dev *dev) { pci_aer_exit(dev); + pci_rcec_exit(dev); pci_vpd_release(dev); pci_iov_release(dev); pci_free_cap_save_buffers(dev); @@ -2415,6 +2416,7 @@ static void pci_init_capabilities(struct pci_dev *dev) pci_ptm_init(dev); /* Precision Time Measurement */ pci_aer_init(dev); /* Advanced Error Reporting */ pci_dpc_init(dev); /* Downstream Port Containment */ + pci_rcec_init(dev); /* Root Complex Event Collector */ pcie_report_downtraining(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 22207a79762c..f8c927fd0602 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -304,6 +304,7 @@ struct pcie_link_state; struct pci_vpd; struct pci_sriov; struct pci_p2pdma; +struct rcec_ea; /* The pci_dev structure describes PCI devices */ struct pci_dev { @@ -326,6 +327,9 @@ struct pci_dev { #ifdef CONFIG_PCIEAER u16 aer_cap; /* AER capability offset */ struct aer_stats *aer_stats; /* AER stats for this device */ +#endif +#ifdef CONFIG_PCIEPORTBUS + struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */ #endif u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ -- cgit From 3ee16db390b42b8a21f2ad2ea2518f3469c6e532 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 30 Nov 2020 10:57:43 -0500 Subject: dm: fix IO splitting Commit 882ec4e609c1 ("dm table: stack 'chunk_sectors' limit to account for target-specific splitting") caused a couple of regressions: 1) Using lcm_not_zero() when stacking chunk_sectors was a bug because chunk_sectors must reflect the most limited of all devices in the IO stack. 2) DM targets that set max_io_len but that do _not_ provide an .iterate_devices method no longer had their IO split properly. And commit 5091cdec56fa ("dm: change max_io_len() to use blk_max_size_offset()") also caused a regression where DM no longer supported varied (per target) IO splitting. The implication is the potential for severely reduced performance for IO stacks that use a DM target like dm-cache to hide performance limitations of a slower device (e.g. one that requires 4K IO splitting). 
Coming full circle: Fix all these issues by no longer stacking chunk_sectors via ti->max_io_len in dm_calculate_queue_limits(), adding an optional chunk_sectors override argument to blk_max_size_offset(), and updating DM's max_io_len() to pass ti->max_io_len to its blk_max_size_offset() call. Passing in an optional chunk_sectors override to blk_max_size_offset() allows for code reuse of block's centralized calculation of the max IO size based on the provided offset and split boundary. Fixes: 882ec4e609c1 ("dm table: stack 'chunk_sectors' limit to account for target-specific splitting") Fixes: 5091cdec56fa ("dm: change max_io_len() to use blk_max_size_offset()") Cc: stable@vger.kernel.org Reported-by: John Dorminy Reported-by: Bruce Johnston Reported-by: Kirill Tkhai Reviewed-by: John Dorminy Signed-off-by: Mike Snitzer Reviewed-by: Jens Axboe --- block/blk-merge.c | 2 +- drivers/md/dm-table.c | 5 ----- drivers/md/dm.c | 19 +++++++++++-------- include/linux/blkdev.h | 11 ++++++----- 4 files changed, 18 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index bcf5e4580603..97b7c2821565 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -144,7 +144,7 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q, static inline unsigned get_max_io_size(struct request_queue *q, struct bio *bio) { - unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); + unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); unsigned max_sectors = sectors; unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2073ee8d18f4..7eeb7c4169c9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -1449,10 +1448,6 @@ int dm_calculate_queue_limits(struct dm_table *table, zone_sectors = ti_limits.chunk_sectors; } - /* Stack chunk_sectors if target-specific splitting is required */ - if (ti->max_io_len) - ti_limits.chunk_sectors = lcm_not_zero(ti->max_io_len, - ti_limits.chunk_sectors); /* Set I/O hints portion of queue limits */ if (ti->type->io_hints) ti->type->io_hints(ti, &ti_limits); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 98866e725f25..f7eb3d2964f3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1039,15 +1039,18 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector) sector_t max_len; /* - * Does the target need to split even further? - * - q->limits.chunk_sectors reflects ti->max_io_len so - * blk_max_size_offset() provides required splitting. - * - blk_max_size_offset() also respects q->limits.max_sectors + * Does the target need to split IO even further? + * - varied (per target) IO splitting is a tenet of DM; this + * explains why stacked chunk_sectors based splitting via + * blk_max_size_offset() isn't possible here. So pass in + * ti->max_io_len to override stacked chunk_sectors. 
*/ - max_len = blk_max_size_offset(ti->table->md->queue, - target_offset); - if (len > max_len) - len = max_len; + if (ti->max_io_len) { + max_len = blk_max_size_offset(ti->table->md->queue, + target_offset, ti->max_io_len); + if (len > max_len) + len = max_len; + } return len; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 639cae2c158b..24ae504cf77d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1073,11 +1073,14 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, * file system requests. */ static inline unsigned int blk_max_size_offset(struct request_queue *q, - sector_t offset) + sector_t offset, + unsigned int chunk_sectors) { - unsigned int chunk_sectors = q->limits.chunk_sectors; - - if (!chunk_sectors) - return q->limits.max_sectors; + if (!chunk_sectors) { + if (q->limits.chunk_sectors) + chunk_sectors = q->limits.chunk_sectors; + else + return q->limits.max_sectors; + } if (likely(is_power_of_2(chunk_sectors))) @@ -1101,7 +1102,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, req_op(rq) == REQ_OP_SECURE_ERASE) return blk_queue_get_max_sectors(q, req_op(rq)); - return min(blk_max_size_offset(q, offset), + return min(blk_max_size_offset(q, offset, 0), blk_queue_get_max_sectors(q, req_op(rq))); } -- cgit From f646c2a0a6685a8a30a150509b112c911b8feff3 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Sun, 29 Nov 2020 22:16:26 +0530 Subject: PCI: Return u8 from pci_find_capability() and similar PCI Capabilities are linked in a list that must appear in the first 256 bytes of config space. Each capabilities list pointer is 8 bits. Change the return type of pci_find_capability() and supporting functions from int to u8 to match the specification. [bhelgaas: change other related interfaces, fix HyperTransport typos] Link: https://lore.kernel.org/r/20201129164626.12887-1-puranjay12@gmail.com Signed-off-by: Puranjay Mohan Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 42 +++++++++++++++++++++--------------------- include/linux/pci.h | 12 ++++++------ 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e578d34095e9..b3761e98377b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -399,8 +399,8 @@ found: return 1; } -static int __pci_find_next_cap_ttl(struct pci_bus *bus, unsigned int devfn, - u8 pos, int cap, int *ttl) +static u8 __pci_find_next_cap_ttl(struct pci_bus *bus, unsigned int devfn, + u8 pos, int cap, int *ttl) { u8 id; u16 ent; @@ -423,22 +423,22 @@ static int __pci_find_next_cap_ttl(struct pci_bus *bus, unsigned int devfn, return 0; } -static int __pci_find_next_cap(struct pci_bus *bus, unsigned int devfn, - u8 pos, int cap) +static u8 __pci_find_next_cap(struct pci_bus *bus, unsigned int devfn, + u8 pos, int cap) { int ttl = PCI_FIND_CAP_TTL; return __pci_find_next_cap_ttl(bus, devfn, pos, cap, &ttl); } -int pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap) +u8 pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap) { return __pci_find_next_cap(dev->bus, dev->devfn, pos + PCI_CAP_LIST_NEXT, cap); } EXPORT_SYMBOL_GPL(pci_find_next_capability); -static int __pci_bus_find_cap_start(struct pci_bus *bus, +static u8 __pci_bus_find_cap_start(struct pci_bus *bus, unsigned int devfn, u8 hdr_type) { u16 status; @@ -477,9 +477,9 @@ static int __pci_bus_find_cap_start(struct pci_bus *bus, * %PCI_CAP_ID_PCIX PCI-X * %PCI_CAP_ID_EXP PCI Express */ -int pci_find_capability(struct pci_dev *dev, int cap) +u8 
pci_find_capability(struct pci_dev *dev, int cap) { - int pos; + u8 pos; pos = __pci_bus_find_cap_start(dev->bus, dev->devfn, dev->hdr_type); if (pos) @@ -502,10 +502,9 @@ EXPORT_SYMBOL(pci_find_capability); * device's PCI configuration space or 0 in case the device does not * support it. */ -int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap) +u8 pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap) { - int pos; - u8 hdr_type; + u8 hdr_type, pos; pci_bus_read_config_byte(bus, devfn, PCI_HEADER_TYPE, &hdr_type); @@ -623,7 +622,7 @@ u64 pci_get_dsn(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_get_dsn); -static int __pci_find_next_ht_cap(struct pci_dev *dev, int pos, int ht_cap) +static u8 __pci_find_next_ht_cap(struct pci_dev *dev, u8 pos, int ht_cap) { int rc, ttl = PCI_FIND_CAP_TTL; u8 cap, mask; @@ -650,11 +649,12 @@ static int __pci_find_next_ht_cap(struct pci_dev *dev, int pos, int ht_cap) return 0; } + /** - * pci_find_next_ht_capability - query a device's Hypertransport capabilities + * pci_find_next_ht_capability - query a device's HyperTransport capabilities * @dev: PCI device to query * @pos: Position from which to continue searching - * @ht_cap: Hypertransport capability code + * @ht_cap: HyperTransport capability code * * To be used in conjunction with pci_find_ht_capability() to search for * all capabilities matching @ht_cap. @pos should always be a value returned @@ -663,26 +663,26 @@ static int __pci_find_next_ht_cap(struct pci_dev *dev, int pos, int ht_cap) * NB. To be 100% safe against broken PCI devices, the caller should take * steps to avoid an infinite loop. */ -int pci_find_next_ht_capability(struct pci_dev *dev, int pos, int ht_cap) +u8 pci_find_next_ht_capability(struct pci_dev *dev, u8 pos, int ht_cap) { return __pci_find_next_ht_cap(dev, pos + PCI_CAP_LIST_NEXT, ht_cap); } EXPORT_SYMBOL_GPL(pci_find_next_ht_capability); /** - * pci_find_ht_capability - query a device's Hypertransport capabilities + * pci_find_ht_capability - query a device's HyperTransport capabilities * @dev: PCI device to query - * @ht_cap: Hypertransport capability code + * @ht_cap: HyperTransport capability code * - * Tell if a device supports a given Hypertransport capability. + * Tell if a device supports a given HyperTransport capability. * Returns an address within the device's PCI configuration space * or 0 in case the device does not support the request capability. * The address points to the PCI capability, of type PCI_CAP_ID_HT, - * which has a Hypertransport capability matching @ht_cap. + * which has a HyperTransport capability matching @ht_cap. 
*/ -int pci_find_ht_capability(struct pci_dev *dev, int ht_cap) +u8 pci_find_ht_capability(struct pci_dev *dev, int ht_cap) { - int pos; + u8 pos; pos = __pci_bus_find_cap_start(dev->bus, dev->devfn, dev->hdr_type); if (pos) diff --git a/include/linux/pci.h b/include/linux/pci.h index e007bc3e8b6e..e615f8abdd79 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1064,12 +1064,13 @@ void pci_sort_breadthfirst(void); /* Generic PCI functions exported to card drivers */ -int pci_find_capability(struct pci_dev *dev, int cap); -int pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap); +u8 pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap); +u8 pci_find_capability(struct pci_dev *dev, int cap); +u8 pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap); +u8 pci_find_ht_capability(struct pci_dev *dev, int ht_cap); +u8 pci_find_next_ht_capability(struct pci_dev *dev, u8 pos, int ht_cap); int pci_find_ext_capability(struct pci_dev *dev, int cap); int pci_find_next_ext_capability(struct pci_dev *dev, int pos, int cap); -int pci_find_ht_capability(struct pci_dev *dev, int ht_cap); -int pci_find_next_ht_capability(struct pci_dev *dev, int pos, int ht_cap); struct pci_bus *pci_find_next_bus(const struct pci_bus *from); u64 pci_get_dsn(struct pci_dev *dev); @@ -1280,7 +1281,6 @@ void set_pcie_port_type(struct pci_dev *pdev); void set_pcie_hotplug_bridge(struct pci_dev *pdev); /* Functions for PCI Hotplug drivers to use */ -int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap); unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge); unsigned int pci_rescan_bus(struct pci_bus *bus); void pci_lock_rescan_remove(void); @@ -1720,7 +1720,7 @@ static inline int __pci_register_driver(struct pci_driver *drv, static inline int pci_register_driver(struct pci_driver *drv) { return 0; } static inline void pci_unregister_driver(struct pci_driver *drv) { } -static inline int pci_find_capability(struct pci_dev *dev, int cap) +static inline u8 pci_find_capability(struct pci_dev *dev, int cap) { return 0; } static inline int pci_find_next_capability(struct pci_dev *dev, u8 post, int cap) -- cgit From ee8b1c478a9fbce9c64151ee561c124c4dcd66be Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 4 Dec 2020 15:14:07 -0600 Subject: PCI: Return u16 from pci_find_ext_capability() and similar PCI Express Extended Capabilities are in config space between offsets 256 and 4K. These offsets all fit in 16 bits. Change the return type of pci_find_ext_capability() and supporting functions from int to u16 to match the specification. Many callers use "int", which is fine, but there's no need to store more than a u16. Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 6 +++--- include/linux/pci.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b3761e98377b..85cb873266d3 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -527,11 +527,11 @@ EXPORT_SYMBOL(pci_bus_find_capability); * not support it. Some capabilities can occur several times, e.g., the * vendor-specific capability, and this provides a way to find them all. 
*/ -int pci_find_next_ext_capability(struct pci_dev *dev, int start, int cap) +u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 start, int cap) { u32 header; int ttl; - int pos = PCI_CFG_SPACE_SIZE; + u16 pos = PCI_CFG_SPACE_SIZE; /* minimum 8 bytes per capability */ ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8; @@ -582,7 +582,7 @@ EXPORT_SYMBOL_GPL(pci_find_next_ext_capability); * %PCI_EXT_CAP_ID_DSN Device Serial Number * %PCI_EXT_CAP_ID_PWR Power Budgeting */ -int pci_find_ext_capability(struct pci_dev *dev, int cap) +u16 pci_find_ext_capability(struct pci_dev *dev, int cap) { return pci_find_next_ext_capability(dev, 0, cap); } diff --git a/include/linux/pci.h b/include/linux/pci.h index e615f8abdd79..441e5753da0c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -381,7 +381,7 @@ struct pci_dev { struct pcie_link_state *link_state; /* ASPM link state */ unsigned int ltr_path:1; /* Latency Tolerance Reporting supported from root to here */ - int l1ss; /* L1SS Capability pointer */ + u16 l1ss; /* L1SS Capability pointer */ #endif unsigned int eetlp_prefix_path:1; /* End-to-End TLP Prefix */ @@ -1069,8 +1069,8 @@ u8 pci_find_capability(struct pci_dev *dev, int cap); u8 pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap); u8 pci_find_ht_capability(struct pci_dev *dev, int ht_cap); u8 pci_find_next_ht_capability(struct pci_dev *dev, u8 pos, int ht_cap); -int pci_find_ext_capability(struct pci_dev *dev, int cap); -int pci_find_next_ext_capability(struct pci_dev *dev, int pos, int cap); +u16 pci_find_ext_capability(struct pci_dev *dev, int cap); +u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 pos, int cap); struct pci_bus *pci_find_next_bus(const struct pci_bus *from); u64 pci_get_dsn(struct pci_dev *dev); -- cgit From dba4a9256bb4d78ef89aaad5f49787aa27fcd5b4 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Fri, 4 Dec 2020 12:36:04 +0100 Subject: net: Remove the err argument from sock_from_file Currently, the sock_from_file prototype takes an "err" pointer that is either not set or set to -ENOTSOCK IFF the returned socket is NULL. This makes the error redundant and it is ignored by a few callers. This patch simplifies the API by letting callers deduce the error based on whether the returned socket is NULL or not. 
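For illustration (not part of the patch below), a minimal sketch of the resulting call-site pattern; the helper name is hypothetical:

static int frob_socket_file(struct file *file)
{
	struct socket *sock;

	sock = sock_from_file(file);	/* no "err" out-parameter anymore */
	if (!sock)
		return -ENOTSOCK;	/* error deduced at the call site */

	/* ... operate on sock ... */
	return 0;
}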
Suggested-by: Al Viro Signed-off-by: Florent Revest Signed-off-by: Daniel Borkmann Reviewed-by: KP Singh Link: https://lore.kernel.org/bpf/20201204113609.1850150-1-revest@google.com --- fs/eventpoll.c | 3 +-- fs/io_uring.c | 16 ++++++++-------- include/linux/net.h | 2 +- net/core/netclassid_cgroup.c | 3 +-- net/core/netprio_cgroup.c | 3 +-- net/core/sock.c | 8 +------- net/socket.c | 27 ++++++++++++++++----------- 7 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 73c346e503d7..19499b7bb82c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -416,12 +416,11 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) unsigned int napi_id; struct socket *sock; struct sock *sk; - int err; if (!net_busy_loop_on()) return; - sock = sock_from_file(epi->ffd.file, &err); + sock = sock_from_file(epi->ffd.file); if (!sock) return; diff --git a/fs/io_uring.c b/fs/io_uring.c index 1023f7b44cea..7942a42be4c3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4356,9 +4356,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; if (req->async_data) { kmsg = req->async_data; @@ -4405,9 +4405,9 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) @@ -4584,9 +4584,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret, cflags = 0; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; if (req->async_data) { kmsg = req->async_data; @@ -4647,9 +4647,9 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret, cflags = 0; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; if (req->flags & REQ_F_BUFFER_SELECT) { kbuf = io_recv_buffer_select(req, !force_nonblock); diff --git a/include/linux/net.h b/include/linux/net.h index 0dcd51feef02..9e2324efc26a 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -240,7 +240,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg); int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags); struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname); struct socket *sockfd_lookup(int fd, int *err); -struct socket *sock_from_file(struct file *file, int *err); +struct socket *sock_from_file(struct file *file); #define sockfd_put(sock) fput(sock->file) int net_ratelimit(void); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 41b24cd31562..b49c57d35a88 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -68,9 +68,8 @@ struct update_classid_context { static int update_classid_sock(const void *v, struct file *file, unsigned n) { - int err; struct update_classid_context *ctx = (void *)v; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); if (sock) { spin_lock(&cgroup_sk_update_lock); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9bd4cab7d510..99a431c56f23 100644 --- 
a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -220,8 +220,7 @@ static ssize_t write_priomap(struct kernfs_open_file *of, static int update_netprio(const void *v, struct file *file, unsigned n) { - int err; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); if (sock) { spin_lock(&cgroup_sk_update_lock); sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, diff --git a/net/core/sock.c b/net/core/sock.c index 4fd7e785f177..bbcd4b97eddd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2827,14 +2827,8 @@ EXPORT_SYMBOL(sock_no_mmap); void __receive_sock(struct file *file) { struct socket *sock; - int error; - /* - * The resulting value of "error" is ignored here since we only - * need to take action when the file is a socket and testing - * "sock" for NULL is sufficient. - */ - sock = sock_from_file(file, &error); + sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); diff --git a/net/socket.c b/net/socket.c index bfef11ba35b8..9a240b45bdf3 100644 --- a/net/socket.c +++ b/net/socket.c @@ -445,17 +445,15 @@ static int sock_map_fd(struct socket *sock, int flags) /** * sock_from_file - Return the &socket bounded to @file. * @file: file - * @err: pointer to an error code return * - * On failure returns %NULL and assigns -ENOTSOCK to @err. + * On failure returns %NULL. */ -struct socket *sock_from_file(struct file *file, int *err) +struct socket *sock_from_file(struct file *file) { if (file->f_op == &socket_file_ops) return file->private_data; /* set in sock_map_fd */ - *err = -ENOTSOCK; return NULL; } EXPORT_SYMBOL(sock_from_file); @@ -484,9 +482,11 @@ struct socket *sockfd_lookup(int fd, int *err) return NULL; } - sock = sock_from_file(file, err); - if (!sock) + sock = sock_from_file(file); + if (!sock) { + *err = -ENOTSOCK; fput(file); + } return sock; } EXPORT_SYMBOL(sockfd_lookup); @@ -498,11 +498,12 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) *err = -EBADF; if (f.file) { - sock = sock_from_file(f.file, err); + sock = sock_from_file(f.file); if (likely(sock)) { *fput_needed = f.flags & FDPUT_FPUT; return sock; } + *err = -ENOTSOCK; fdput(f); } return NULL; @@ -1693,9 +1694,11 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - sock = sock_from_file(file, &err); - if (!sock) + sock = sock_from_file(file); + if (!sock) { + err = -ENOTSOCK; goto out; + } err = -ENFILE; newsock = sock_alloc(); @@ -1818,9 +1821,11 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, struct socket *sock; int err; - sock = sock_from_file(file, &err); - if (!sock) + sock = sock_from_file(file); + if (!sock) { + err = -ENOTSOCK; goto out; + } err = security_socket_connect(sock, (struct sockaddr *)address, addrlen); -- cgit From 65f33b35722952fa076811d5686bfd8a611a80fa Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 4 Dec 2020 17:21:03 -0500 Subject: block: fix incorrect branching in blk_max_size_offset() If non-zero 'chunk_sectors' is passed in to blk_max_size_offset() that override will be incorrectly ignored. Old blk_max_size_offset() branching, prior to commit 3ee16db390b4, must be used only if passed 'chunk_sectors' override is zero. 
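To make the bug concrete, an illustrative walk-through of the old branching (restating hunks already shown in this series, not new code):

/*
 * Branching introduced by commit 3ee16db390b4:
 *
 *	if (!chunk_sectors && q->limits.chunk_sectors)
 *		chunk_sectors = q->limits.chunk_sectors;
 *	else
 *		return q->limits.max_sectors;
 *
 * With a non-zero 'chunk_sectors' override passed in, the condition is
 * false, so control falls into the else branch and the function returns
 * q->limits.max_sectors: the caller's override is silently discarded.
 * Only a zero override may be replaced by q->limits.chunk_sectors (or,
 * failing that, give way to the max_sectors fallback), which is what
 * the hunk below restores.
 */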
Fixes: 3ee16db390b4 ("dm: fix IO splitting") Cc: stable@vger.kernel.org # 5.9 Reported-by: John Dorminy Signed-off-by: Mike Snitzer --- include/linux/blkdev.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 24ae504cf77d..033eb5f73b65 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1076,10 +1076,12 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, sector_t offset, unsigned int chunk_sectors) { - if (!chunk_sectors && q->limits.chunk_sectors) - chunk_sectors = q->limits.chunk_sectors; - else - return q->limits.max_sectors; + if (!chunk_sectors) { + if (q->limits.chunk_sectors) + chunk_sectors = q->limits.chunk_sectors; + else + return q->limits.max_sectors; + } if (likely(is_power_of_2(chunk_sectors))) chunk_sectors -= offset & (chunk_sectors - 1); -- cgit From 99efde6c9bb7b42eac0459876bf964fe08e5cef9 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 25 Nov 2020 12:07:33 +0300 Subject: PCI/PM: Rename pci_wakeup_bus() to pci_resume_bus() A "wakeup" is a signal from a device telling the system that the device or the whole system should be awakened and made active. PCI devices are made active by "resuming" them. pci_wakeup_bus() is not involved with the wakeup signal; it *resumes* devices on a bus (possibly in response to a wakeup signal, but that's at a higher level). Rename pci_wakeup_bus() to pci_resume_bus() to better reflect what it does. No functional change intended. [bhelgaas: commit log, reorder before removal of pci_wakeup_event()] Link: https://lore.kernel.org/r/20201125090733.77782-2-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki --- drivers/gpu/vga/vga_switcheroo.c | 2 +- drivers/pci/pci.c | 15 +++++---------- include/linux/pci.h | 2 +- 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/vga/vga_switcheroo.c b/drivers/gpu/vga/vga_switcheroo.c index 087304b1a5d7..8843b078ad4e 100644 --- a/drivers/gpu/vga/vga_switcheroo.c +++ b/drivers/gpu/vga/vga_switcheroo.c @@ -1039,7 +1039,7 @@ static int vga_switcheroo_runtime_resume(struct device *dev) mutex_lock(&vgasr_mutex); vga_switcheroo_power_switch(pdev, VGA_SWITCHEROO_ON); mutex_unlock(&vgasr_mutex); - pci_wakeup_bus(pdev->bus); + pci_resume_bus(pdev->bus); ret = dev->bus->pm->runtime_resume(dev); if (ret) return ret; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e578d34095e9..664aada79de2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1174,12 +1174,7 @@ int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state) } EXPORT_SYMBOL_GPL(pci_platform_power_transition); -/** - * pci_wakeup - Wake up a PCI device - * @pci_dev: Device to handle. - * @ign: ignored parameter - */ -static int pci_wakeup(struct pci_dev *pci_dev, void *ign) +static int pci_resume_one(struct pci_dev *pci_dev, void *ign) { pci_wakeup_event(pci_dev); pm_request_resume(&pci_dev->dev); @@ -1187,13 +1182,13 @@ static int pci_wakeup(struct pci_dev *pci_dev, void *ign) } /** - * pci_wakeup_bus - Walk given bus and wake up devices on it + * pci_resume_bus - Walk given bus and runtime resume devices on it * @bus: Top bus of the subtree to walk. 
*/ -void pci_wakeup_bus(struct pci_bus *bus) +void pci_resume_bus(struct pci_bus *bus) { if (bus) - pci_walk_bus(bus, pci_wakeup, NULL); + pci_walk_bus(bus, pci_resume_one, NULL); } static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout) @@ -1256,7 +1251,7 @@ int pci_power_up(struct pci_dev *dev) * may be powered on into D0uninitialized state, resume them to * give them a chance to suspend again */ - pci_wakeup_bus(dev->subordinate); + pci_resume_bus(dev->subordinate); } return pci_raw_set_power_state(dev, PCI_D0); diff --git a/include/linux/pci.h b/include/linux/pci.h index 22207a79762c..9256ef2e4327 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1271,7 +1271,7 @@ bool pci_dev_run_wake(struct pci_dev *dev); void pci_d3cold_enable(struct pci_dev *dev); void pci_d3cold_disable(struct pci_dev *dev); bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); -void pci_wakeup_bus(struct pci_bus *bus); +void pci_resume_bus(struct pci_bus *bus); void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state); /* For use by arch with custom probe code */ -- cgit From ed9b25d1970a4787ac6a39c2091e63b127ecbfc1 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Sun, 15 Nov 2020 21:55:31 -0600 Subject: [SECURITY] fix namespaced fscaps when !CONFIG_SECURITY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Namespaced file capabilities were introduced in 8db6c34f1dbc . When userspace reads an xattr for a namespaced capability, a virtualized representation of it is returned if the caller is in a user namespace owned by the capability's owning rootid. The function which performs this virtualization was not hooked up if CONFIG_SECURITY=n. Therefore in that case the original xattr was shown instead of the virtualized one. To test this using libcap-bin (*1), $ v=$(mktemp) $ unshare -Ur setcap cap_sys_admin-eip $v $ unshare -Ur setcap -v cap_sys_admin-eip $v /tmp/tmp.lSiIFRvt8Y: OK "setcap -v" verifies the values instead of setting them, and will check whether the rootid value is set. Therefore, with this bug un-fixed, and with CONFIG_SECURITY=n, setcap -v will fail: $ v=$(mktemp) $ unshare -Ur setcap cap_sys_admin=eip $v $ unshare -Ur setcap -v cap_sys_admin=eip $v nsowner[got=1000, want=0],/tmp/tmp.HHDiOOl9fY differs in [] Fix this bug by calling cap_inode_getsecurity() in security_inode_getsecurity() instead of returning -EOPNOTSUPP, when CONFIG_SECURITY=n. *1 - note, if libcap is too old for getcap to have the '-n' option, then use verify-caps instead. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=209689 Cc: Hervé Guillemet Acked-by: Casey Schaufler Signed-off-by: Serge Hallyn Signed-off-by: Andrew G. 
Morgan Signed-off-by: James Morris --- include/linux/security.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 0a0a03b36a3b..2befc0a25eb3 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -864,7 +864,7 @@ static inline int security_inode_killpriv(struct dentry *dentry) static inline int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc) { - return -EOPNOTSUPP; + return cap_inode_getsecurity(inode, name, buffer, alloc); } static inline int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) -- cgit From 507b460f814458605c47b0ed03c11e49a712fc08 Mon Sep 17 00:00:00 2001 From: Sean V Kelley Date: Fri, 20 Nov 2020 16:10:32 -0800 Subject: PCI/ERR: Add pcie_link_rcec() to associate RCiEPs A Root Complex Event Collector terminates error and PME messages from associated RCiEPs. Use the RCEC Endpoint Association Extended Capability to identify associated RCiEPs. Link the associated RCiEPs as the RCECs are enumerated. Co-developed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20201121001036.8560-12-sean.v.kelley@intel.com Tested-by: Jonathan Cameron # non-native/no RCEC Signed-off-by: Qiuxu Zhuo Signed-off-by: Sean V Kelley Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron --- drivers/pci/pci.h | 2 + drivers/pci/pcie/portdrv_pci.c | 3 ++ drivers/pci/pcie/rcec.c | 94 ++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 1 + 4 files changed, 100 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 3c4570a3058f..ae2ee4df1cff 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -472,9 +472,11 @@ static inline void pci_dpc_init(struct pci_dev *pdev) {} #ifdef CONFIG_PCIEPORTBUS void pci_rcec_init(struct pci_dev *dev); void pci_rcec_exit(struct pci_dev *dev); +void pcie_link_rcec(struct pci_dev *rcec); #else static inline void pci_rcec_init(struct pci_dev *dev) {} static inline void pci_rcec_exit(struct pci_dev *dev) {} +static inline void pcie_link_rcec(struct pci_dev *rcec) {} #endif #ifdef CONFIG_PCI_ATS diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index ff9517ee92b3..0b250bc5f405 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -111,6 +111,9 @@ static int pcie_portdrv_probe(struct pci_dev *dev, (type != PCI_EXP_TYPE_RC_EC))) return -ENODEV; + if (type == PCI_EXP_TYPE_RC_EC) + pcie_link_rcec(dev); + status = pcie_port_device_register(dev); if (status) return status; diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c index 038e9d706d5f..cdec277cbd62 100644 --- a/drivers/pci/pcie/rcec.c +++ b/drivers/pci/pcie/rcec.c @@ -15,6 +15,100 @@ #include "../pci.h" +struct walk_rcec_data { + struct pci_dev *rcec; + int (*user_callback)(struct pci_dev *dev, void *data); + void *user_data; +}; + +static bool rcec_assoc_rciep(struct pci_dev *rcec, struct pci_dev *rciep) +{ + unsigned long bitmap = rcec->rcec_ea->bitmap; + unsigned int devn; + + /* An RCiEP found on a different bus in range */ + if (rcec->bus->number != rciep->bus->number) + return true; + + /* Same bus, so check bitmap */ + for_each_set_bit(devn, &bitmap, 32) + if (devn == rciep->devfn) + return true; + + return false; +} + +static int link_rcec_helper(struct pci_dev *dev, void *data) +{ + struct walk_rcec_data *rcec_data = data; + struct pci_dev *rcec = rcec_data->rcec; + + if 
((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) && + rcec_assoc_rciep(rcec, dev)) { + dev->rcec = rcec; + pci_dbg(dev, "PME & error events signaled via %s\n", + pci_name(rcec)); + } + + return 0; +} + +static void walk_rcec(int (*cb)(struct pci_dev *dev, void *data), + void *userdata) +{ + struct walk_rcec_data *rcec_data = userdata; + struct pci_dev *rcec = rcec_data->rcec; + u8 nextbusn, lastbusn; + struct pci_bus *bus; + unsigned int bnr; + + if (!rcec->rcec_ea) + return; + + /* Walk own bus for bitmap based association */ + pci_walk_bus(rcec->bus, cb, rcec_data); + + nextbusn = rcec->rcec_ea->nextbusn; + lastbusn = rcec->rcec_ea->lastbusn; + + /* All RCiEP devices are on the same bus as the RCEC */ + if (nextbusn == 0xff && lastbusn == 0x00) + return; + + for (bnr = nextbusn; bnr <= lastbusn; bnr++) { + /* No association indicated (PCIe 5.0-1, 7.9.10.3) */ + if (bnr == rcec->bus->number) + continue; + + bus = pci_find_bus(pci_domain_nr(rcec->bus), bnr); + if (!bus) + continue; + + /* Find RCiEP devices on the given bus ranges */ + pci_walk_bus(bus, cb, rcec_data); + } +} + +/** + * pcie_link_rcec - Link RCiEP devices associated with RCEC. + * @rcec: RCEC whose RCiEP devices should be linked. + * + * Link the given RCEC to each RCiEP device found. + */ +void pcie_link_rcec(struct pci_dev *rcec) +{ + struct walk_rcec_data rcec_data; + + if (!rcec->rcec_ea) + return; + + rcec_data.rcec = rcec; + rcec_data.user_callback = NULL; + rcec_data.user_data = NULL; + + walk_rcec(link_rcec_helper, &rcec_data); +} + void pci_rcec_init(struct pci_dev *dev) { struct rcec_ea *rcec_ea; diff --git a/include/linux/pci.h b/include/linux/pci.h index f8c927fd0602..7c7d2d23e8a3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -330,6 +330,7 @@ struct pci_dev { #endif #ifdef CONFIG_PCIEPORTBUS struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */ + struct pci_dev *rcec; /* Associated RCEC device */ #endif u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ -- cgit From 601c10c89cbb32b9123d8716d193e6d1a8e5300d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 5 Oct 2020 11:13:56 +0300 Subject: net/mlx5: Delete custom device management logic After conversion to use auxiliary bus, all custom device management is not needed anymore, delete it. 
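For background, a rough sketch of the auxiliary bus pattern that replaces the custom interface list. This is an illustrative skeleton under the assumption of the auxiliary bus API merged alongside this series; the ".rdma" suffix mirrors the mlx5_adev_devices table in the diff below, while the driver itself is hypothetical:

#include <linux/auxiliary_bus.h>
#include <linux/mlx5/driver.h>

/* Hypothetical protocol driver binding to "mlx5_core.rdma" devices. */
static int mlx5r_probe(struct auxiliary_device *adev,
		       const struct auxiliary_device_id *id)
{
	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
	struct mlx5_core_dev *mdev = madev->mdev;

	/* ... allocate and register the protocol device against mdev ... */
	return 0;
}

static void mlx5r_remove(struct auxiliary_device *adev)
{
	/* ... tear down the protocol device ... */
}

static const struct auxiliary_device_id mlx5r_id_table[] = {
	{ .name = MLX5_ADEV_NAME ".rdma" },
	{},
};

static struct auxiliary_driver mlx5r_driver = {
	.name = "rdma",
	.probe = mlx5r_probe,
	.remove = mlx5r_remove,
	.id_table = mlx5r_id_table,
};
module_auxiliary_driver(mlx5r_driver);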
Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/dev.c | 288 ++------------------- drivers/net/ethernet/mellanox/mlx5/core/lag.c | 18 -- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 8 - include/linux/mlx5/driver.h | 22 -- 4 files changed, 18 insertions(+), 318 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index 843a8579d8c8..3a81c2f1971b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -35,24 +35,10 @@ #include #include "mlx5_core.h" -static LIST_HEAD(intf_list); -static LIST_HEAD(mlx5_dev_list); /* intf dev list mutex */ static DEFINE_MUTEX(mlx5_intf_mutex); static DEFINE_IDA(mlx5_adev_ida); -struct mlx5_device_context { - struct list_head list; - struct mlx5_interface *intf; - void *context; - unsigned long state; -}; - -enum { - MLX5_INTERFACE_ADDED, - MLX5_INTERFACE_ATTACHED, -}; - static bool is_eth_rep_supported(struct mlx5_core_dev *dev) { if (!IS_ENABLED(CONFIG_MLX5_ESWITCH)) @@ -204,11 +190,22 @@ static bool is_ib_supported(struct mlx5_core_dev *dev) return true; } +enum { + MLX5_INTERFACE_PROTOCOL_ETH_REP, + MLX5_INTERFACE_PROTOCOL_ETH, + + MLX5_INTERFACE_PROTOCOL_IB_REP, + MLX5_INTERFACE_PROTOCOL_MPIB, + MLX5_INTERFACE_PROTOCOL_IB, + + MLX5_INTERFACE_PROTOCOL_VNET, +}; + static const struct mlx5_adev_device { const char *suffix; bool (*is_supported)(struct mlx5_core_dev *dev); } mlx5_adev_devices[] = { - [MLX5_INTERFACE_PROTOCOL_VDPA] = { .suffix = "vnet", + [MLX5_INTERFACE_PROTOCOL_VNET] = { .suffix = "vnet", .is_supported = &is_vnet_supported }, [MLX5_INTERFACE_PROTOCOL_IB] = { .suffix = "rdma", .is_supported = &is_ib_supported }, @@ -251,90 +248,6 @@ void mlx5_adev_cleanup(struct mlx5_core_dev *dev) kfree(priv->adev); } -void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) -{ - struct mlx5_device_context *dev_ctx; - struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); - - if (!mlx5_lag_intf_add(intf, priv)) - return; - - dev_ctx = kzalloc(sizeof(*dev_ctx), GFP_KERNEL); - if (!dev_ctx) - return; - - dev_ctx->intf = intf; - - dev_ctx->context = intf->add(dev); - if (dev_ctx->context) { - set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); - if (intf->attach) - set_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); - - spin_lock_irq(&priv->ctx_lock); - list_add_tail(&dev_ctx->list, &priv->ctx_list); - spin_unlock_irq(&priv->ctx_lock); - } - - if (!dev_ctx->context) - kfree(dev_ctx); -} - -static struct mlx5_device_context *mlx5_get_device(struct mlx5_interface *intf, - struct mlx5_priv *priv) -{ - struct mlx5_device_context *dev_ctx; - - list_for_each_entry(dev_ctx, &priv->ctx_list, list) - if (dev_ctx->intf == intf) - return dev_ctx; - return NULL; -} - -void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv) -{ - struct mlx5_device_context *dev_ctx; - struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); - - dev_ctx = mlx5_get_device(intf, priv); - if (!dev_ctx) - return; - - spin_lock_irq(&priv->ctx_lock); - list_del(&dev_ctx->list); - spin_unlock_irq(&priv->ctx_lock); - - if (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) - intf->remove(dev, dev_ctx->context); - - kfree(dev_ctx); -} - -static void mlx5_attach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv) -{ - struct mlx5_device_context *dev_ctx; - struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); - - dev_ctx 
= mlx5_get_device(intf, priv); - if (!dev_ctx) - return; - - if (intf->attach) { - if (test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state)) - return; - if (intf->attach(dev, dev_ctx->context)) - return; - set_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); - } else { - if (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) - return; - dev_ctx->context = intf->add(dev); - if (!dev_ctx->context) - return; - set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); - } -} - static void adev_release(struct device *dev) { struct mlx5_adev *mlx5_adev = @@ -390,7 +303,6 @@ int mlx5_attach_device(struct mlx5_core_dev *dev) struct mlx5_priv *priv = &dev->priv; struct auxiliary_device *adev; struct auxiliary_driver *adrv; - struct mlx5_interface *intf; int ret = 0, i; mutex_lock(&mlx5_intf_mutex); @@ -423,41 +335,15 @@ int mlx5_attach_device(struct mlx5_core_dev *dev) break; } } - - list_for_each_entry(intf, &intf_list, list) - mlx5_attach_interface(intf, priv); mutex_unlock(&mlx5_intf_mutex); return ret; } -static void mlx5_detach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv) -{ - struct mlx5_device_context *dev_ctx; - struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); - - dev_ctx = mlx5_get_device(intf, priv); - if (!dev_ctx) - return; - - if (intf->detach) { - if (!test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state)) - return; - intf->detach(dev, dev_ctx->context); - clear_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); - } else { - if (!test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) - return; - intf->remove(dev, dev_ctx->context); - clear_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); - } -} - void mlx5_detach_device(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct auxiliary_device *adev; struct auxiliary_driver *adrv; - struct mlx5_interface *intf; pm_message_t pm = {}; int i; @@ -477,30 +363,11 @@ void mlx5_detach_device(struct mlx5_core_dev *dev) del_adev(&priv->adev[i]->adev); priv->adev[i] = NULL; } - - list_for_each_entry(intf, &intf_list, list) - mlx5_detach_interface(intf, priv); mutex_unlock(&mlx5_intf_mutex); } -bool mlx5_device_registered(struct mlx5_core_dev *dev) -{ - struct mlx5_priv *priv; - bool found = false; - - mutex_lock(&mlx5_intf_mutex); - list_for_each_entry(priv, &mlx5_dev_list, dev_list) - if (priv == &dev->priv) - found = true; - mutex_unlock(&mlx5_intf_mutex); - - return found; -} - int mlx5_register_device(struct mlx5_core_dev *dev) { - struct mlx5_priv *priv = &dev->priv; - struct mlx5_interface *intf; int ret; mutex_lock(&mlx5_intf_mutex); @@ -508,65 +375,19 @@ int mlx5_register_device(struct mlx5_core_dev *dev) ret = mlx5_rescan_drivers_locked(dev); mutex_unlock(&mlx5_intf_mutex); if (ret) - goto add_err; - - mutex_lock(&mlx5_intf_mutex); - list_add_tail(&priv->dev_list, &mlx5_dev_list); - list_for_each_entry(intf, &intf_list, list) - mlx5_add_device(intf, priv); - mutex_unlock(&mlx5_intf_mutex); - - return 0; + mlx5_unregister_device(dev); -add_err: - mlx5_unregister_device(dev); return ret; } void mlx5_unregister_device(struct mlx5_core_dev *dev) { - struct mlx5_priv *priv = &dev->priv; - struct mlx5_interface *intf; - mutex_lock(&mlx5_intf_mutex); - list_for_each_entry_reverse(intf, &intf_list, list) - mlx5_remove_device(intf, priv); - list_del(&priv->dev_list); - dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV; mlx5_rescan_drivers_locked(dev); mutex_unlock(&mlx5_intf_mutex); } -int mlx5_register_interface(struct mlx5_interface *intf) -{ - struct mlx5_priv *priv; - - if (!intf->add || !intf->remove) - return 
-EINVAL; - - mutex_lock(&mlx5_intf_mutex); - list_add_tail(&intf->list, &intf_list); - list_for_each_entry(priv, &mlx5_dev_list, dev_list) - mlx5_add_device(intf, priv); - mutex_unlock(&mlx5_intf_mutex); - - return 0; -} -EXPORT_SYMBOL(mlx5_register_interface); - -void mlx5_unregister_interface(struct mlx5_interface *intf) -{ - struct mlx5_priv *priv; - - mutex_lock(&mlx5_intf_mutex); - list_for_each_entry(priv, &mlx5_dev_list, dev_list) - mlx5_remove_device(intf, priv); - list_del(&intf->list); - mutex_unlock(&mlx5_intf_mutex); -} -EXPORT_SYMBOL(mlx5_unregister_interface); - static int add_drivers(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; @@ -638,59 +459,6 @@ int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev) return add_drivers(dev); } -/* Must be called with intf_mutex held */ -static bool mlx5_has_added_dev_by_protocol(struct mlx5_core_dev *mdev, int protocol) -{ - struct mlx5_device_context *dev_ctx; - struct mlx5_interface *intf; - bool found = false; - - list_for_each_entry(intf, &intf_list, list) { - if (intf->protocol == protocol) { - dev_ctx = mlx5_get_device(intf, &mdev->priv); - if (dev_ctx && test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) - found = true; - break; - } - } - - return found; -} - -void mlx5_reload_interface(struct mlx5_core_dev *mdev, int protocol) -{ - mutex_lock(&mlx5_intf_mutex); - if (mlx5_has_added_dev_by_protocol(mdev, protocol)) { - mlx5_remove_dev_by_protocol(mdev, protocol); - mlx5_add_dev_by_protocol(mdev, protocol); - } - mutex_unlock(&mlx5_intf_mutex); -} - -/* Must be called with intf_mutex held */ -void mlx5_add_dev_by_protocol(struct mlx5_core_dev *dev, int protocol) -{ - struct mlx5_interface *intf; - - list_for_each_entry(intf, &intf_list, list) - if (intf->protocol == protocol) { - mlx5_add_device(intf, &dev->priv); - break; - } -} - -/* Must be called with intf_mutex held */ -void mlx5_remove_dev_by_protocol(struct mlx5_core_dev *dev, int protocol) -{ - struct mlx5_interface *intf; - - list_for_each_entry(intf, &intf_list, list) - if (intf->protocol == protocol) { - mlx5_remove_device(intf, &dev->priv); - break; - } -} - static u32 mlx5_gen_pci_id(const struct mlx5_core_dev *dev) { return (u32)((pci_domain_nr(dev->pdev->bus) << 16) | @@ -722,45 +490,25 @@ static int next_phys_dev(struct device *dev, const void *data) */ struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev) { - struct mlx5_core_dev *res = NULL; - struct mlx5_core_dev *tmp_dev; struct auxiliary_device *adev; struct mlx5_adev *madev; - struct mlx5_priv *priv; - u32 pci_id; if (!mlx5_core_is_pf(dev)) return NULL; adev = auxiliary_find_device(NULL, dev, &next_phys_dev); - if (adev) { - madev = container_of(adev, struct mlx5_adev, adev); - - put_device(&adev->dev); - return madev->mdev; - } - - pci_id = mlx5_gen_pci_id(dev); - list_for_each_entry(priv, &mlx5_dev_list, dev_list) { - tmp_dev = container_of(priv, struct mlx5_core_dev, priv); - if (!mlx5_core_is_pf(tmp_dev)) - continue; - - if ((dev != tmp_dev) && (mlx5_gen_pci_id(tmp_dev) == pci_id)) { - res = tmp_dev; - break; - } - } + if (!adev) + return NULL; - return res; + madev = container_of(adev, struct mlx5_adev, adev); + put_device(&adev->dev); + return madev->mdev; } - void mlx5_dev_list_lock(void) { mutex_lock(&mlx5_intf_mutex); } - void mlx5_dev_list_unlock(void) { mutex_unlock(&mlx5_intf_mutex); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index 325f32b9525c..f3d45ef082cd 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -749,24 +749,6 @@ unlock: } EXPORT_SYMBOL(mlx5_lag_get_slave_port); -bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv) -{ - struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, - priv); - struct mlx5_lag *ldev; - - if (intf->protocol != MLX5_INTERFACE_PROTOCOL_IB) - return true; - - ldev = mlx5_lag_dev_get(dev); - if (!ldev || !__mlx5_lag_is_roce(ldev) || - ldev->pf[MLX5_LAG_P1].dev == dev) - return true; - - /* If bonded, we do not add an IB device for PF1. */ - return false; -} - int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, u64 *values, int num_counters, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index f7e44e04d465..dd7312621d0d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -183,22 +183,15 @@ void mlx5_adev_idx_free(int idx); void mlx5_adev_cleanup(struct mlx5_core_dev *dev); int mlx5_adev_init(struct mlx5_core_dev *dev); -void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv); -void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv); int mlx5_attach_device(struct mlx5_core_dev *dev); void mlx5_detach_device(struct mlx5_core_dev *dev); -bool mlx5_device_registered(struct mlx5_core_dev *dev); int mlx5_register_device(struct mlx5_core_dev *dev); void mlx5_unregister_device(struct mlx5_core_dev *dev); -void mlx5_add_dev_by_protocol(struct mlx5_core_dev *dev, int protocol); -void mlx5_remove_dev_by_protocol(struct mlx5_core_dev *dev, int protocol); struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev); void mlx5_dev_list_lock(void); void mlx5_dev_list_unlock(void); int mlx5_dev_list_trylock(void); -bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv); - int mlx5_query_mtpps(struct mlx5_core_dev *dev, u32 *mtpps, u32 mtpps_size); int mlx5_set_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size); int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode); @@ -248,7 +241,6 @@ static inline int mlx5_rescan_drivers(struct mlx5_core_dev *dev) return ret; } -void mlx5_reload_interface(struct mlx5_core_dev *mdev, int protocol); void mlx5_lag_update(struct mlx5_core_dev *dev); enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e31c72693bcf..54299305a2c8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1076,28 +1076,6 @@ enum { MAX_MR_CACHE_ENTRIES }; -enum { - MLX5_INTERFACE_PROTOCOL_ETH_REP, - MLX5_INTERFACE_PROTOCOL_ETH, - - MLX5_INTERFACE_PROTOCOL_IB_REP, - MLX5_INTERFACE_PROTOCOL_MPIB, - MLX5_INTERFACE_PROTOCOL_IB, - - MLX5_INTERFACE_PROTOCOL_VDPA, -}; - -struct mlx5_interface { - void * (*add)(struct mlx5_core_dev *dev); - void (*remove)(struct mlx5_core_dev *dev, void *context); - int (*attach)(struct mlx5_core_dev *dev, void *context); - void (*detach)(struct mlx5_core_dev *dev, void *context); - int protocol; - struct list_head list; -}; - -int mlx5_register_interface(struct mlx5_interface *intf); -void mlx5_unregister_interface(struct mlx5_interface *intf); int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb); int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb); int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb); -- cgit From 
e87114022e1de734de0552e6b4f2dc5309efa27a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 10 Oct 2020 11:57:26 +0300 Subject: net/mlx5: Simplify eswitch mode check Provide the mlx5_core device instead of the "priv" pointer while checking the eswitch mode. Reviewed-by: Roi Dayan Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 7 ------- drivers/infiniband/hw/mlx5/ib_rep.c | 5 ----- drivers/infiniband/hw/mlx5/ib_rep.h | 6 ------ drivers/net/ethernet/mellanox/mlx5/core/dev.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 8 +++----- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 4 +++- include/linux/mlx5/eswitch.h | 8 ++++++-- 9 files changed, 16 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 70c8fd67ee2f..084652e2b15a 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -138,13 +138,6 @@ static int mlx5_ib_create_counters(struct ib_counters *counters, } -static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev) -{ - return MLX5_ESWITCH_MANAGER(mdev) && - mlx5_ib_eswitch_mode(mdev->priv.eswitch) == - MLX5_ESWITCH_OFFLOADS; -} - static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev, u8 port_num) { diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 3d889a70130b..571b7a0f9188 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -102,11 +102,6 @@ static const struct mlx5_eswitch_rep_ops rep_ops = { .get_proto_dev = mlx5_ib_vport_get_proto_dev, }; -u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw) -{ - return mlx5_eswitch_mode(esw); -} - struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw, u16 vport_num) { diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h index 94bf51ddd422..93f562735e89 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.h +++ b/drivers/infiniband/hw/mlx5/ib_rep.h @@ -12,7 +12,6 @@ extern const struct mlx5_ib_profile raw_eth_profile; #ifdef CONFIG_MLX5_ESWITCH -u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw); struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw, u16 vport_num); struct mlx5_ib_dev *mlx5_ib_get_uplink_ibdev(struct mlx5_eswitch *esw); @@ -26,11 +25,6 @@ struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, u16 vport_num); #else /* CONFIG_MLX5_ESWITCH */ -static inline u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw) -{ - return MLX5_ESWITCH_NONE; -} - static inline struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw, u16 vport_num) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index 3a81c2f1971b..b051417ede67 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -47,7 +47,7 @@ static bool is_eth_rep_supported(struct mlx5_core_dev *dev) if (!MLX5_ESWITCH_MANAGER(dev)) return false; - if (mlx5_eswitch_mode(dev->priv.eswitch) != MLX5_ESWITCH_OFFLOADS) + if (!is_mdev_switchdev_mode(dev)) return false; return true; @@ -144,7 +144,7 @@ static bool is_ib_rep_supported(struct mlx5_core_dev *dev) if (!MLX5_ESWITCH_MANAGER(dev)) return false; - if
(mlx5_eswitch_mode(dev->priv.eswitch) != MLX5_ESWITCH_OFFLOADS) + if (!is_mdev_switchdev_mode(dev)) return false; if (mlx5_core_mp_enabled(dev)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 1a351e2f6ace..aeffb6b135ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -221,7 +221,7 @@ static int mlx5_devlink_fs_mode_validate(struct devlink *devlink, u32 id, u8 eswitch_mode; bool smfs_cap; - eswitch_mode = mlx5_eswitch_mode(dev->priv.eswitch); + eswitch_mode = mlx5_eswitch_mode(dev); smfs_cap = mlx5_fs_dr_is_supported(dev); if (!smfs_cap) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 25490d1e3216..cd0e2f451342 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3135,7 +3135,7 @@ static void mlx5e_modify_admin_state(struct mlx5_core_dev *mdev, mlx5_set_port_admin_status(mdev, state); - if (!MLX5_ESWITCH_MANAGER(mdev) || mlx5_eswitch_mode(esw) == MLX5_ESWITCH_OFFLOADS) + if (mlx5_eswitch_mode(mdev) != MLX5_ESWITCH_LEGACY) return; if (state == MLX5_PORT_UP) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index ce710f22b1ff..4cdf834fa74a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -271,8 +271,6 @@ mlx5e_tc_match_to_reg_set(struct mlx5_core_dev *mdev, return 0; } -#define esw_offloads_mode(esw) (mlx5_eswitch_mode(esw) == MLX5_ESWITCH_OFFLOADS) - static struct mlx5_tc_ct_priv * get_ct_priv(struct mlx5e_priv *priv) { @@ -280,7 +278,7 @@ get_ct_priv(struct mlx5e_priv *priv) struct mlx5_rep_uplink_priv *uplink_priv; struct mlx5e_rep_priv *uplink_rpriv; - if (esw_offloads_mode(esw)) { + if (is_mdev_switchdev_mode(priv->mdev)) { uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); uplink_priv = &uplink_rpriv->uplink_priv; @@ -297,7 +295,7 @@ mlx5_tc_rule_insert(struct mlx5e_priv *priv, { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - if (esw_offloads_mode(esw)) + if (is_mdev_switchdev_mode(priv->mdev)) return mlx5_eswitch_add_offloaded_rule(esw, spec, attr); return mlx5e_add_offloaded_nic_rule(priv, spec, attr); @@ -310,7 +308,7 @@ mlx5_tc_rule_delete(struct mlx5e_priv *priv, { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - if (esw_offloads_mode(esw)) { + if (is_mdev_switchdev_mode(priv->mdev)) { mlx5_eswitch_del_offloaded_rule(esw, rule, attr); return; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index cb06b6e53fdd..c23dd95e634d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -2436,8 +2436,10 @@ free_out: return err; } -u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw) +u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev) { + struct mlx5_eswitch *esw = dev->priv.eswitch; + return ESW_ALLOWED(esw) ? 
esw->mode : MLX5_ESWITCH_NONE; } EXPORT_SYMBOL_GPL(mlx5_eswitch_mode); diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index b0ae8020f13e..29fd832950e0 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -96,10 +96,10 @@ static inline u32 mlx5_eswitch_get_vport_metadata_mask(void) u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, u16 vport_num); -u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw); +u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); #else /* CONFIG_MLX5_ESWITCH */ -static inline u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw) +static inline u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev) { return MLX5_ESWITCH_NONE; } @@ -136,4 +136,8 @@ mlx5_eswitch_get_vport_metadata_mask(void) } #endif /* CONFIG_MLX5_ESWITCH */ +static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev) +{ + return mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS; +} #endif -- cgit From e91d8d78237de8d7120c320b3645b7100848f24d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 5 Dec 2020 22:14:51 -0800 Subject: mm/zsmalloc.c: drop ZSMALLOC_PGTABLE_MAPPING While I was doing zram testing, I found that decompression sometimes failed because the compression buffer was corrupted. On investigation, I found that the commit below calls cond_resched() unconditionally, so it can cause a problem in atomic context if the task is rescheduled. BUG: sleeping function called from invalid context at mm/vmalloc.c:108 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 946, name: memhog 3 locks held by memhog/946: #0: ffff9d01d4b193e8 (&mm->mmap_lock#2){++++}-{4:4}, at: __mm_populate+0x103/0x160 #1: ffffffffa3d53de0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0xa98/0x1160 #2: ffff9d01d56b8110 (&zspage->lock){.+.+}-{3:3}, at: zs_map_object+0x8e/0x1f0 CPU: 0 PID: 946 Comm: memhog Not tainted 5.9.3-00011-gc5bfc0287345-dirty #316 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 Call Trace: unmap_kernel_range_noflush+0x2eb/0x350 unmap_kernel_range+0x14/0x30 zs_unmap_object+0xd5/0xe0 zram_bvec_rw.isra.0+0x38c/0x8e0 zram_rw_page+0x90/0x101 bdev_write_page+0x92/0xe0 __swap_writepage+0x94/0x4a0 pageout+0xe3/0x3a0 shrink_page_list+0xb94/0xd60 shrink_inactive_list+0x158/0x460 We can fix this by removing the ZSMALLOC_PGTABLE_MAPPING feature (which contains the offending calling code) from zsmalloc. Even though this option showed some improvement (e.g., 30%) on some arm32 platforms, it has been a headache to maintain since it has abused APIs[1] (e.g., unmap_kernel_range in atomic context). Since we are approaching the deprecation of 32-bit machines, the config option has been available only for builtin builds since v5.8, and it is not the default option in zsmalloc, so it's time to drop the option for better maintenance.
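Condensing the trace above into the offending call chain (annotation only; the copy-based path it contrasts with is the one this patch leaves in place):

/*
 * zram write path with ZSMALLOC_PGTABLE_MAPPING enabled:
 *
 *	zram_bvec_rw()
 *	  zs_map_object()		<- takes zspage->lock, atomic from here
 *	  ...
 *	  zs_unmap_object()
 *	    unmap_kernel_range()	<- may cond_resched(), i.e. sleep
 *
 * The copy-based mapping that remains only memcpy()s through a per-CPU
 * buffer and never touches kernel page tables, so it is safe inside
 * this atomic section.
 */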
[1] http://lore.kernel.org/linux-mm/20201105170249.387069-1-minchan@kernel.org Fixes: e47110e90584 ("mm/vunmap: add cond_resched() in vunmap_pmd_range") Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Reviewed-by: Sergey Senozhatsky Cc: Tony Lindgren Cc: Christoph Hellwig Cc: Harish Sriram Cc: Uladzislau Rezki Cc: Link: https://lkml.kernel.org/r/20201117202916.GA3856507@google.com Signed-off-by: Linus Torvalds --- arch/arm/configs/omap2plus_defconfig | 1 - include/linux/zsmalloc.h | 1 - mm/Kconfig | 13 --------- mm/zsmalloc.c | 54 ------------------------------------ 4 files changed, 69 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 34793aabdb65..58df9fd79a76 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -81,7 +81,6 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y CONFIG_CMA=y CONFIG_ZSMALLOC=m -CONFIG_ZSMALLOC_PGTABLE_MAPPING=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 0fdbf653b173..4807ca4d52e0 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -20,7 +20,6 @@ * zsmalloc mapping modes * * NOTE: These only make a difference when a mapped object spans pages. - * They also have no effect when ZSMALLOC_PGTABLE_MAPPING is selected. */ enum zs_mapmode { ZS_MM_RW, /* normal read-write mapping */ diff --git a/mm/Kconfig b/mm/Kconfig index d42423f884a7..390165ffbb0f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -707,19 +707,6 @@ config ZSMALLOC returned by an alloc(). This handle must be mapped in order to access the allocated space. -config ZSMALLOC_PGTABLE_MAPPING - bool "Use page table mapping to access object in zsmalloc" - depends on ZSMALLOC=y - help - By default, zsmalloc uses a copy-based object mapping method to - access allocations that span two pages. However, if a particular - architecture (ex, ARM) performs VM mapping faster than copying, - then you should select this. This causes zsmalloc to use page table - mapping rather than copying for object mapping. - - You can check speed with zsmalloc benchmark: - https://github.com/spartacus06/zsmapbench - config ZSMALLOC_STAT bool "Export zsmalloc statistics" depends on ZSMALLOC diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 918c7b019b3d..cdfaaadea8ff 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -293,11 +293,7 @@ struct zspage { }; struct mapping_area { -#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING - struct vm_struct *vm; /* vm area for mapping object that span pages */ -#else char *vm_buf; /* copy buffer for objects that span pages */ -#endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; @@ -1113,54 +1109,6 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING -static inline int __zs_cpu_up(struct mapping_area *area) -{ - /* - * Make sure we don't leak memory if a cpu UP notification - * and zs_init() race and both call zs_cpu_up() on the same cpu - */ - if (area->vm) - return 0; - area->vm = get_vm_area(PAGE_SIZE * 2, 0); - if (!area->vm) - return -ENOMEM; - - /* - * Populate ptes in advance to avoid pte allocation with GFP_KERNEL - * in non-preemtible context of zs_map_object. 
- */ - return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr, - PAGE_SIZE * 2, NULL, NULL); -} - -static inline void __zs_cpu_down(struct mapping_area *area) -{ - if (area->vm) - free_vm_area(area->vm); - area->vm = NULL; -} - -static inline void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - unsigned long addr = (unsigned long)area->vm->addr; - - BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); - area->vm_addr = area->vm->addr; - return area->vm_addr + off; -} - -static inline void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - unsigned long addr = (unsigned long)area->vm_addr; - - unmap_kernel_range(addr, PAGE_SIZE * 2); -} - -#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ - static inline int __zs_cpu_up(struct mapping_area *area) { /* @@ -1241,8 +1189,6 @@ out: pagefault_enable(); } -#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ - static int zs_cpu_prepare(unsigned int cpu) { struct mapping_area *area; -- cgit From 85261c1ff156eb60fc26c378748387f2e85c6878 Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Mon, 16 Nov 2020 14:56:11 +0200 Subject: mei: bus: add vtag support Add API to support vtag in communication on mei bus. Add mei_cldev_send_vtag, mei_cldev_recv_vtag and mei_cldev_recv_nonblock_vtag functions to allow sending a message with vtag set and to receive vtag of an incoming message. Cc: Sean Z Huang Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20201116125612.1660971-1-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/bus-fixup.c | 13 +++--- drivers/misc/mei/bus.c | 101 +++++++++++++++++++++++++++++++++++-------- drivers/misc/mei/client.c | 6 ++- drivers/misc/mei/mei_dev.h | 4 +- include/linux/mei_cl_bus.h | 6 +++ 5 files changed, 104 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c index 4e30fa98fe7d..6cc3145bb716 100644 --- a/drivers/misc/mei/bus-fixup.c +++ b/drivers/misc/mei/bus-fixup.c @@ -148,7 +148,7 @@ static int mei_osver(struct mei_cl_device *cldev) os_ver = (struct mei_os_ver *)fwcaps->data; os_ver->os_type = OSTYPE_LINUX; - return __mei_cl_send(cldev->cl, buf, size, mode); + return __mei_cl_send(cldev->cl, buf, size, 0, mode); } #define MKHI_FWVER_BUF_LEN (sizeof(struct mkhi_msg_hdr) + \ @@ -169,7 +169,7 @@ static int mei_fwver(struct mei_cl_device *cldev) req.hdr.group_id = MKHI_GEN_GROUP_ID; req.hdr.command = MKHI_GEN_GET_FW_VERSION_CMD; - ret = __mei_cl_send(cldev->cl, (u8 *)&req, sizeof(req), + ret = __mei_cl_send(cldev->cl, (u8 *)&req, sizeof(req), 0, MEI_CL_IO_TX_BLOCKING); if (ret < 0) { dev_err(&cldev->dev, "Could not send ReqFWVersion cmd\n"); @@ -177,7 +177,7 @@ static int mei_fwver(struct mei_cl_device *cldev) } ret = 0; - bytes_recv = __mei_cl_recv(cldev->cl, buf, sizeof(buf), 0, + bytes_recv = __mei_cl_recv(cldev->cl, buf, sizeof(buf), NULL, 0, MKHI_RCV_TIMEOUT); if (bytes_recv < 0 || (size_t)bytes_recv < MKHI_FWVER_LEN(1)) { /* @@ -324,13 +324,15 @@ static int mei_nfc_if_version(struct mei_cl *cl, }; struct mei_nfc_reply *reply = NULL; size_t if_version_length; + u8 vtag; int bytes_recv, ret; bus = cl->dev; WARN_ON(mutex_is_locked(&bus->device_lock)); - ret = __mei_cl_send(cl, (u8 *)&cmd, sizeof(cmd), MEI_CL_IO_TX_BLOCKING); + ret = __mei_cl_send(cl, (u8 *)&cmd, sizeof(cmd), 0, + MEI_CL_IO_TX_BLOCKING); if (ret < 0) { dev_err(bus->dev, "Could not send IF version 
cmd\n"); return ret; @@ -344,7 +346,8 @@ static int mei_nfc_if_version(struct mei_cl *cl, return -ENOMEM; ret = 0; - bytes_recv = __mei_cl_recv(cl, (u8 *)reply, if_version_length, 0, 0); + bytes_recv = __mei_cl_recv(cl, (u8 *)reply, if_version_length, &vtag, + 0, 0); if (bytes_recv < 0 || (size_t)bytes_recv < if_version_length) { dev_err(bus->dev, "Could not read IF version\n"); ret = -EIO; diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index 7fe48baa103a..2907db260fba 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -26,11 +26,12 @@ * @cl: host client * @buf: buffer to send * @length: buffer length + * @vtag: virtual tag * @mode: sending mode * * Return: written size bytes or < 0 on error */ -ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length, +ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length, u8 vtag, unsigned int mode) { struct mei_device *bus; @@ -86,6 +87,7 @@ ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length, rets = -ENOMEM; goto out; } + cb->vtag = vtag; cb->internal = !!(mode & MEI_CL_IO_TX_INTERNAL); cb->blocking = !!(mode & MEI_CL_IO_TX_BLOCKING); @@ -106,11 +108,12 @@ out: * @buf: buffer to receive * @length: buffer length * @mode: io mode + * @vtag: virtual tag * @timeout: recv timeout, 0 for infinite timeout * * Return: read size in bytes of < 0 on error */ -ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length, +ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length, u8 *vtag, unsigned int mode, unsigned long timeout) { struct mei_device *bus; @@ -196,6 +199,8 @@ copy: r_length = min_t(size_t, length, cb->buf_idx); memcpy(buf, cb->buf.data, r_length); rets = r_length; + if (vtag) + *vtag = cb->vtag; free: mei_cl_del_rd_completed(cl, cb); @@ -206,40 +211,87 @@ out: } /** - * mei_cldev_send - me device send (write) + * mei_cldev_send_vtag - me device send with vtag (write) * * @cldev: me client device * @buf: buffer to send * @length: buffer length + * @vtag: virtual tag * - * Return: written size in bytes or < 0 on error + * Return: + * * written size in bytes + * * < 0 on error */ -ssize_t mei_cldev_send(struct mei_cl_device *cldev, u8 *buf, size_t length) + +ssize_t mei_cldev_send_vtag(struct mei_cl_device *cldev, u8 *buf, size_t length, + u8 vtag) { struct mei_cl *cl = cldev->cl; - return __mei_cl_send(cl, buf, length, MEI_CL_IO_TX_BLOCKING); + return __mei_cl_send(cl, buf, length, vtag, MEI_CL_IO_TX_BLOCKING); } -EXPORT_SYMBOL_GPL(mei_cldev_send); +EXPORT_SYMBOL_GPL(mei_cldev_send_vtag); /** - * mei_cldev_recv_nonblock - non block client receive (read) + * mei_cldev_recv_vtag - client receive with vtag (read) * * @cldev: me client device * @buf: buffer to receive * @length: buffer length + * @vtag: virtual tag * - * Return: read size in bytes of < 0 on error - * -EAGAIN if function will block. 
+ * Return: + * * read size in bytes + * * < 0 on error */ -ssize_t mei_cldev_recv_nonblock(struct mei_cl_device *cldev, u8 *buf, - size_t length) + +ssize_t mei_cldev_recv_vtag(struct mei_cl_device *cldev, u8 *buf, size_t length, + u8 *vtag) { struct mei_cl *cl = cldev->cl; - return __mei_cl_recv(cl, buf, length, MEI_CL_IO_RX_NONBLOCK, 0); + return __mei_cl_recv(cl, buf, length, vtag, 0, 0); } -EXPORT_SYMBOL_GPL(mei_cldev_recv_nonblock); +EXPORT_SYMBOL_GPL(mei_cldev_recv_vtag); + +/** + * mei_cldev_recv_nonblock_vtag - non block client receive with vtag (read) + * + * @cldev: me client device + * @buf: buffer to receive + * @length: buffer length + * @vtag: virtual tag + * + * Return: + * * read size in bytes + * * -EAGAIN if function will block. + * * < 0 on other error + */ +ssize_t mei_cldev_recv_nonblock_vtag(struct mei_cl_device *cldev, u8 *buf, + size_t length, u8 *vtag) +{ + struct mei_cl *cl = cldev->cl; + + return __mei_cl_recv(cl, buf, length, vtag, MEI_CL_IO_RX_NONBLOCK, 0); +} +EXPORT_SYMBOL_GPL(mei_cldev_recv_nonblock_vtag); + +/** + * mei_cldev_send - me device send (write) + * + * @cldev: me client device + * @buf: buffer to send + * @length: buffer length + * + * Return: + * * written size in bytes + * * < 0 on error + */ +ssize_t mei_cldev_send(struct mei_cl_device *cldev, u8 *buf, size_t length) +{ + return mei_cldev_send_vtag(cldev, buf, length, 0); +} +EXPORT_SYMBOL_GPL(mei_cldev_send); /** * mei_cldev_recv - client receive (read) @@ -252,12 +304,27 @@ EXPORT_SYMBOL_GPL(mei_cldev_recv_nonblock); */ ssize_t mei_cldev_recv(struct mei_cl_device *cldev, u8 *buf, size_t length) { - struct mei_cl *cl = cldev->cl; - - return __mei_cl_recv(cl, buf, length, 0, 0); + return mei_cldev_recv_vtag(cldev, buf, length, NULL); } EXPORT_SYMBOL_GPL(mei_cldev_recv); +/** + * mei_cldev_recv_nonblock - non block client receive (read) + * + * @cldev: me client device + * @buf: buffer to receive + * @length: buffer length + * + * Return: read size in bytes of < 0 on error + * -EAGAIN if function will block. 
+ */ +ssize_t mei_cldev_recv_nonblock(struct mei_cl_device *cldev, u8 *buf, + size_t length) +{ + return mei_cldev_recv_nonblock_vtag(cldev, buf, length, NULL); +} +EXPORT_SYMBOL_GPL(mei_cldev_recv_nonblock); + /** * mei_cl_bus_rx_work - dispatch rx event for a bus device * diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c index d5c3f7d54634..a56d41321f32 100644 --- a/drivers/misc/mei/client.c +++ b/drivers/misc/mei/client.c @@ -1306,7 +1306,7 @@ struct mei_cl_vtag *mei_cl_vtag_alloc(struct file *fp, u8 vtag) * mei_cl_fp_by_vtag - obtain the file pointer by vtag * * @cl: host client - * @vtag: vm tag + * @vtag: virtual tag * * Return: * * A file pointer - on success @@ -1317,7 +1317,9 @@ const struct file *mei_cl_fp_by_vtag(const struct mei_cl *cl, u8 vtag) struct mei_cl_vtag *vtag_l; list_for_each_entry(vtag_l, &cl->vtag_map, list) - if (vtag_l->vtag == vtag) + /* The client on bus has one fixed fp */ + if ((cl->cldev && mei_cldev_enabled(cl->cldev)) || + vtag_l->vtag == vtag) return vtag_l->fp; return ERR_PTR(-ENOENT); diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h index 2f4cc1a8aae8..8c395bfdf6f3 100644 --- a/drivers/misc/mei/mei_dev.h +++ b/drivers/misc/mei/mei_dev.h @@ -340,9 +340,9 @@ struct mei_hw_ops { /* MEI bus API*/ void mei_cl_bus_rescan_work(struct work_struct *work); void mei_cl_bus_dev_fixup(struct mei_cl_device *dev); -ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length, +ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length, u8 vtag, unsigned int mode); -ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length, +ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length, u8 *vtag, unsigned int mode, unsigned long timeout); bool mei_cl_bus_rx_event(struct mei_cl *cl); bool mei_cl_bus_notify_event(struct mei_cl *cl); diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index 52aa4821093a..959ad7d850b4 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -95,6 +95,12 @@ ssize_t mei_cldev_send(struct mei_cl_device *cldev, u8 *buf, size_t length); ssize_t mei_cldev_recv(struct mei_cl_device *cldev, u8 *buf, size_t length); ssize_t mei_cldev_recv_nonblock(struct mei_cl_device *cldev, u8 *buf, size_t length); +ssize_t mei_cldev_send_vtag(struct mei_cl_device *cldev, u8 *buf, size_t length, + u8 vtag); +ssize_t mei_cldev_recv_vtag(struct mei_cl_device *cldev, u8 *buf, size_t length, + u8 *vtag); +ssize_t mei_cldev_recv_nonblock_vtag(struct mei_cl_device *cldev, u8 *buf, + size_t length, u8 *vtag); int mei_cldev_register_rx_cb(struct mei_cl_device *cldev, mei_cldev_cb_t rx_cb); int mei_cldev_register_notif_cb(struct mei_cl_device *cldev, -- cgit From 76437b340b242fd21952f54ba8965d21a1ffa8c8 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 7 Dec 2020 10:16:01 +0100 Subject: earlycon: drop semicolon from earlycon macro Drop the trailing semicolon from the OF_EARLYCON_DECLARE() macro definition which was left when removing the array-of-pointer indirection. 
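To see why the trailing semicolon is a problem, consider a minimal sketch (hypothetical macro names, not the kernel ones): declaration-style macros are conventionally terminated by the caller, so a semicolon inside the macro body expands to a stray empty declaration at file scope, which -Wpedantic diagnoses.

struct thing { int x; };

/* Old style: semicolon baked into the macro body. */
#define DECLARE_THING_OLD(name) struct thing name = { 0 };

/* New style: the caller's semicolon terminates the declaration. */
#define DECLARE_THING_NEW(name) struct thing name = { 0 }

DECLARE_THING_OLD(a);	/* expands to "struct thing a = { 0 };;" - stray ';' */
DECLARE_THING_NEW(b);	/* expands to "struct thing b = { 0 };" - exactly one */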
Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20201207091601.5202-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 3e32b788c28d..e1b684e33841 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -372,7 +372,7 @@ extern const struct earlycon_id __earlycon_table_end[]; __aligned(__alignof__(struct earlycon_id)) \ = { .name = __stringify(_name), \ .compatible = compat, \ - .setup = fn }; + .setup = fn } #define EARLYCON_DECLARE(_name, fn) OF_EARLYCON_DECLARE(_name, "", fn) -- cgit From 2d26c716fc49f41a63e1efe8f1f772b0adeaacef Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 7 Dec 2020 10:13:08 +0100 Subject: module: drop semicolon from version macro Drop the trailing semicolon from the MODULE_VERSION() macro definition which was left when removing the array-of-pointer indirection. Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 5958075ea3f4..e7a619c2457e 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -279,7 +279,7 @@ extern typeof(name) __mod_##type##__##name##_device_table \ }, \ .module_name = KBUILD_MODNAME, \ .version = _version, \ - }; + } #endif /* Optional firmware file (or files) needed by the module -- cgit From c69942bda5152d764ee7d897d1627d64c7177ea1 Mon Sep 17 00:00:00 2001 From: Jonathan Neuschäfer Date: Mon, 30 Nov 2020 16:24:15 +0100 Subject: mtd: spi-nor: Fix multiple typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a few typos in comments in the SPI NOR framework; fix them. Signed-off-by: Jonathan Neuschäfer Signed-off-by: Vignesh Raghavendra Reviewed-by: Tudor Ambarus Link: https://lore.kernel.org/r/20201130152416.1283972-1-j.neuschaefer@gmx.net --- drivers/mtd/spi-nor/core.c | 4 ++-- drivers/mtd/spi-nor/sfdp.c | 2 +- include/linux/mtd/spi-nor.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index 5bee7c8da4dc..7b4850f94223 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -1599,7 +1599,7 @@ destroy_erase_cmd_list: /* * Erase an address range on the nor chip. The address range may extend - * one or more erase sectors. Return an error is there is a problem erasing. + * one or more erase sectors. Return an error if there is a problem erasing. */ static int spi_nor_erase(struct mtd_info *mtd, struct erase_info *instr) { @@ -2693,7 +2693,7 @@ spi_nor_select_uniform_erase(struct spi_nor_erase_map *map, } /* - * Otherwise, the current erase size is still a valid canditate. + * Otherwise, the current erase size is still a valid candidate. * Select the biggest valid candidate. */ if (!erase && tested_erase->size) diff --git a/drivers/mtd/spi-nor/sfdp.c b/drivers/mtd/spi-nor/sfdp.c index 22cb519efe3f..6ee7719e5903 100644 --- a/drivers/mtd/spi-nor/sfdp.c +++ b/drivers/mtd/spi-nor/sfdp.c @@ -65,7 +65,7 @@ struct sfdp_bfpt_read { struct sfdp_bfpt_erase { /* - * The half-word at offset in DWORD encodes the + * The half-word at offset in DWORD encodes the * op code and erase sector size to be used by Sector Erase commands. 
*/ u32 dword; diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 299685d15dc2..d13958de6d8a 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -433,7 +433,7 @@ static inline struct device_node *spi_nor_get_flash_node(struct spi_nor *nor) * @name: the chip type name * @hwcaps: the hardware capabilities supported by the controller driver * - * The drivers can use this fuction to scan the SPI NOR. + * The drivers can use this function to scan the SPI NOR. * In the scanning, it will try to get all the necessary information to * fill the mtd_info{} and the spi_nor{}. * -- cgit From 0eba770790426553f45b8643bcd77b854e045057 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 5 Nov 2020 20:27:45 +0100 Subject: clk: composite: add devm_clk_hw_register_composite_pdata() This will simplify drivers which would only unregister the clk in their remove() op. Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20201105192746.19564-3-michael@walle.cc Signed-off-by: Stephen Boyd --- drivers/clk/clk-composite.c | 50 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 7 +++++++ 2 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index 2ddb54f7d3ab..0506046a5f4b 100644 --- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -4,6 +4,7 @@ */ #include +#include #include #include @@ -405,3 +406,52 @@ void clk_hw_unregister_composite(struct clk_hw *hw) kfree(composite); } EXPORT_SYMBOL_GPL(clk_hw_unregister_composite); + +static void devm_clk_hw_release_composite(struct device *dev, void *res) +{ + clk_hw_unregister_composite(*(struct clk_hw **)res); +} + +static struct clk_hw *__devm_clk_hw_register_composite(struct device *dev, + const char *name, const char * const *parent_names, + const struct clk_parent_data *pdata, int num_parents, + struct clk_hw *mux_hw, const struct clk_ops *mux_ops, + struct clk_hw *rate_hw, const struct clk_ops *rate_ops, + struct clk_hw *gate_hw, const struct clk_ops *gate_ops, + unsigned long flags) +{ + struct clk_hw **ptr, *hw; + + ptr = devres_alloc(devm_clk_hw_release_composite, sizeof(*ptr), + GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + hw = __clk_hw_register_composite(dev, name, parent_names, pdata, + num_parents, mux_hw, mux_ops, rate_hw, + rate_ops, gate_hw, gate_ops, flags); + + if (!IS_ERR(hw)) { + *ptr = hw; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return hw; +} + +struct clk_hw *devm_clk_hw_register_composite_pdata(struct device *dev, + const char *name, + const struct clk_parent_data *parent_data, + int num_parents, + struct clk_hw *mux_hw, const struct clk_ops *mux_ops, + struct clk_hw *rate_hw, const struct clk_ops *rate_ops, + struct clk_hw *gate_hw, const struct clk_ops *gate_ops, + unsigned long flags) +{ + return __devm_clk_hw_register_composite(dev, name, NULL, parent_data, + num_parents, mux_hw, mux_ops, + rate_hw, rate_ops, gate_hw, + gate_ops, flags); +} diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 03a5de5f99f4..33db52ff83a0 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1062,6 +1062,13 @@ struct clk_hw *clk_hw_register_composite_pdata(struct device *dev, struct clk_hw *rate_hw, const struct clk_ops *rate_ops, struct clk_hw *gate_hw, const struct clk_ops *gate_ops, unsigned long flags); +struct clk_hw *devm_clk_hw_register_composite_pdata(struct device *dev, + const char *name, 
const struct clk_parent_data *parent_data, + int num_parents, + struct clk_hw *mux_hw, const struct clk_ops *mux_ops, + struct clk_hw *rate_hw, const struct clk_ops *rate_ops, + struct clk_hw *gate_hw, const struct clk_ops *gate_ops, + unsigned long flags); void clk_hw_unregister_composite(struct clk_hw *hw); struct clk *clk_register(struct device *dev, struct clk_hw *hw); -- cgit From d9a9280a0d0ae51dc1d4142138b99242b7ec8ac6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:10:58 +0100 Subject: seq_buf: Avoid type mismatch for seq_buf_init Building with W=2 prints a number of warnings for one function that has a pointer type mismatch: linux/seq_buf.h: In function 'seq_buf_init': linux/seq_buf.h:35:12: warning: pointer targets in assignment from 'unsigned char *' to 'char *' differ in signedness [-Wpointer-sign] Change the type in the function prototype according to the type in the structure. Link: https://lkml.kernel.org/r/20201026161108.3707783-1-arnd@kernel.org Fixes: 9a7777935c34 ("tracing: Convert seq_buf fields to be like seq_file fields") Reviewed-by: Cezary Rojewski Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- include/linux/seq_buf.h | 2 +- include/linux/trace_seq.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index fb0205d87d3c..9d6c28cc4d8f 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -30,7 +30,7 @@ static inline void seq_buf_clear(struct seq_buf *s) } static inline void -seq_buf_init(struct seq_buf *s, unsigned char *buf, unsigned int size) +seq_buf_init(struct seq_buf *s, char *buf, unsigned int size) { s->buffer = buf; s->size = size; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 6c30508fca19..5a2c650d9e1c 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -12,7 +12,7 @@ */ struct trace_seq { - unsigned char buffer[PAGE_SIZE]; + char buffer[PAGE_SIZE]; struct seq_buf seq; int full; }; @@ -51,7 +51,7 @@ static inline int trace_seq_used(struct trace_seq *s) * that is about to be written to and then return the result * of that write. */ -static inline unsigned char * +static inline char * trace_seq_buffer_ptr(struct trace_seq *s) { return s->buffer + seq_buf_used(&s->seq); -- cgit From 661d4f55a79483aee4970a76e3bd9d4cdc74ac79 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 22 Nov 2020 15:35:46 +0000 Subject: sbitmap: remove swap_lock map->swap_lock protects map->cleared from concurrent modification; however, sbitmap_deferred_clear() already drains it atomically, so it's guaranteed not to lose bits on concurrent sbitmap_deferred_clear(). A single-threaded, tag-heavy test on top of null_blk showed a ~1.5% throughput increase and a 3% -> 1% reduction of cycles spent in sbitmap_get() according to perf.
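A rough userspace analogue of the now lock-free drain, using C11 atomics (a simplified sketch, not the kernel code): the atomic exchange guarantees that two racing callers can never both observe the same cleared bits, which is why the spinlock can go.

#include <stdatomic.h>
#include <stdbool.h>

struct sb_word {
	_Atomic unsigned long word;	/* allocated bits */
	_Atomic unsigned long cleared;	/* lazily cleared bits */
};

static bool deferred_clear(struct sb_word *map)
{
	unsigned long mask, val;

	if (!atomic_load_explicit(&map->cleared, memory_order_relaxed))
		return false;

	/*
	 * Snapshot and zero the cleared mask in one atomic step: two
	 * racing callers can never both see the same bits, so none are
	 * lost or applied twice, without any lock being held.
	 */
	mask = atomic_exchange(&map->cleared, 0UL);

	/* Fold the drained bits back into the allocation word. */
	val = atomic_load(&map->word);
	while (!atomic_compare_exchange_weak(&map->word, &val, val & ~mask))
		;

	return true;
}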
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 5 ----- lib/sbitmap.c | 14 +++----------- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index e40d019c3d9d..74cc6384715e 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -32,11 +32,6 @@ struct sbitmap_word { * @cleared: word holding cleared bits */ unsigned long cleared ____cacheline_aligned_in_smp; - - /** - * @swap_lock: Held while swapping word <-> cleared - */ - spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** diff --git a/lib/sbitmap.c b/lib/sbitmap.c index c1c8a4e69325..4fd877048ba8 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -15,13 +15,9 @@ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) { unsigned long mask, val; - bool ret = false; - unsigned long flags; - spin_lock_irqsave(&map->swap_lock, flags); - - if (!map->cleared) - goto out_unlock; + if (!READ_ONCE(map->cleared)) + return false; /* * First get a stable cleared mask, setting the old mask to 0. @@ -35,10 +31,7 @@ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) val = map->word; } while (cmpxchg(&map->word, val, val & ~mask) != val); - ret = true; -out_unlock: - spin_unlock_irqrestore(&map->swap_lock, flags); - return ret; + return true; } int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, @@ -80,7 +73,6 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, for (i = 0; i < sb->map_nr; i++) { sb->map[i].depth = min(depth, bits_per_word); depth -= sb->map[i].depth; - spin_lock_init(&sb->map[i].swap_lock); } return 0; } -- cgit From 26792699fe3681102aa85f4ae6d39e80a6a7e6b6 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Sun, 8 Nov 2020 19:51:09 +0100 Subject: clk: divider: add devm_clk_hw_register_divider_table() This will simplify drivers which would only unregister the clk in their remove() op. 
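As a usage sketch, a hypothetical driver (all foo_* names invented, register layout assumed) can now register the divider in probe() and rely on devres for cleanup, with no remove() callback at all; this is a sketch under those assumptions, not code from the patch:

#include <linux/clk-provider.h>
#include <linux/platform_device.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(foo_lock);

static const struct clk_div_table foo_div_table[] = {
	{ .val = 0, .div = 1 },
	{ .val = 1, .div = 2 },
	{ .val = 2, .div = 4 },
	{ /* sentinel */ }
};

static int foo_probe(struct platform_device *pdev)
{
	void __iomem *base = devm_platform_ioremap_resource(pdev, 0);
	struct clk_hw *hw;

	if (IS_ERR(base))
		return PTR_ERR(base);

	/* Unregistered automatically on driver detach: no remove() needed. */
	hw = devm_clk_hw_register_divider_table(&pdev->dev, "foo_div",
						"foo_parent", 0, base + 0x10,
						0, 3, 0, foo_div_table,
						&foo_lock);
	return PTR_ERR_OR_ZERO(hw);
}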
Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20201108185113.31377-6-michael@walle.cc Signed-off-by: Stephen Boyd --- drivers/clk/clk-divider.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 27 +++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index 8de12cb0c43d..c499799693cc 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -578,3 +579,36 @@ void clk_hw_unregister_divider(struct clk_hw *hw) kfree(div); } EXPORT_SYMBOL_GPL(clk_hw_unregister_divider); + +static void devm_clk_hw_release_divider(struct device *dev, void *res) +{ + clk_hw_unregister_divider(*(struct clk_hw **)res); +} + +struct clk_hw *__devm_clk_hw_register_divider(struct device *dev, + struct device_node *np, const char *name, + const char *parent_name, const struct clk_hw *parent_hw, + const struct clk_parent_data *parent_data, unsigned long flags, + void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, + const struct clk_div_table *table, spinlock_t *lock) +{ + struct clk_hw **ptr, *hw; + + ptr = devres_alloc(devm_clk_hw_release_divider, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + hw = __clk_hw_register_divider(dev, np, name, parent_name, parent_hw, + parent_data, flags, reg, shift, width, + clk_divider_flags, table, lock); + + if (!IS_ERR(hw)) { + *ptr = hw; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return hw; +} +EXPORT_SYMBOL_GPL(__devm_clk_hw_register_divider); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 33db52ff83a0..5f896df01f83 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -639,6 +639,12 @@ struct clk_hw *__clk_hw_register_divider(struct device *dev, const struct clk_parent_data *parent_data, unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, const struct clk_div_table *table, spinlock_t *lock); +struct clk_hw *__devm_clk_hw_register_divider(struct device *dev, + struct device_node *np, const char *name, + const char *parent_name, const struct clk_hw *parent_hw, + const struct clk_parent_data *parent_data, unsigned long flags, + void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, + const struct clk_div_table *table, spinlock_t *lock); struct clk *clk_register_divider_table(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 shift, u8 width, @@ -779,6 +785,27 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name, (parent_data), (flags), (reg), (shift), \ (width), (clk_divider_flags), (table), \ (lock)) +/** + * devm_clk_hw_register_divider_table - register a table based divider clock + * with the clock framework (devres variant) + * @dev: device registering this clock + * @name: name of this clock + * @parent_name: name of clock's parent + * @flags: framework-specific flags + * @reg: register address to adjust divider + * @shift: number of bits to shift the bitfield + * @width: width of the bitfield + * @clk_divider_flags: divider-specific flags for this clock + * @table: array of divider/value pairs ending with a div set to 0 + * @lock: shared register lock for this clock + */ +#define devm_clk_hw_register_divider_table(dev, name, parent_name, flags, \ + reg, shift, width, \ + clk_divider_flags, table, lock) \ + 
__devm_clk_hw_register_divider((dev), NULL, (name), (parent_name), \ + NULL, NULL, (flags), (reg), (shift), \ + (width), (clk_divider_flags), (table), \ + (lock)) void clk_unregister_divider(struct clk *clk); void clk_hw_unregister_divider(struct clk_hw *hw); -- cgit From 374a96b9600ccf60083c0fec8f727e04752a7f0c Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 6 Dec 2020 11:12:54 +0200 Subject: net/mlx4: Remove unused #define MAX_MSIX_P_PORT All usages of the definition MAX_MSIX_P_PORT were removed. It's not in use anymore. Remove it. Signed-off-by: Tariq Toukan Reviewed-by: Moshe Shemesh Link: https://lore.kernel.org/r/20201206091254.12476-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx4/device.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 06e066e04a4b..236a7d04f891 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -46,7 +46,6 @@ #define DEFAULT_UAR_PAGE_SHIFT 12 -#define MAX_MSIX_P_PORT 17 #define MAX_MSIX 128 #define MIN_MSIX_P_PORT 5 #define MLX4_IS_LEGACY_EQ_MODE(dev_cap) ((dev_cap).num_comp_vectors < \ -- cgit From fb01a2932e81a1fb2273f87ff92dc8172b8880ee Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 3 Dec 2020 09:26:36 +0800 Subject: blk-mq: add new API of blk_mq_hctx_set_fq_lock_class flush_end_io() may be called recursively from some driver, such as nvme-loop, so lockdep may complain 'possible recursive locking'. Commit b3c6a5997541 ("block: Fix a lockdep complaint triggered by request queue flushing") tried to address this issue by assigning a dynamically allocated per-flush-queue lock class. This solution adds a synchronize_rcu() to each hctx's release handler and causes a horrible SCSI MQ probe delay (more than half an hour on megaraid sas). Add a new API, blk_mq_hctx_set_fq_lock_class(), for these drivers, so we just need to use a driver-specific lock class to avoid the lockdep warning of 'possible recursive locking'. Tested-by: Kashyap Desai Reported-by: Qian Cai Cc: Sumit Saxena Cc: John Garry Cc: Kashyap Desai Cc: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-flush.c | 25 +++++++++++++++++++++++++ include/linux/blk-mq.h | 3 +++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/block/blk-flush.c b/block/blk-flush.c index 9507dcdd5881..bf51588762d8 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -490,3 +490,28 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) kfree(fq->flush_rq); kfree(fq); } + +/* + * Allow driver to set its own lock class to fq->mq_flush_lock for + * avoiding lockdep complaint. + * + * flush_end_io() may be called recursively from some driver, such as + * nvme-loop, so lockdep may complain 'possible recursive locking' because + * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class + * key. We need to assign different lock class for these driver's + * fq->mq_flush_lock for avoiding the lockdep warning. + * + * Use dynamically allocated lock class key for each 'blk_flush_queue' + * instance is over-kill, and more worse it introduces horrible boot delay + * issue because synchronize_rcu() is implied in lockdep_unregister_key which + * is called for each hctx release. SCSI probing may synchronously create and + * destroy lots of MQ request_queues for non-existent devices, and some robot + * test kernel always enable lockdep option.
It is observed that more than half + * an hour is taken during SCSI MQ probe with per-fq lock class. */ +void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, + struct lock_class_key *key) +{ + lockdep_set_class(&hctx->fq->mq_flush_lock, key); +} +EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 794b2a33a2c3..5f639240760e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -5,6 +5,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -594,5 +595,7 @@ static inline void blk_mq_cleanup_rq(struct request *rq) } blk_qc_t blk_mq_submit_bio(struct bio *bio); +void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, + struct lock_class_key *key); #endif -- cgit From 76ea4d8eeefbfdd37e47c6fd579d0d5852457618 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 24 Nov 2020 10:43:45 +0000 Subject: firmware: arm_scmi: Add power_scale_mw_get() interface Add a new interface to the existing perf_ops and export the information about the power values scale. This would be used by the cpufreq driver and Energy Model framework to set the performance domains scale: milli-Watts or abstract scale. Suggested-by: Morten Rasmussen Reviewed-by: Cristian Marussi Signed-off-by: Lukasz Luba Acked-by: Sudeep Holla Signed-off-by: Viresh Kumar --- drivers/firmware/arm_scmi/perf.c | 8 ++++++++ include/linux/scmi_protocol.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c index 82fb3babff72..e374b1125fca 100644 --- a/drivers/firmware/arm_scmi/perf.c +++ b/drivers/firmware/arm_scmi/perf.c @@ -750,6 +750,13 @@ static bool scmi_fast_switch_possible(const struct scmi_handle *handle, return dom->fc_info && dom->fc_info->level_set_addr; } +static bool scmi_power_scale_mw_get(const struct scmi_handle *handle) +{ + struct scmi_perf_info *pi = handle->perf_priv; + + return pi->power_scale_mw; +} + static const struct scmi_perf_ops perf_ops = { .limits_set = scmi_perf_limits_set, .limits_get = scmi_perf_limits_get, @@ -762,6 +769,7 @@ static const struct scmi_perf_ops perf_ops = { .freq_get = scmi_dvfs_freq_get, .est_power_get = scmi_dvfs_est_power_get, .fast_switch_possible = scmi_fast_switch_possible, + .power_scale_mw_get = scmi_power_scale_mw_get, }; static int scmi_perf_set_notify_enabled(const struct scmi_handle *handle, diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 9cd312a1ff92..c77e4e11e788 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -121,6 +121,7 @@ struct scmi_perf_ops { unsigned long *rate, unsigned long *power); bool (*fast_switch_possible)(const struct scmi_handle *handle, struct device *dev); + bool (*power_scale_mw_get)(const struct scmi_handle *handle); }; /** -- cgit From cc00bcaa589914096edef7fb87ca5cee4a166b5c Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 25 Nov 2020 11:27:22 -0700 Subject: netfilter: x_tables: Switch synchronization to RCU When running concurrent iptables rule replacements with data in flight, the per-CPU sequence count is checked after the assignment of the new information. The sequence count is used to synchronize with the packet path without the use of any explicit locking. If there are any packets in the packet path using the table information, the sequence count is incremented to an odd value and then back to an even value once packet processing completes.
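Schematically, the old (pre-RCU) protocol looks like this; a simplified sketch assembled from the hunks removed below, not new code:

	/* Packet path, per CPU: mark the traversal window with an odd count. */
	addend = xt_write_recseq_begin();	/* seq: even -> odd */
	private = table->private;
	/* ... evaluate rules against the packet ... */
	xt_write_recseq_end(addend);		/* seq: odd -> even */

	/* Replacement path: publish the new info, then wait out odd readers. */
	table->private = newinfo;
	smp_wmb();
	for_each_possible_cpu(cpu) {
		seqcount_t *s = &per_cpu(xt_recseq, cpu);
		u32 seq = raw_read_seqcount(s);

		if (seq & 1)			/* CPU is mid-traversal... */
			while (seq == raw_read_seqcount(s))
				cpu_relax();	/* ...spin until it finishes */
	}
	/* Now no CPU can still reference the old table info; free it. */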
The new table value assignment is followed by a write memory barrier so every CPU should see the latest value. If the packet path has started with the old table information, the sequence counter will be odd and the iptables replacement will wait till the sequence count is even prior to freeing the old table info. However, this assumes that the new table information assignment and the memory barrier is actually executed prior to the counter check in the replacement thread. If CPU decides to execute the assignment later as there is no user of the table information prior to the sequence check, the packet path in another CPU may use the old table information. The replacement thread would then free the table information under it leading to a use after free in the packet processing context- Unable to handle kernel NULL pointer dereference at virtual address 000000000000008e pc : ip6t_do_table+0x5d0/0x89c lr : ip6t_do_table+0x5b8/0x89c ip6t_do_table+0x5d0/0x89c ip6table_filter_hook+0x24/0x30 nf_hook_slow+0x84/0x120 ip6_input+0x74/0xe0 ip6_rcv_finish+0x7c/0x128 ipv6_rcv+0xac/0xe4 __netif_receive_skb+0x84/0x17c process_backlog+0x15c/0x1b8 napi_poll+0x88/0x284 net_rx_action+0xbc/0x23c __do_softirq+0x20c/0x48c This could be fixed by forcing instruction order after the new table information assignment or by switching to RCU for the synchronization. Fixes: 80055dab5de0 ("netfilter: x_tables: make xt_replace_table wait until old rules are not used anymore") Reported-by: Sean Tranchetti Reported-by: kernel test robot Suggested-by: Florian Westphal Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 5 +++- net/ipv4/netfilter/arp_tables.c | 14 +++++------ net/ipv4/netfilter/ip_tables.c | 14 +++++------ net/ipv6/netfilter/ip6_tables.c | 14 +++++------ net/netfilter/x_tables.c | 49 ++++++++++++-------------------------- 5 files changed, 40 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 5deb099d156d..8ebb64193757 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -227,7 +227,7 @@ struct xt_table { unsigned int valid_hooks; /* Man behind the curtain... */ - struct xt_table_info *private; + struct xt_table_info __rcu *private; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; @@ -448,6 +448,9 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); +struct xt_table_info +*xt_table_get_private_protected(const struct xt_table *table); + #ifdef CONFIG_COMPAT #include diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d1e04d2b5170..563b62b76a5f 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -203,7 +203,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. 
*/ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; @@ -649,7 +649,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -673,7 +673,7 @@ static int copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct arpt_entry *e; struct xt_counters *counters; - struct xt_table_info *private = table->private; + struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; void *loc_cpu_entry; @@ -807,7 +807,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (!IS_ERR(t)) { struct arpt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -860,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1017,7 +1017,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1330,7 +1330,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f15bc21d7301..6e2851f8d3a3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -258,7 +258,7 @@ ipt_do_table(struct sk_buff *skb, WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. 
*/ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; @@ -791,7 +791,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -815,7 +815,7 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct ipt_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; const void *loc_cpu_entry; @@ -964,7 +964,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET, name); if (!IS_ERR(t)) { struct ipt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -1018,7 +1018,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1173,7 +1173,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1543,7 +1543,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 2e2119bfcf13..c4f532f4d311 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -280,7 +280,7 @@ ip6t_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. 
*/ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; @@ -807,7 +807,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -831,7 +831,7 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct ip6t_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; const void *loc_cpu_entry; @@ -980,7 +980,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET6, name); if (!IS_ERR(t)) { struct ip6t_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -1035,7 +1035,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - struct xt_table_info *private = t->private; + struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1189,7 +1189,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1552,7 +1552,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index af22dbe85e2c..acce622582e3 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1349,6 +1349,14 @@ struct xt_counters *xt_counters_alloc(unsigned int counters) } EXPORT_SYMBOL(xt_counters_alloc); +struct xt_table_info +*xt_table_get_private_protected(const struct xt_table *table) +{ + return rcu_dereference_protected(table->private, + mutex_is_locked(&xt[table->af].mutex)); +} +EXPORT_SYMBOL(xt_table_get_private_protected); + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, @@ -1356,7 +1364,6 @@ xt_replace_table(struct xt_table *table, int *error) { struct xt_table_info *private; - unsigned int cpu; int ret; ret = xt_jumpstack_alloc(newinfo); @@ -1366,47 +1373,20 @@ xt_replace_table(struct xt_table *table, } /* Do the substitution. */ - local_bh_disable(); - private = table->private; + private = xt_table_get_private_protected(table); /* Check inside lock: is the old number correct? 
*/ if (num_counters != private->number) { pr_debug("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - local_bh_enable(); *error = -EAGAIN; return NULL; } newinfo->initial_entries = private->initial_entries; - /* - * Ensure contents of newinfo are visible before assigning to - * private. - */ - smp_wmb(); - table->private = newinfo; - - /* make sure all cpus see new ->private value */ - smp_wmb(); - /* - * Even though table entries have now been swapped, other CPU's - * may still be using the old entries... - */ - local_bh_enable(); - - /* ... so wait for even xt_recseq on all cpus */ - for_each_possible_cpu(cpu) { - seqcount_t *s = &per_cpu(xt_recseq, cpu); - u32 seq = raw_read_seqcount(s); - - if (seq & 1) { - do { - cond_resched(); - cpu_relax(); - } while (seq == raw_read_seqcount(s)); - } - } + rcu_assign_pointer(table->private, newinfo); + synchronize_rcu(); audit_log_nfcfg(table->name, table->af, private->number, !private->number ? AUDIT_XT_OP_REGISTER : @@ -1442,12 +1422,12 @@ struct xt_table *xt_register_table(struct net *net, } /* Simplifies replace_table code. */ - table->private = bootstrap; + rcu_assign_pointer(table->private, bootstrap); if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; - private = table->private; + private = xt_table_get_private_protected(table); pr_debug("table->private->number = %u\n", private->number); /* save number of initial entries */ @@ -1470,7 +1450,8 @@ void *xt_unregister_table(struct xt_table *table) struct xt_table_info *private; mutex_lock(&xt[table->af].mutex); - private = table->private; + private = xt_table_get_private_protected(table); + RCU_INIT_POINTER(table->private, NULL); list_del(&table->list); mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, -- cgit From 2f24dfb71208eeb0174f08dd56ca6d3c17b279a5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 4 Dec 2020 02:34:50 +0800 Subject: iommu: Delete split_and_remove_iova() Function split_and_remove_iova() has not been referenced since commit e70b081c6f37 ("iommu/vt-d: Remove IOVA handling code from the non-dma_ops path"), so delete it. 
Signed-off-by: John Garry Link: https://lore.kernel.org/r/1607020492-189471-2-git-send-email-john.garry@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/iova.c | 41 ----------------------------------------- include/linux/iova.h | 10 ---------- 2 files changed, 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index ff59d8aa3041..3331c040b2b0 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -742,47 +742,6 @@ copy_reserved_iova(struct iova_domain *from, struct iova_domain *to) } EXPORT_SYMBOL_GPL(copy_reserved_iova); -struct iova * -split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, - unsigned long pfn_lo, unsigned long pfn_hi) -{ - unsigned long flags; - struct iova *prev = NULL, *next = NULL; - - spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); - if (iova->pfn_lo < pfn_lo) { - prev = alloc_and_init_iova(iova->pfn_lo, pfn_lo - 1); - if (prev == NULL) - goto error; - } - if (iova->pfn_hi > pfn_hi) { - next = alloc_and_init_iova(pfn_hi + 1, iova->pfn_hi); - if (next == NULL) - goto error; - } - - __cached_rbnode_delete_update(iovad, iova); - rb_erase(&iova->node, &iovad->rbroot); - - if (prev) { - iova_insert_rbtree(&iovad->rbroot, prev, NULL); - iova->pfn_lo = pfn_lo; - } - if (next) { - iova_insert_rbtree(&iovad->rbroot, next, NULL); - iova->pfn_hi = pfn_hi; - } - spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); - - return iova; - -error: - spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); - if (prev) - free_iova_mem(prev); - return NULL; -} - /* * Magazine caches for IOVA ranges. For an introduction to magazines, * see the USENIX 2001 paper "Magazines and Vmem: Extending the Slab diff --git a/include/linux/iova.h b/include/linux/iova.h index a0637abffee8..a77add571c50 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -160,8 +160,6 @@ int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); -struct iova *split_and_remove_iova(struct iova_domain *iovad, - struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi); void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); #else static inline int iova_cache_get(void) @@ -258,14 +256,6 @@ static inline void put_iova_domain(struct iova_domain *iovad) { } -static inline struct iova *split_and_remove_iova(struct iova_domain *iovad, - struct iova *iova, - unsigned long pfn_lo, - unsigned long pfn_hi) -{ - return NULL; -} - static inline void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad) { -- cgit From 51b70b817b187e48155fc492adb9ce80bdb21b70 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 4 Dec 2020 02:34:51 +0800 Subject: iommu: Stop exporting alloc_iova_mem() It is not used outside iova.c Signed-off-by: John Garry Link: https://lore.kernel.org/r/1607020492-189471-3-git-send-email-john.garry@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/iova.c | 3 +-- include/linux/iova.h | 6 ------ 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 3331c040b2b0..bb5cb67ee311 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -243,11 +243,10 @@ static struct kmem_cache *iova_cache; static unsigned int iova_cache_users; static DEFINE_MUTEX(iova_cache_mutex); -struct iova *alloc_iova_mem(void) +static struct iova 
*alloc_iova_mem(void) { return kmem_cache_zalloc(iova_cache, GFP_ATOMIC | __GFP_NOWARN); } -EXPORT_SYMBOL(alloc_iova_mem); void free_iova_mem(struct iova *iova) { diff --git a/include/linux/iova.h b/include/linux/iova.h index a77add571c50..d8c011b6d4ed 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -136,7 +136,6 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) int iova_cache_get(void); void iova_cache_put(void); -struct iova *alloc_iova_mem(void); void free_iova_mem(struct iova *iova); void free_iova(struct iova_domain *iovad, unsigned long pfn); void __free_iova(struct iova_domain *iovad, struct iova *iova); @@ -171,11 +170,6 @@ static inline void iova_cache_put(void) { } -static inline struct iova *alloc_iova_mem(void) -{ - return NULL; -} - static inline void free_iova_mem(struct iova *iova) { } -- cgit From 176cfc187c24287a363e7612cd2385ba40c2042b Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 4 Dec 2020 02:34:52 +0800 Subject: iommu: Stop exporting free_iova_mem() It has no user outside iova.c Signed-off-by: John Garry Link: https://lore.kernel.org/r/1607020492-189471-4-git-send-email-john.garry@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/iova.c | 3 +-- include/linux/iova.h | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index bb5cb67ee311..4bb3293ae4d7 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -248,12 +248,11 @@ static struct iova *alloc_iova_mem(void) return kmem_cache_zalloc(iova_cache, GFP_ATOMIC | __GFP_NOWARN); } -void free_iova_mem(struct iova *iova) +static void free_iova_mem(struct iova *iova) { if (iova->pfn_lo != IOVA_ANCHOR) kmem_cache_free(iova_cache, iova); } -EXPORT_SYMBOL(free_iova_mem); int iova_cache_get(void) { diff --git a/include/linux/iova.h b/include/linux/iova.h index d8c011b6d4ed..76e16ae20729 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -136,7 +136,6 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) int iova_cache_get(void); void iova_cache_put(void); -void free_iova_mem(struct iova *iova); void free_iova(struct iova_domain *iovad, unsigned long pfn); void __free_iova(struct iova_domain *iovad, struct iova *iova); struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, @@ -170,10 +169,6 @@ static inline void iova_cache_put(void) { } -static inline void free_iova_mem(struct iova *iova) -{ -} - static inline void free_iova(struct iova_domain *iovad, unsigned long pfn) { } -- cgit From fefe8527a1e0e0014946c6b5b3b2e40cb32bb5d3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 25 Nov 2020 17:29:39 +0000 Subject: iommu/io-pgtable: Remove tlb_flush_leaf The only user of tlb_flush_leaf is a particularly hairy corner of the Arm short-descriptor code, which wants a synchronous invalidation to minimise the races inherent in trying to split a large page mapping. This is already far enough into "here be dragons" territory that no sensible caller should ever hit it, and thus it really doesn't need optimising. Although using tlb_flush_walk there may technically be more heavyweight than needed, it does the job and saves everyone else having to carry around useless baggage. 
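After this change, a driver's flush_ops needs only three callbacks. A hypothetical skeleton (foo_* names invented, bodies elided) of what implementations now look like:

#include <linux/io-pgtable.h>

static void foo_tlb_flush_all(void *cookie)
{
	/* Invalidate the whole TLB for this domain. */
}

static void foo_tlb_flush_walk(unsigned long iova, size_t size,
			       size_t granule, void *cookie)
{
	/*
	 * Synchronously invalidate [iova, iova + size), walk cache
	 * included; this now also serves the old leaf-only case.
	 */
}

static void foo_tlb_add_page(struct iommu_iotlb_gather *gather,
			     unsigned long iova, size_t granule, void *cookie)
{
	/* Queue a single leaf invalidation to be synced later. */
}

static const struct iommu_flush_ops foo_flush_ops = {
	.tlb_flush_all	= foo_tlb_flush_all,
	.tlb_flush_walk	= foo_tlb_flush_walk,
	.tlb_add_page	= foo_tlb_add_page,
};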
Signed-off-by: Robin Murphy Reviewed-by: Steven Price Link: https://lore.kernel.org/r/9844ab0c5cb3da8b2f89c6c2da16941910702b41.1606324115.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/gpu/drm/msm/msm_iommu.c | 1 - drivers/gpu/drm/panfrost/panfrost_mmu.c | 7 ------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 7 ------- drivers/iommu/arm/arm-smmu/arm-smmu.c | 25 +++---------------------- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 8 -------- drivers/iommu/io-pgtable-arm-v7s.c | 3 +-- drivers/iommu/io-pgtable-arm.c | 1 - drivers/iommu/ipmmu-vmsa.c | 1 - drivers/iommu/msm_iommu.c | 7 ------- drivers/iommu/mtk_iommu.c | 1 - include/linux/io-pgtable.h | 11 ----------- 11 files changed, 4 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index 22ac7c692a81..50d881794758 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -139,7 +139,6 @@ static void msm_iommu_tlb_add_page(struct iommu_iotlb_gather *gather, static const struct iommu_flush_ops null_tlb_ops = { .tlb_flush_all = msm_iommu_tlb_flush_all, .tlb_flush_walk = msm_iommu_tlb_flush_walk, - .tlb_flush_leaf = msm_iommu_tlb_flush_walk, .tlb_add_page = msm_iommu_tlb_add_page, }; diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 776448c527ea..c186914cc4f9 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -347,16 +347,9 @@ static void mmu_tlb_flush_walk(unsigned long iova, size_t size, size_t granule, mmu_tlb_sync_context(cookie); } -static void mmu_tlb_flush_leaf(unsigned long iova, size_t size, size_t granule, - void *cookie) -{ - mmu_tlb_sync_context(cookie); -} - static const struct iommu_flush_ops mmu_tlb_ops = { .tlb_flush_all = mmu_tlb_inv_context_s1, .tlb_flush_walk = mmu_tlb_flush_walk, - .tlb_flush_leaf = mmu_tlb_flush_leaf, }; int panfrost_mmu_pgtable_alloc(struct panfrost_file_priv *priv) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2ddf5ecc75f8..8ca7415d785d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1760,16 +1760,9 @@ static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size, arm_smmu_tlb_inv_range(iova, size, granule, false, cookie); } -static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ - arm_smmu_tlb_inv_range(iova, size, granule, true, cookie); -} - static const struct iommu_flush_ops arm_smmu_flush_ops = { .tlb_flush_all = arm_smmu_tlb_inv_context, .tlb_flush_walk = arm_smmu_tlb_inv_walk, - .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, .tlb_add_page = arm_smmu_tlb_inv_page_nosync, }; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d8979bb71fc0..d8c6bfde6a61 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -333,14 +333,6 @@ static void arm_smmu_tlb_inv_walk_s1(unsigned long iova, size_t size, arm_smmu_tlb_sync_context(cookie); } -static void arm_smmu_tlb_inv_leaf_s1(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ - arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, - ARM_SMMU_CB_S1_TLBIVAL); - arm_smmu_tlb_sync_context(cookie); -} - static void arm_smmu_tlb_add_page_s1(struct iommu_iotlb_gather *gather, unsigned long iova, size_t granule, void *cookie) @@ -357,14 +349,6 @@ static void 
arm_smmu_tlb_inv_walk_s2(unsigned long iova, size_t size, arm_smmu_tlb_sync_context(cookie); } -static void arm_smmu_tlb_inv_leaf_s2(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ - arm_smmu_tlb_inv_range_s2(iova, size, granule, cookie, - ARM_SMMU_CB_S2_TLBIIPAS2L); - arm_smmu_tlb_sync_context(cookie); -} - static void arm_smmu_tlb_add_page_s2(struct iommu_iotlb_gather *gather, unsigned long iova, size_t granule, void *cookie) @@ -373,8 +357,8 @@ static void arm_smmu_tlb_add_page_s2(struct iommu_iotlb_gather *gather, ARM_SMMU_CB_S2_TLBIIPAS2L); } -static void arm_smmu_tlb_inv_any_s2_v1(unsigned long iova, size_t size, - size_t granule, void *cookie) +static void arm_smmu_tlb_inv_walk_s2_v1(unsigned long iova, size_t size, + size_t granule, void *cookie) { arm_smmu_tlb_inv_context_s2(cookie); } @@ -401,21 +385,18 @@ static void arm_smmu_tlb_add_page_s2_v1(struct iommu_iotlb_gather *gather, static const struct iommu_flush_ops arm_smmu_s1_tlb_ops = { .tlb_flush_all = arm_smmu_tlb_inv_context_s1, .tlb_flush_walk = arm_smmu_tlb_inv_walk_s1, - .tlb_flush_leaf = arm_smmu_tlb_inv_leaf_s1, .tlb_add_page = arm_smmu_tlb_add_page_s1, }; static const struct iommu_flush_ops arm_smmu_s2_tlb_ops_v2 = { .tlb_flush_all = arm_smmu_tlb_inv_context_s2, .tlb_flush_walk = arm_smmu_tlb_inv_walk_s2, - .tlb_flush_leaf = arm_smmu_tlb_inv_leaf_s2, .tlb_add_page = arm_smmu_tlb_add_page_s2, }; static const struct iommu_flush_ops arm_smmu_s2_tlb_ops_v1 = { .tlb_flush_all = arm_smmu_tlb_inv_context_s2, - .tlb_flush_walk = arm_smmu_tlb_inv_any_s2_v1, - .tlb_flush_leaf = arm_smmu_tlb_inv_any_s2_v1, + .tlb_flush_walk = arm_smmu_tlb_inv_walk_s2_v1, .tlb_add_page = arm_smmu_tlb_add_page_s2_v1, }; diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index b30d6c966e2c..7f280c8d5c53 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -185,13 +185,6 @@ static void qcom_iommu_tlb_flush_walk(unsigned long iova, size_t size, qcom_iommu_tlb_sync(cookie); } -static void qcom_iommu_tlb_flush_leaf(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ - qcom_iommu_tlb_inv_range_nosync(iova, size, granule, true, cookie); - qcom_iommu_tlb_sync(cookie); -} - static void qcom_iommu_tlb_add_page(struct iommu_iotlb_gather *gather, unsigned long iova, size_t granule, void *cookie) @@ -202,7 +195,6 @@ static void qcom_iommu_tlb_add_page(struct iommu_iotlb_gather *gather, static const struct iommu_flush_ops qcom_flush_ops = { .tlb_flush_all = qcom_iommu_tlb_inv_context, .tlb_flush_walk = qcom_iommu_tlb_flush_walk, - .tlb_flush_leaf = qcom_iommu_tlb_flush_leaf, .tlb_add_page = qcom_iommu_tlb_add_page, }; diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 359b96b0fa3e..1d92ac948db7 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -584,7 +584,7 @@ static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg); size *= ARM_V7S_CONT_PAGES; - io_pgtable_tlb_flush_leaf(iop, iova, size, size); + io_pgtable_tlb_flush_walk(iop, iova, size, size); return pte; } @@ -866,7 +866,6 @@ static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather, static const struct iommu_flush_ops dummy_tlb_ops __initconst = { .tlb_flush_all = dummy_tlb_flush_all, .tlb_flush_walk = dummy_tlb_flush, - .tlb_flush_leaf = dummy_tlb_flush, .tlb_add_page = dummy_tlb_add_page, }; diff --git 
a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 135f57b37bbd..49ad8d2a82df 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -1085,7 +1085,6 @@ static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather, static const struct iommu_flush_ops dummy_tlb_ops __initconst = { .tlb_flush_all = dummy_tlb_flush_all, .tlb_flush_walk = dummy_tlb_flush, - .tlb_flush_leaf = dummy_tlb_flush, .tlb_add_page = dummy_tlb_add_page, }; diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 0f18abda0e20..d71f10257f15 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -325,7 +325,6 @@ static void ipmmu_tlb_flush(unsigned long iova, size_t size, static const struct iommu_flush_ops ipmmu_flush_ops = { .tlb_flush_all = ipmmu_tlb_flush_all, .tlb_flush_walk = ipmmu_tlb_flush, - .tlb_flush_leaf = ipmmu_tlb_flush, }; /* ----------------------------------------------------------------------------- diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 3615cd6241c4..040e85f70861 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -174,12 +174,6 @@ static void __flush_iotlb_walk(unsigned long iova, size_t size, __flush_iotlb_range(iova, size, granule, false, cookie); } -static void __flush_iotlb_leaf(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ - __flush_iotlb_range(iova, size, granule, true, cookie); -} - static void __flush_iotlb_page(struct iommu_iotlb_gather *gather, unsigned long iova, size_t granule, void *cookie) { @@ -189,7 +183,6 @@ static void __flush_iotlb_page(struct iommu_iotlb_gather *gather, static const struct iommu_flush_ops msm_iommu_flush_ops = { .tlb_flush_all = __flush_iotlb, .tlb_flush_walk = __flush_iotlb_walk, - .tlb_flush_leaf = __flush_iotlb_leaf, .tlb_add_page = __flush_iotlb_page, }; diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index c072cee532c2..8e56cec532e7 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -240,7 +240,6 @@ static void mtk_iommu_tlb_flush_page_nosync(struct iommu_iotlb_gather *gather, static const struct iommu_flush_ops mtk_iommu_flush_ops = { .tlb_flush_all = mtk_iommu_tlb_flush_all, .tlb_flush_walk = mtk_iommu_tlb_flush_range_sync, - .tlb_flush_leaf = mtk_iommu_tlb_flush_range_sync, .tlb_add_page = mtk_iommu_tlb_flush_page_nosync, }; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index fb4d5a763e0c..ea727eb1a1a9 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -25,8 +25,6 @@ enum io_pgtable_fmt { * @tlb_flush_walk: Synchronously invalidate all intermediate TLB state * (sometimes referred to as the "walk cache") for a virtual * address range. - * @tlb_flush_leaf: Synchronously invalidate all leaf TLB state for a virtual - * address range. * @tlb_add_page: Optional callback to queue up leaf TLB invalidation for a * single page. 
IOMMUs that cannot batch TLB invalidation * operations efficiently will typically issue them here, but @@ -40,8 +38,6 @@ struct iommu_flush_ops { void (*tlb_flush_all)(void *cookie); void (*tlb_flush_walk)(unsigned long iova, size_t size, size_t granule, void *cookie); - void (*tlb_flush_leaf)(unsigned long iova, size_t size, size_t granule, - void *cookie); void (*tlb_add_page)(struct iommu_iotlb_gather *gather, unsigned long iova, size_t granule, void *cookie); }; @@ -228,13 +224,6 @@ io_pgtable_tlb_flush_walk(struct io_pgtable *iop, unsigned long iova, iop->cfg.tlb->tlb_flush_walk(iova, size, granule, iop->cookie); } -static inline void -io_pgtable_tlb_flush_leaf(struct io_pgtable *iop, unsigned long iova, - size_t size, size_t granule) -{ - iop->cfg.tlb->tlb_flush_leaf(iova, size, granule, iop->cookie); -} - static inline void io_pgtable_tlb_add_page(struct io_pgtable *iop, struct iommu_iotlb_gather * gather, unsigned long iova, -- cgit From 1080399542075bb0e9d46ea80418d76784d1ece8 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Sat, 28 Nov 2020 07:09:23 +0530 Subject: PM / EM: Micro optimization in em_cpu_energy When the sum of the utilization of CPUs in a power domain is zero, return the energy as 0 without doing any computations. Acked-by: Quentin Perret Reviewed-by: Dietmar Eggemann Signed-off-by: Pavankumar Kondeti Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 9618c0a46ef4..757fc60658fa 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -106,6 +106,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, struct em_perf_state *ps; int i, cpu; + if (!sum_util) + return 0; + /* * In order to predict the performance state, map the utilization of * the most utilized CPU of the performance domain to a requested -- cgit From b577562ccc072ab4b09243740ebeca52309eecd2 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 8 Dec 2020 13:16:22 +0100 Subject: PCI: Remove unused HAVE_PCI_SET_MWI Remove unused HAVE_PCI_SET_MWI. Link: https://lore.kernel.org/r/03f20cac-708d-7897-c7c7-cb4e63cfd991@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 22207a79762c..fe824afc0e91 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1190,7 +1190,6 @@ void pci_clear_master(struct pci_dev *dev); int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state); int pci_set_cacheline_size(struct pci_dev *dev); -#define HAVE_PCI_SET_MWI int __must_check pci_set_mwi(struct pci_dev *dev); int __must_check pcim_set_mwi(struct pci_dev *dev); int pci_try_set_mwi(struct pci_dev *dev); -- cgit From f119cc9818eb33b66e977ad3af75aef6500bbdc3 Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 7 Dec 2020 18:51:41 +0800 Subject: net: stmmac: overwrite the dma_cap.addr64 according to HW design The current IP register MAC_HW_Feature1[ADDR64] only defines 32/40/64 bit width, but some SOCs support others like i.MX8MP support 34 bits but it maps to 40 bits width in MAC_HW_Feature1[ADDR64]. So overwrite dma_cap.addr64 according to HW real design. Fixes: 94abdad6974a ("net: ethernet: dwmac: add ethernet glue logic for NXP imx8 chip") Signed-off-by: Fugang Duan Signed-off-by: Joakim Zhang Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 9 +-------- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 8 ++++++++ include/linux/stmmac.h | 1 + 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index efef5476a577..223f69da7e95 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -246,13 +246,7 @@ static int imx_dwmac_probe(struct platform_device *pdev) goto err_parse_dt; } - ret = dma_set_mask_and_coherent(&pdev->dev, - DMA_BIT_MASK(dwmac->ops->addr_width)); - if (ret) { - dev_err(&pdev->dev, "DMA mask set failed\n"); - goto err_dma_mask; - } - + plat_dat->addr64 = dwmac->ops->addr_width; plat_dat->init = imx_dwmac_init; plat_dat->exit = imx_dwmac_exit; plat_dat->fix_mac_speed = imx_dwmac_fix_speed; @@ -272,7 +266,6 @@ static int imx_dwmac_probe(struct platform_device *pdev) err_dwmac_init: err_drv_probe: imx_dwmac_exit(pdev, plat_dat->bsp_priv); -err_dma_mask: err_parse_dt: err_match_data: stmmac_remove_config_dt(pdev, plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index d2521ebb8217..c33db79cdd0a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4945,6 +4945,14 @@ int stmmac_dvr_probe(struct device *device, dev_info(priv->device, "SPH feature enabled\n"); } + /* The current IP register MAC_HW_Feature1[ADDR64] only define + * 32/40/64 bit width, but some SOC support others like i.MX8MP + * support 34 bits but it map to 40 bits width in MAC_HW_Feature1[ADDR64]. + * So overwrite dma_cap.addr64 according to HW real design. + */ + if (priv->plat->addr64) + priv->dma_cap.addr64 = priv->plat->addr64; + if (priv->dma_cap.addr64) { ret = dma_set_mask_and_coherent(device, DMA_BIT_MASK(priv->dma_cap.addr64)); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 628e28903b8b..15ca6b4167cc 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -170,6 +170,7 @@ struct plat_stmmacenet_data { int unicast_filter_entries; int tx_fifo_size; int rx_fifo_size; + u32 addr64; u32 rx_queues_to_use; u32 tx_queues_to_use; u8 rx_sched_algorithm; -- cgit From b60da4955f53d1f50e44351a9c3a37a92503079e Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Tue, 8 Dec 2020 18:36:23 +0100 Subject: bpf: Only provide bpf_sock_from_file with CONFIG_NET This moves the bpf_sock_from_file definition into net/core/filter.c which only gets compiled with CONFIG_NET and also moves the helper proto usage next to other tracing helpers that are conditional on CONFIG_NET. This avoids ld: kernel/trace/bpf_trace.o: in function `bpf_sock_from_file': bpf_trace.c:(.text+0xe23): undefined reference to `sock_from_file' When compiling a kernel with BPF and without NET. 
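The shape of the fix is the standard pattern for BPF helpers that depend on another subsystem: keep the extern declaration unconditionally visible in the header, define the proto in a translation unit that is only built when the dependency is enabled, and hand the proto out from a func_proto resolver under the matching guard. A condensed sketch of that pattern (abbreviated, not the literal kernel sources):

    /* include/linux/bpf.h: declaration is always visible */
    extern const struct bpf_func_proto bpf_sock_from_file_proto;

    /* net/core/filter.c: only compiled when CONFIG_NET=y */
    const struct bpf_func_proto bpf_sock_from_file_proto = {
            .func     = bpf_sock_from_file,
            .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
            /* ... */
    };

    /* kernel/trace/bpf_trace.c: resolve it only under the same guard */
    switch (func_id) {
    #ifdef CONFIG_NET
    case BPF_FUNC_sock_from_file:
            return &bpf_sock_from_file_proto;
    #endif
    default:
            return NULL;
    }

This way no object file ever references sock_from_file() unless CONFIG_NET provides it, which is exactly the undefined reference the linker was complaining about.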
Reported-by: kernel test robot Reported-by: Randy Dunlap Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: Randy Dunlap Acked-by: Martin KaFai Lau Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20201208173623.1136863-1-revest@chromium.org --- include/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 22 ++-------------------- net/core/filter.c | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d05e75ed8c1b..07cb5d15e743 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1859,6 +1859,7 @@ extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; +extern const struct bpf_func_proto bpf_sock_from_file_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0cf0a6331482..52ddd217d6a1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1270,24 +1270,6 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_1(bpf_sock_from_file, struct file *, file) -{ - return (unsigned long) sock_from_file(file); -} - -BTF_ID_LIST(bpf_sock_from_file_btf_ids) -BTF_ID(struct, socket) -BTF_ID(struct, file) - -static const struct bpf_func_proto bpf_sock_from_file_proto = { - .func = bpf_sock_from_file, - .gpl_only = false, - .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, - .ret_btf_id = &bpf_sock_from_file_btf_ids[0], - .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], -}; - const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1384,8 +1366,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_bpf_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; - case BPF_FUNC_sock_from_file: - return &bpf_sock_from_file_proto; default: return NULL; } @@ -1778,6 +1758,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_tracing_proto; + case BPF_FUNC_sock_from_file: + return &bpf_sock_from_file_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? 
diff --git a/net/core/filter.c b/net/core/filter.c index 77001a35768f..255aeee72402 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10413,6 +10413,24 @@ const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; +BPF_CALL_1(bpf_sock_from_file, struct file *, file) +{ + return (unsigned long)sock_from_file(file); +} + +BTF_ID_LIST(bpf_sock_from_file_btf_ids) +BTF_ID(struct, socket) +BTF_ID(struct, file) + +const struct bpf_func_proto bpf_sock_from_file_proto = { + .func = bpf_sock_from_file, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &bpf_sock_from_file_btf_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], +}; + static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id) { -- cgit From e77dcb0b732dd355ca594909f6c2085dfc46cde2 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 6 Nov 2020 10:37:16 +0530 Subject: opp: Don't create an OPP table from dev_pm_opp_get_opp_table() It has been found that some users (like cpufreq-dt and others on LKML) have abused the helper dev_pm_opp_get_opp_table() to create the OPP table instead of just finding it, which is the wrong thing to do. This routine was meant for OPP core's internal working and exposed the whole functionality by mistake. Change the scope of dev_pm_opp_get_opp_table() to only finding the table. The internal helpers _opp_get_opp_table*() are thus renamed to _add_opp_table*(), dev_pm_opp_get_opp_table_indexed() is removed (as we don't need the index field for finding the OPP table) and so the only user, genpd, is updated. Note that the prototype of _add_opp_table() was already left in opp.h by mistake when it was removed earlier and so we weren't required to add it now. Acked-by: Ulf Hansson Signed-off-by: Viresh Kumar --- drivers/base/power/domain.c | 2 +- drivers/opp/core.c | 27 +++++++++++++-------------- drivers/opp/of.c | 4 ++-- drivers/opp/opp.h | 1 + include/linux/pm_opp.h | 1 - 5 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 743268996336..92b750b865d5 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2249,7 +2249,7 @@ int of_genpd_add_provider_onecell(struct device_node *np, * Save table for faster processing while setting * performance state. */ - genpd->opp_table = dev_pm_opp_get_opp_table_indexed(&genpd->dev, i); + genpd->opp_table = dev_pm_opp_get_opp_table(&genpd->dev); WARN_ON(IS_ERR(genpd->opp_table)); } diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 27a0e49b24ab..8f53c1b48911 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1138,7 +1138,7 @@ void _get_opp_table_kref(struct opp_table *opp_table) * uses the opp_tables_busy flag to indicate if another creator is in the middle * of adding an OPP table and others should wait for it to finish. 
*/ -static struct opp_table *_opp_get_opp_table(struct device *dev, int index) +struct opp_table *_add_opp_table_indexed(struct device *dev, int index) { struct opp_table *opp_table; @@ -1188,17 +1188,16 @@ unlock: return opp_table; } -struct opp_table *dev_pm_opp_get_opp_table(struct device *dev) +struct opp_table *_add_opp_table(struct device *dev) { - return _opp_get_opp_table(dev, 0); + return _add_opp_table_indexed(dev, 0); } -EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_table); -struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device *dev, - int index) +struct opp_table *dev_pm_opp_get_opp_table(struct device *dev) { - return _opp_get_opp_table(dev, index); + return _find_opp_table(dev); } +EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_table); static void _opp_table_kref_release(struct kref *kref) { @@ -1627,7 +1626,7 @@ struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev, { struct opp_table *opp_table; - opp_table = dev_pm_opp_get_opp_table(dev); + opp_table = _add_opp_table(dev); if (IS_ERR(opp_table)) return opp_table; @@ -1686,7 +1685,7 @@ struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name) { struct opp_table *opp_table; - opp_table = dev_pm_opp_get_opp_table(dev); + opp_table = _add_opp_table(dev); if (IS_ERR(opp_table)) return opp_table; @@ -1779,7 +1778,7 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev, struct regulator *reg; int ret, i; - opp_table = dev_pm_opp_get_opp_table(dev); + opp_table = _add_opp_table(dev); if (IS_ERR(opp_table)) return opp_table; @@ -1887,7 +1886,7 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name) struct opp_table *opp_table; int ret; - opp_table = dev_pm_opp_get_opp_table(dev); + opp_table = _add_opp_table(dev); if (IS_ERR(opp_table)) return opp_table; @@ -1955,7 +1954,7 @@ struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, if (!set_opp) return ERR_PTR(-EINVAL); - opp_table = dev_pm_opp_get_opp_table(dev); + opp_table = _add_opp_table(dev); if (IS_ERR(opp_table)) return opp_table; @@ -2039,7 +2038,7 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, int index = 0, ret = -EINVAL; const char **name = names; - opp_table = dev_pm_opp_get_opp_table(dev); + opp_table = _add_opp_table(dev); if (IS_ERR(opp_table)) return opp_table; @@ -2204,7 +2203,7 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) struct opp_table *opp_table; int ret; - opp_table = dev_pm_opp_get_opp_table(dev); + opp_table = _add_opp_table(dev); if (IS_ERR(opp_table)) return PTR_ERR(opp_table); diff --git a/drivers/opp/of.c b/drivers/opp/of.c index aa0ac5d4e479..6b7f0066942d 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -975,7 +975,7 @@ int dev_pm_opp_of_add_table(struct device *dev) struct opp_table *opp_table; int ret; - opp_table = dev_pm_opp_get_opp_table_indexed(dev, 0); + opp_table = _add_opp_table_indexed(dev, 0); if (IS_ERR(opp_table)) return PTR_ERR(opp_table); @@ -1030,7 +1030,7 @@ int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) index = 0; } - opp_table = dev_pm_opp_get_opp_table_indexed(dev, index); + opp_table = _add_opp_table_indexed(dev, index); if (IS_ERR(opp_table)) return PTR_ERR(opp_table); diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index ebd930e0b3ca..4ced7ffa8158 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -224,6 +224,7 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *o int _opp_add_v1(struct opp_table *opp_table, struct device *dev, 
unsigned long freq, long u_volt, bool dynamic); void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, int last_cpu); struct opp_table *_add_opp_table(struct device *dev); +struct opp_table *_add_opp_table_indexed(struct device *dev, int index); void _put_opp_list_kref(struct opp_table *opp_table); #ifdef CONFIG_OF diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index dbb484524f82..1435c054016a 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -90,7 +90,6 @@ struct dev_pm_set_opp_data { #if defined(CONFIG_PM_OPP) struct opp_table *dev_pm_opp_get_opp_table(struct device *dev); -struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device *dev, int index); void dev_pm_opp_put_opp_table(struct opp_table *opp_table); unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp); -- cgit From e4a9378083c57a22624cda8b780ff5f5da4de7ef Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 1 Dec 2020 19:17:33 -0800 Subject: usb: typec: tcpm: Pass down negotiated rev to update retry count nRetryCount was updated from 3 to 2 between PD2.0 and PD3.0 spec. nRetryCount in "Table 6-34 Counter parameters" of the PD 2.0 spec is set to 3, whereas, nRetryCount in "Table 6-59 Counter parameters" is set to 2. Pass down negotiated rev in pd_transmit so that low level chip drivers can update the retry count accordingly before attempting packet transmission. This helps in passing "TEST.PD.PORT.ALL.02" of the "Power Delivery Merged" test suite which was initially failing with "The UUT did not retransmit the message nReryCount times" In fusb302 & tcpci drivers, by default the driver sets the retry count to 3 (Default for PD 2.0). Update this to 2, if the negotiated rev is PD 3.0. In wcove, since the retry count is intentionally set to max, leaving it as is. 
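A minimal sketch of the revision-to-retry-count mapping the low-level drivers now apply (the helper name is hypothetical; PD_REV20 is the existing negotiated-revision constant, and the spec tables are the ones cited above):

    /* nRetryCount: 3 per PD 2.0 Table 6-34, 2 per PD 3.0 Table 6-59 */
    static unsigned int pd_retry_count(unsigned int negotiated_rev)
    {
            return negotiated_rev > PD_REV20 ? 2 : 3;
    }

Each driver then translates that count into its own register encoding, as the fusb302 and tcpci hunks below do.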
Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20201202031733.647808-1-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/fusb302.c | 16 +++++++++++----- drivers/usb/typec/tcpm/tcpci.c | 12 +++++++----- drivers/usb/typec/tcpm/tcpm.c | 2 +- drivers/usb/typec/tcpm/wcove.c | 3 ++- include/linux/usb/tcpm.h | 2 +- 5 files changed, 22 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/tcpm/fusb302.c b/drivers/usb/typec/tcpm/fusb302.c index 99562cc65ca6..ebc46b9f776c 100644 --- a/drivers/usb/typec/tcpm/fusb302.c +++ b/drivers/usb/typec/tcpm/fusb302.c @@ -343,12 +343,11 @@ static int fusb302_sw_reset(struct fusb302_chip *chip) return ret; } -static int fusb302_enable_tx_auto_retries(struct fusb302_chip *chip) +static int fusb302_enable_tx_auto_retries(struct fusb302_chip *chip, u8 retry_count) { int ret = 0; - ret = fusb302_i2c_set_bits(chip, FUSB_REG_CONTROL3, - FUSB_REG_CONTROL3_N_RETRIES_3 | + ret = fusb302_i2c_set_bits(chip, FUSB_REG_CONTROL3, retry_count | FUSB_REG_CONTROL3_AUTO_RETRY); return ret; @@ -399,7 +398,7 @@ static int tcpm_init(struct tcpc_dev *dev) ret = fusb302_sw_reset(chip); if (ret < 0) return ret; - ret = fusb302_enable_tx_auto_retries(chip); + ret = fusb302_enable_tx_auto_retries(chip, FUSB_REG_CONTROL3_N_RETRIES_3); if (ret < 0) return ret; ret = fusb302_init_interrupt(chip); @@ -1017,7 +1016,7 @@ static const char * const transmit_type_name[] = { }; static int tcpm_pd_transmit(struct tcpc_dev *dev, enum tcpm_transmit_type type, - const struct pd_message *msg) + const struct pd_message *msg, unsigned int negotiated_rev) { struct fusb302_chip *chip = container_of(dev, struct fusb302_chip, tcpc_dev); @@ -1026,6 +1025,13 @@ static int tcpm_pd_transmit(struct tcpc_dev *dev, enum tcpm_transmit_type type, mutex_lock(&chip->lock); switch (type) { case TCPC_TX_SOP: + /* nRetryCount 3 in P2.0 spec, whereas 2 in PD3.0 spec */ + ret = fusb302_enable_tx_auto_retries(chip, negotiated_rev > PD_REV20 ? + FUSB_REG_CONTROL3_N_RETRIES_2 : + FUSB_REG_CONTROL3_N_RETRIES_3); + if (ret < 0) + fusb302_log(chip, "Cannot update retry count ret=%d", ret); + ret = fusb302_pd_send_message(chip, msg); if (ret < 0) fusb302_log(chip, diff --git a/drivers/usb/typec/tcpm/tcpci.c b/drivers/usb/typec/tcpm/tcpci.c index 12d983a75510..98a2455f779f 100644 --- a/drivers/usb/typec/tcpm/tcpci.c +++ b/drivers/usb/typec/tcpm/tcpci.c @@ -18,7 +18,8 @@ #include "tcpci.h" -#define PD_RETRY_COUNT 3 +#define PD_RETRY_COUNT_DEFAULT 3 +#define PD_RETRY_COUNT_3_0_OR_HIGHER 2 #define AUTO_DISCHARGE_DEFAULT_THRESHOLD_MV 3500 #define AUTO_DISCHARGE_PD_HEADROOM_MV 850 #define AUTO_DISCHARGE_PPS_HEADROOM_MV 1250 @@ -447,9 +448,8 @@ static int tcpci_set_vbus(struct tcpc_dev *tcpc, bool source, bool sink) return 0; } -static int tcpci_pd_transmit(struct tcpc_dev *tcpc, - enum tcpm_transmit_type type, - const struct pd_message *msg) +static int tcpci_pd_transmit(struct tcpc_dev *tcpc, enum tcpm_transmit_type type, + const struct pd_message *msg, unsigned int negotiated_rev) { struct tcpci *tcpci = tcpc_to_tcpci(tcpc); u16 header = msg ? le16_to_cpu(msg->header) : 0; @@ -497,7 +497,9 @@ static int tcpci_pd_transmit(struct tcpc_dev *tcpc, } } - reg = (PD_RETRY_COUNT << TCPC_TRANSMIT_RETRY_SHIFT) | (type << TCPC_TRANSMIT_TYPE_SHIFT); + /* nRetryCount is 3 in PD2.0 spec where 2 in PD3.0 spec */ + reg = ((negotiated_rev > PD_REV20 ? 
PD_RETRY_COUNT_3_0_OR_HIGHER : PD_RETRY_COUNT_DEFAULT) + << TCPC_TRANSMIT_RETRY_SHIFT) | (type << TCPC_TRANSMIT_TYPE_SHIFT); ret = regmap_write(tcpci->regmap, TCPC_TRANSMIT, reg); if (ret < 0) return ret; diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 3bbc1f10af49..c73bc3a8356a 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -667,7 +667,7 @@ static int tcpm_pd_transmit(struct tcpm_port *port, tcpm_log(port, "PD TX, type: %#x", type); reinit_completion(&port->tx_complete); - ret = port->tcpc->pd_transmit(port->tcpc, type, msg); + ret = port->tcpc->pd_transmit(port->tcpc, type, msg, port->negotiated_rev); if (ret < 0) return ret; diff --git a/drivers/usb/typec/tcpm/wcove.c b/drivers/usb/typec/tcpm/wcove.c index 9b745f432c91..79ae63950050 100644 --- a/drivers/usb/typec/tcpm/wcove.c +++ b/drivers/usb/typec/tcpm/wcove.c @@ -356,7 +356,8 @@ static int wcove_set_pd_rx(struct tcpc_dev *tcpc, bool on) static int wcove_pd_transmit(struct tcpc_dev *tcpc, enum tcpm_transmit_type type, - const struct pd_message *msg) + const struct pd_message *msg, + unsigned int negotiated_rev) { struct wcove_typec *wcove = tcpc_to_wcove(tcpc); unsigned int info = 0; diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index e68aaa12886f..efaedd7e8a18 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -121,7 +121,7 @@ struct tcpc_dev { enum typec_cc_status cc); int (*try_role)(struct tcpc_dev *dev, int role); int (*pd_transmit)(struct tcpc_dev *dev, enum tcpm_transmit_type type, - const struct pd_message *msg); + const struct pd_message *msg, unsigned int negotiated_rev); int (*set_bist_data)(struct tcpc_dev *dev, bool on); int (*enable_frs)(struct tcpc_dev *dev, bool enable); void (*frs_sourcing_vbus)(struct tcpc_dev *dev); -- cgit From 28b43d3d746b89fc112fe681e018b39b43495dad Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 1 Dec 2020 20:08:38 -0800 Subject: usb: typec: tcpm: Introduce vsafe0v for vbus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCPM at present lacks the notion of VSAFE0V. There are three vbus threshold levels that are critical to track: a. vSafe5V - VBUS “5 volts” as defined by the USB PD specification. b. vSinkDisconnect - Threshold used for transition from Attached.SNK to Unattached.SNK. c. vSafe0V - VBUS “0 volts” as defined by the USB PD specification. Tracking vSafe0V is crucial for entry into Try.SNK and Attached.SRC and turning vbus back on by the source in response to hard reset. >From "4.5.2.2.8.2 Exiting from AttachWait.SRC State" section in the Type-C spec: "The port shall transition to Attached.SRC when VBUS is at vSafe0V and the SRC.Rd state is detected on exactly one of the CC1 or CC2 pins for at least tCCDebounce." "A DRP that strongly prefers the Sink role may optionally transition to Try.SNK instead of Attached.SRC when VBUS is at vSafe0V and the SRC.Rd state is detected on exactly one of the CC1 or CC2 pins for at least tCCDebounce." >From "7.1.5 Response to Hard Resets" section in the PD spec: "After establishing the vSafe0V voltage condition on VBUS, the Source Shall wait tSrcRecover before re-applying VCONN and restoring VBUS to vSafe5V." vbus_present in the TCPM code tracks vSafe5V(vbus_present is true) and vSinkDisconnect(vbus_present is false). This change adds is_vbus_vsafe0v callback which when set makes TCPM query for vSafe0V voltage level when needed. 
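Note that the new flag is not simply the complement of vbus_present: while discharging, VBUS can sit between vSafe0V max and vSinkDisconnect max, so both flags can be false at the same time. A sketch of the state being tracked (illustrative only; the threshold names are from the specification, not numeric limits):

    struct vbus_state {
            bool vbus_present;      /* set above vSafe5V min,
                                     * cleared below vSinkDisconnect max */
            bool vbus_vsafe0v;      /* set at or below vSafe0V max */
    };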
Since not all TCPC controllers might have the capability to report vSafe0V, TCPM assumes that vSafe0V is same as vSinkDisconnect when is_vbus_vsafe0v callback is not set. This allows TCPM to continue to support controllers which don't have the support for reporting vSafe0V. Introducing vSafe0V helps fix the failure reported at "Step 15. CVS verifies PUT remains in AttachWait.SRC for 500ms" of "TD 4.7.2 Try. SNK DRP Connect DRP Test" of "Universal Serial Bus Type-C (USB Type-C) Functional Test Specification Chapters 4 and 5". Here the compliance tester intentionally maintains vbus at greater than vSafe0V and expects the Product under test to stay in AttachWait.SRC till vbus drops to vSafe0V. Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20201202040840.663578-1-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 63 +++++++++++++++++++++++++++++++++++-------- include/linux/usb/tcpm.h | 7 +++++ 2 files changed, 59 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 1c64b831bbb5..cedc6cf82d61 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -258,7 +258,19 @@ struct tcpm_port { bool attached; bool connected; enum typec_port_type port_type; + + /* + * Set to true when vbus is greater than VSAFE5V min. + * Set to false when vbus falls below vSinkDisconnect max threshold. + */ bool vbus_present; + + /* + * Set to true when vbus is less than VSAFE0V max. + * Set to false when vbus is greater than VSAFE0V max. + */ + bool vbus_vsafe0v; + bool vbus_never_low; bool vbus_source; bool vbus_charge; @@ -3093,7 +3105,7 @@ static void run_state_machine(struct tcpm_port *port) else if (tcpm_port_is_audio(port)) tcpm_set_state(port, AUDIO_ACC_ATTACHED, PD_T_CC_DEBOUNCE); - else if (tcpm_port_is_source(port)) + else if (tcpm_port_is_source(port) && port->vbus_vsafe0v) tcpm_set_state(port, tcpm_try_snk(port) ? SNK_TRY : SRC_ATTACHED, @@ -4096,6 +4108,12 @@ static void _tcpm_pd_vbus_on(struct tcpm_port *port) { tcpm_log_force(port, "VBUS on"); port->vbus_present = true; + /* + * When vbus_present is true i.e. Voltage at VBUS is greater than VSAFE5V implicitly + * states that vbus is not at VSAFE0V, hence clear the vbus_vsafe0v flag here. + */ + port->vbus_vsafe0v = false; + switch (port->state) { case SNK_TRANSITION_SINK_VBUS: port->explicit_contract = true; @@ -4185,16 +4203,8 @@ static void _tcpm_pd_vbus_off(struct tcpm_port *port) case SNK_HARD_RESET_SINK_OFF: tcpm_set_state(port, SNK_HARD_RESET_WAIT_VBUS, 0); break; - case SRC_HARD_RESET_VBUS_OFF: - /* - * After establishing the vSafe0V voltage condition on VBUS, the Source Shall wait - * tSrcRecover before re-applying VCONN and restoring VBUS to vSafe5V. - */ - tcpm_set_state(port, SRC_HARD_RESET_VBUS_ON, PD_T_SRC_RECOVER); - break; case HARD_RESET_SEND: break; - case SNK_TRY: /* Do nothing, waiting for timeout */ break; @@ -4265,6 +4275,28 @@ static void _tcpm_pd_vbus_off(struct tcpm_port *port) } } +static void _tcpm_pd_vbus_vsafe0v(struct tcpm_port *port) +{ + tcpm_log_force(port, "VBUS VSAFE0V"); + port->vbus_vsafe0v = true; + switch (port->state) { + case SRC_HARD_RESET_VBUS_OFF: + /* + * After establishing the vSafe0V voltage condition on VBUS, the Source Shall wait + * tSrcRecover before re-applying VCONN and restoring VBUS to vSafe5V. 
+ */ + tcpm_set_state(port, SRC_HARD_RESET_VBUS_ON, PD_T_SRC_RECOVER); + break; + case SRC_ATTACH_WAIT: + if (tcpm_port_is_source(port)) + tcpm_set_state(port, tcpm_try_snk(port) ? SNK_TRY : SRC_ATTACHED, + PD_T_CC_DEBOUNCE); + break; + default: + break; + } +} + static void _tcpm_pd_hard_reset(struct tcpm_port *port) { tcpm_log_force(port, "Received hard reset"); @@ -4300,10 +4332,19 @@ static void tcpm_pd_event_handler(struct kthread_work *work) bool vbus; vbus = port->tcpc->get_vbus(port->tcpc); - if (vbus) + if (vbus) { _tcpm_pd_vbus_on(port); - else + } else { _tcpm_pd_vbus_off(port); + /* + * When TCPC does not support detecting vsafe0v voltage level, + * treat vbus absent as vsafe0v. Else invoke is_vbus_vsafe0v + * to see if vbus has discharge to VSAFE0V. + */ + if (!port->tcpc->is_vbus_vsafe0v || + port->tcpc->is_vbus_vsafe0v(port->tcpc)) + _tcpm_pd_vbus_vsafe0v(port); + } } if (events & TCPM_CC_EVENT) { enum typec_cc_status cc1, cc2; diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index efaedd7e8a18..f4a18427f5c4 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -98,6 +98,12 @@ enum tcpm_transmit_type { * will be turned on. requested_vbus_voltage is set to 0 when vbus * is going to disappear knowingly i.e. during PR_SWAP and * HARD_RESET etc. + * @is_vbus_vsafe0v: + * Optional; TCPCI spec based TCPC implementations are expected to + * detect VSAFE0V voltage level at vbus. When detection of VSAFE0V + * is supported by TCPC, set this callback for TCPM to query + * whether vbus is at VSAFE0V when needed. + * Returns true when vbus is at VSAFE0V, false otherwise. */ struct tcpc_dev { struct fwnode_handle *fwnode; @@ -128,6 +134,7 @@ struct tcpc_dev { int (*enable_auto_vbus_discharge)(struct tcpc_dev *dev, bool enable); int (*set_auto_vbus_discharge_threshold)(struct tcpc_dev *dev, enum typec_pwr_opmode mode, bool pps_active, u32 requested_vbus_voltage); + bool (*is_vbus_vsafe0v)(struct tcpc_dev *dev); }; struct tcpm_port; -- cgit From af633212c4aac1dbd3a48615a834646ce072346d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 2 Dec 2020 12:39:36 +0100 Subject: tty: use assign_bit() in port-flag accessors Use the new assign_bit() wrapper in the port-flag accessors instead of open coding. 
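For reference, assign_bit() from <linux/bitops.h> is essentially the open-coded pattern folded into a single helper, roughly:

    static __always_inline void assign_bit(long nr, volatile unsigned long *addr,
                                           bool value)
    {
            if (value)
                    set_bit(nr, addr);
            else
                    clear_bit(nr, addr);
    }

so each accessor below collapses from a four-line if/else to a single line with no change in behavior.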
Suggested-by: Jiri Slaby Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20201202113942.27024-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 0c2f97ba8018..9b8e7d5bf2fc 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -615,10 +615,7 @@ static inline bool tty_port_cts_enabled(struct tty_port *port) static inline void tty_port_set_cts_flow(struct tty_port *port, bool val) { - if (val) - set_bit(TTY_PORT_CTS_FLOW, &port->iflags); - else - clear_bit(TTY_PORT_CTS_FLOW, &port->iflags); + assign_bit(TTY_PORT_CTS_FLOW, &port->iflags, val); } static inline bool tty_port_active(struct tty_port *port) @@ -628,10 +625,7 @@ static inline bool tty_port_active(struct tty_port *port) static inline void tty_port_set_active(struct tty_port *port, bool val) { - if (val) - set_bit(TTY_PORT_ACTIVE, &port->iflags); - else - clear_bit(TTY_PORT_ACTIVE, &port->iflags); + assign_bit(TTY_PORT_ACTIVE, &port->iflags, val); } static inline bool tty_port_check_carrier(struct tty_port *port) @@ -641,10 +635,7 @@ static inline bool tty_port_check_carrier(struct tty_port *port) static inline void tty_port_set_check_carrier(struct tty_port *port, bool val) { - if (val) - set_bit(TTY_PORT_CHECK_CD, &port->iflags); - else - clear_bit(TTY_PORT_CHECK_CD, &port->iflags); + assign_bit(TTY_PORT_CHECK_CD, &port->iflags, val); } static inline bool tty_port_suspended(struct tty_port *port) @@ -654,10 +645,7 @@ static inline bool tty_port_suspended(struct tty_port *port) static inline void tty_port_set_suspended(struct tty_port *port, bool val) { - if (val) - set_bit(TTY_PORT_SUSPENDED, &port->iflags); - else - clear_bit(TTY_PORT_SUSPENDED, &port->iflags); + assign_bit(TTY_PORT_SUSPENDED, &port->iflags, val); } static inline bool tty_port_initialized(struct tty_port *port) @@ -667,10 +655,7 @@ static inline bool tty_port_initialized(struct tty_port *port) static inline void tty_port_set_initialized(struct tty_port *port, bool val) { - if (val) - set_bit(TTY_PORT_INITIALIZED, &port->iflags); - else - clear_bit(TTY_PORT_INITIALIZED, &port->iflags); + assign_bit(TTY_PORT_INITIALIZED, &port->iflags, val); } static inline bool tty_port_kopened(struct tty_port *port) @@ -680,10 +665,7 @@ static inline bool tty_port_kopened(struct tty_port *port) static inline void tty_port_set_kopened(struct tty_port *port, bool val) { - if (val) - set_bit(TTY_PORT_KOPENED, &port->iflags); - else - clear_bit(TTY_PORT_KOPENED, &port->iflags); + assign_bit(TTY_PORT_KOPENED, &port->iflags, val); } extern struct tty_struct *tty_port_tty_get(struct tty_port *port); -- cgit From 9e1792727ead477f49958578d0dbd466a7deea48 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 2 Dec 2020 12:39:37 +0100 Subject: tty: use const parameters in port-flag accessors Declare the port parameter to the flag-test accessors as const. This is currently mostly cosmetic as the accessors are already inlined. 
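The practical benefit shows up in callers: a read-only helper can now take a const port itself without casting. A hypothetical example:

    static void log_port_state(const struct tty_port *port)
    {
            pr_debug("active=%d suspended=%d\n",
                     tty_port_active(port), tty_port_suspended(port));
    }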
Suggested-by: Jiri Slaby Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20201202113942.27024-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 9b8e7d5bf2fc..c873f475f0a7 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -608,7 +608,7 @@ static inline struct tty_port *tty_port_get(struct tty_port *port) } /* If the cts flow control is enabled, return true. */ -static inline bool tty_port_cts_enabled(struct tty_port *port) +static inline bool tty_port_cts_enabled(const struct tty_port *port) { return test_bit(TTY_PORT_CTS_FLOW, &port->iflags); } @@ -618,7 +618,7 @@ static inline void tty_port_set_cts_flow(struct tty_port *port, bool val) assign_bit(TTY_PORT_CTS_FLOW, &port->iflags, val); } -static inline bool tty_port_active(struct tty_port *port) +static inline bool tty_port_active(const struct tty_port *port) { return test_bit(TTY_PORT_ACTIVE, &port->iflags); } @@ -628,7 +628,7 @@ static inline void tty_port_set_active(struct tty_port *port, bool val) assign_bit(TTY_PORT_ACTIVE, &port->iflags, val); } -static inline bool tty_port_check_carrier(struct tty_port *port) +static inline bool tty_port_check_carrier(const struct tty_port *port) { return test_bit(TTY_PORT_CHECK_CD, &port->iflags); } @@ -638,7 +638,7 @@ static inline void tty_port_set_check_carrier(struct tty_port *port, bool val) assign_bit(TTY_PORT_CHECK_CD, &port->iflags, val); } -static inline bool tty_port_suspended(struct tty_port *port) +static inline bool tty_port_suspended(const struct tty_port *port) { return test_bit(TTY_PORT_SUSPENDED, &port->iflags); } @@ -648,7 +648,7 @@ static inline void tty_port_set_suspended(struct tty_port *port, bool val) assign_bit(TTY_PORT_SUSPENDED, &port->iflags, val); } -static inline bool tty_port_initialized(struct tty_port *port) +static inline bool tty_port_initialized(const struct tty_port *port) { return test_bit(TTY_PORT_INITIALIZED, &port->iflags); } @@ -658,7 +658,7 @@ static inline void tty_port_set_initialized(struct tty_port *port, bool val) assign_bit(TTY_PORT_INITIALIZED, &port->iflags, val); } -static inline bool tty_port_kopened(struct tty_port *port) +static inline bool tty_port_kopened(const struct tty_port *port) { return test_bit(TTY_PORT_KOPENED, &port->iflags); } -- cgit From 4b03d99794eeed27650597a886247c6427ce1055 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 30 Nov 2020 17:46:16 -0500 Subject: nfsd: minor nfsd4_change_attribute cleanup Minor cleanup, no change in behavior. Also pull out a common helper that'll be useful elsewhere. Signed-off-by: J. 
Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfsfh.h | 13 +++++-------- include/linux/iversion.h | 13 +++++++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 39d764b129fa..45bd776290d5 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -259,19 +259,16 @@ fh_clear_wcc(struct svc_fh *fhp) static inline u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) { - u64 chattr; - if (IS_I_VERSION(inode)) { + u64 chattr; + chattr = stat->ctime.tv_sec; chattr <<= 30; chattr += stat->ctime.tv_nsec; chattr += inode_query_iversion(inode); - } else { - chattr = stat->ctime.tv_sec; - chattr <<= 32; - chattr += stat->ctime.tv_nsec; - } - return chattr; + return chattr; + } else + return time_to_chattr(&stat->ctime); } extern void fill_pre_wcc(struct svc_fh *fhp); diff --git a/include/linux/iversion.h b/include/linux/iversion.h index 2917ef990d43..3bfebde5a1a6 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -328,6 +328,19 @@ inode_query_iversion(struct inode *inode) return cur >> I_VERSION_QUERIED_SHIFT; } +/* + * For filesystems without any sort of change attribute, the best we can + * do is fake one up from the ctime: + */ +static inline u64 time_to_chattr(struct timespec64 *t) +{ + u64 chattr = t->tv_sec; + + chattr <<= 32; + chattr += t->tv_nsec; + return chattr; +} + /** * inode_eq_iversion_raw - check whether the raw i_version counter has changed * @inode: inode to check -- cgit From 1631087ba8727db03c0ab2815dc06dc25d962b80 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 30 Nov 2020 17:46:18 -0500 Subject: Revert "nfsd4: support change_attr_type attribute" This reverts commit a85857633b04d57f4524cca0a2bfaf87b2543f9f. We're still factoring ctime into our change attribute even in the IS_I_VERSION case. If someone sets the system time backwards, a client could see the change attribute go backwards. Maybe we can just say "well, don't do that", but there's some question whether that's good enough, or whether we need a better guarantee. Also, the client still isn't actually using the attribute. While we're still figuring this out, let's just stop returning this attribute. Signed-off-by: J. 
Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 10 ---------- fs/nfsd/nfsd.h | 1 - include/linux/nfs4.h | 8 -------- 3 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7f650fa47006..45ee6b12ce5b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3311,16 +3311,6 @@ out_acl: goto out; } - if (bmval2 & FATTR4_WORD2_CHANGE_ATTR_TYPE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - if (IS_I_VERSION(d_inode(dentry))) - *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR); - else - *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_TIME_METADATA); - } - #ifdef CONFIG_NFSD_V4_SECURITY_LABEL if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { status = nfsd4_encode_security_label(xdr, rqstp, context, diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7907de3f2ee6..d63cf8196fed 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -395,7 +395,6 @@ void nfsd_lockd_shutdown(void); #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ - FATTR4_WORD2_CHANGE_ATTR_TYPE | \ FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS | \ FATTR4_WORD2_XATTR_SUPPORT) diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 9dc7eeac924f..5b4c67c91f56 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -385,13 +385,6 @@ enum lock_type4 { NFS4_WRITEW_LT = 4 }; -enum change_attr_type4 { - NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR = 0, - NFS4_CHANGE_TYPE_IS_VERSION_COUNTER = 1, - NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS = 2, - NFS4_CHANGE_TYPE_IS_TIME_METADATA = 3, - NFS4_CHANGE_TYPE_IS_UNDEFINED = 4 -}; /* Mandatory Attributes */ #define FATTR4_WORD0_SUPPORTED_ATTRS (1UL << 0) @@ -459,7 +452,6 @@ enum change_attr_type4 { #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_CLONE_BLKSIZE (1UL << 13) -#define FATTR4_WORD2_CHANGE_ATTR_TYPE (1UL << 15) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) #define FATTR4_WORD2_MODE_UMASK (1UL << 17) #define FATTR4_WORD2_XATTR_SUPPORT (1UL << 18) -- cgit From daab110e47f8d7aa6da66923e3ac1a8dbd2b2a72 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 30 Nov 2020 17:03:14 -0500 Subject: nfsd: add a new EXPORT_OP_NOWCC flag to struct export_operations With NFSv3 nfsd will always attempt to send along WCC data to the client. This generally involves saving off the in-core inode information prior to doing the operation on the given filehandle, and then issuing a vfs_getattr to it after the op. Some filesystems (particularly clustered or networked ones) have an expensive ->getattr inode operation. Atomicity is also often difficult or impossible to guarantee on such filesystems. For those, we're best off not trying to provide WCC information to the client at all, and to simply allow it to poll for that information as needed with a GETATTR RPC. This patch adds a new flags field to struct export_operations, and defines a new EXPORT_OP_NOWCC flag that filesystems can use to indicate that nfsd should not attempt to provide WCC info in NFSv3 replies. It also adds a blurb about the new flags field and flag to the exporting documentation. The server will also now skip collecting this information for NFSv2 as well, since that info is never used there anyway. Note that this patch does not add this flag to any filesystem export_operations structures. This was originally developed to allow reexporting nfs via nfsd. Other filesystems may want to consider enabling this flag too. 
It's hard to tell however which ones have export operations to enable export via knfsd and which ones mostly rely on them for open-by-filehandle support, so I'm leaving that up to the individual maintainers to decide. I am cc'ing the relevant lists for those filesystems that I think may want to consider adding this though. Cc: HPDD-discuss@lists.01.org Cc: ceph-devel@vger.kernel.org Cc: cluster-devel@redhat.com Cc: fuse-devel@lists.sourceforge.net Cc: ocfs2-devel@oss.oracle.com Signed-off-by: Jeff Layton Signed-off-by: Lance Shelton Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- Documentation/filesystems/nfs/exporting.rst | 27 +++++++++++++++++++++++++++ fs/nfs/export.c | 1 + fs/nfsd/nfs3xdr.c | 7 +++++-- fs/nfsd/nfsfh.c | 14 ++++++++++++++ fs/nfsd/nfsfh.h | 2 +- include/linux/exportfs.h | 2 ++ 6 files changed, 50 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst index 33d588a01ace..cbe542ad5233 100644 --- a/Documentation/filesystems/nfs/exporting.rst +++ b/Documentation/filesystems/nfs/exporting.rst @@ -154,6 +154,11 @@ struct which has the following members: to find potential names, and matches inode numbers to find the correct match. + flags + Some filesystems may need to be handled differently than others. The + export_operations struct also includes a flags field that allows the + filesystem to communicate such information to nfsd. See the Export + Operations Flags section below for more explanation. A filehandle fragment consists of an array of 1 or more 4byte words, together with a one byte "type". @@ -163,3 +168,25 @@ generated by encode_fh, in which case it will have been padded with nuls. Rather, the encode_fh routine should choose a "type" which indicates the decode_fh how much of the filehandle is valid, and how it should be interpreted. + +Export Operations Flags +----------------------- +In addition to the operation vector pointers, struct export_operations also +contains a "flags" field that allows the filesystem to communicate to nfsd +that it may want to do things differently when dealing with it. The +following flags are defined: + + EXPORT_OP_NOWCC - disable NFSv3 WCC attributes on this filesystem + RFC 1813 recommends that servers always send weak cache consistency + (WCC) data to the client after each operation. The server should + atomically collect attributes about the inode, do an operation on it, + and then collect the attributes afterward. This allows the client to + skip issuing GETATTRs in some situations but means that the server + is calling vfs_getattr for almost all RPCs. On some filesystems + (particularly those that are clustered or networked) this is expensive + and atomicity is difficult to guarantee. This flag indicates to nfsd + that it should skip providing WCC attributes to the client in NFSv3 + replies when doing operations on this filesystem. Consider enabling + this on filesystems that have an expensive ->getattr inode operation, + or when atomicity between pre and post operation attribute collection + is impossible to guarantee. 
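A filesystem opts in simply by setting the new flag in its export_operations; a hypothetical example (the real nfs_export_ops change is in the hunk below):

    static const struct export_operations examplefs_export_ops = {
            .encode_fh    = examplefs_encode_fh,
            .fh_to_dentry = examplefs_fh_to_dentry,
            .flags        = EXPORT_OP_NOWCC,  /* ->getattr is expensive here */
    };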
diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 3430d6891e89..8f4c528865c5 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -171,4 +171,5 @@ const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, + .flags = EXPORT_OP_NOWCC, }; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index b0a53c857706..821db21ba072 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -206,7 +206,7 @@ static __be32 * encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; - if (dentry && d_really_is_positive(dentry)) { + if (!fhp->fh_no_wcc && dentry && d_really_is_positive(dentry)) { __be32 err; struct kstat stat; @@ -262,7 +262,7 @@ void fill_pre_wcc(struct svc_fh *fhp) bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); __be32 err; - if (fhp->fh_pre_saved) + if (fhp->fh_no_wcc || fhp->fh_pre_saved) return; inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); @@ -290,6 +290,9 @@ void fill_post_wcc(struct svc_fh *fhp) struct inode *inode = d_inode(fhp->fh_dentry); __be32 err; + if (fhp->fh_no_wcc) + return; + if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c81dbbad8792..9c29a523f484 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -291,6 +291,16 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) fhp->fh_dentry = dentry; fhp->fh_export = exp; + + switch (rqstp->rq_vers) { + case 3: + if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOWCC) + fhp->fh_no_wcc = true; + break; + case 2: + fhp->fh_no_wcc = true; + } + return 0; out: exp_put(exp); @@ -559,6 +569,9 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, */ set_version_and_fsid_type(fhp, exp, ref_fh); + /* If we have a ref_fh, then copy the fh_no_wcc setting from it. */ + fhp->fh_no_wcc = ref_fh ? 
ref_fh->fh_no_wcc : false; + if (ref_fh == fhp) fh_put(ref_fh); @@ -662,6 +675,7 @@ fh_put(struct svc_fh *fhp) exp_put(exp); fhp->fh_export = NULL; } + fhp->fh_no_wcc = false; return; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 45bd776290d5..347d10aa6265 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -35,6 +35,7 @@ typedef struct svc_fh { bool fh_locked; /* inode locked by us */ bool fh_want_write; /* remount protection taken */ + bool fh_no_wcc; /* no wcc data needed */ int fh_flags; /* FH flags */ #ifdef CONFIG_NFSD_V3 bool fh_post_saved; /* post-op attrs saved */ @@ -54,7 +55,6 @@ typedef struct svc_fh { struct kstat fh_post_attr; /* full attrs after operation */ u64 fh_post_change; /* nfsv4 change; see above */ #endif /* CONFIG_NFSD_V3 */ - } svc_fh; #define NFSD4_FH_FOREIGN (1<<0) #define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f)) diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 3ceb72b67a7a..e7de0103a32e 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -213,6 +213,8 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); +#define EXPORT_OP_NOWCC (0x1) /* Don't collect wcc data for NFSv3 replies */ + unsigned long flags; }; extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, -- cgit From ba5e8187c55555519ae0b63c0fb681391bc42af9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 30 Nov 2020 17:03:15 -0500 Subject: nfsd: allow filesystems to opt out of subtree checking When we start allowing NFS to be reexported, then we have some problems when it comes to subtree checking. In principle, we could allow it, but it would mean encoding parent info in the filehandles and there may not be enough space for that in a NFSv3 filehandle. To enforce this at export upcall time, we add a new export_ops flag that declares the filesystem ineligible for subtree checking. Signed-off-by: Jeff Layton Signed-off-by: Lance Shelton Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- Documentation/filesystems/nfs/exporting.rst | 12 ++++++++++++ fs/nfs/export.c | 2 +- fs/nfsd/export.c | 6 ++++++ include/linux/exportfs.h | 1 + 4 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst index cbe542ad5233..960be64446cb 100644 --- a/Documentation/filesystems/nfs/exporting.rst +++ b/Documentation/filesystems/nfs/exporting.rst @@ -190,3 +190,15 @@ following flags are defined: this on filesystems that have an expensive ->getattr inode operation, or when atomicity between pre and post operation attribute collection is impossible to guarantee. + + EXPORT_OP_NOSUBTREECHK - disallow subtree checking on this fs + Many NFS operations deal with filehandles, which the server must then + vet to ensure that they live inside of an exported tree. When the + export consists of an entire filesystem, this is trivial. nfsd can just + ensure that the filehandle live on the filesystem. When only part of a + filesystem is exported however, then nfsd must walk the ancestors of the + inode to ensure that it's within an exported subtree. This is an + expensive operation and not all filesystems can support it properly. + This flag exempts the filesystem from subtree checking and causes + exportfs to get back an error if it tries to enable subtree checking + on it. 
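In practice this means an export of such a filesystem has to be configured with no_subtree_check, or check_export() will fail the upcall with -EINVAL as in the hunk below; e.g. a hypothetical /etc/exports entry:

    /srv/reexport  *(rw,no_subtree_check,fsid=1)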
diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 8f4c528865c5..b9ba306bf912 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -171,5 +171,5 @@ const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, - .flags = EXPORT_OP_NOWCC, + .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK, }; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 21e404e7cb68..81e7bb12aca6 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -408,6 +408,12 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid) return -EINVAL; } + if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK && + !(*flags & NFSEXP_NOSUBTREECHECK)) { + dprintk("%s: %s does not support subtree checking!\n", + __func__, inode->i_sb->s_type->name); + return -EINVAL; + } return 0; } diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index e7de0103a32e..2fcbab0f6b61 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -214,6 +214,7 @@ struct export_operations { int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); #define EXPORT_OP_NOWCC (0x1) /* Don't collect wcc data for NFSv3 replies */ +#define EXPORT_OP_NOSUBTREECHK (0x2) /* Subtree checking is not supported! */ unsigned long flags; }; -- cgit From 7f84b488f9add1d5cca3e6197c95914c7bd3c1cf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 30 Nov 2020 17:03:16 -0500 Subject: nfsd: close cached files prior to a REMOVE or RENAME that would replace target It's not uncommon for some workloads to do a bunch of I/O to a file and delete it just afterward. If knfsd has a cached open file however, then the file may still be open when the dentry is unlinked. If the underlying filesystem is nfs, then that could trigger it to do a sillyrename. On a REMOVE or RENAME scan the nfsd_file cache for open files that correspond to the inode, and proactively unhash and put their references. This should prevent any delete-on-last-close activity from occurring, solely due to knfsd's open file cache. This must be done synchronously though so we use the variants that call flush_delayed_fput. There are deadlock possibilities if you call flush_delayed_fput while holding locks, however. In the case of nfsd_rename, we don't even do the lookups of the dentries to be renamed until we've locked for rename. Once we've figured out what the target dentry is for a rename, check to see whether there are cached open files associated with it. If there are, then unwind all of the locking, close them all, and then reattempt the rename. None of this is really necessary for "typical" filesystems though. It's mostly of use for NFS, so declare a new export op flag and use that to determine whether to close the files beforehand. 
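Condensed, the retry loop described above has this shape (a sketch only; locking details, lookups and the actual vfs_rename() are elided, see the full hunk below):

    retry:
            /* lock for rename, look up odentry and ndentry ... */
            if ((ndentry->d_sb->s_export_op->flags &
                 EXPORT_OP_CLOSE_BEFORE_UNLINK) &&
                nfsd_has_cached_files(ndentry)) {
                    close_cached = true;    /* must not flush fputs under locks */
                    goto out_unlock;
            }
            /* ... vfs_rename() ... */
    out_unlock:
            /* drop locks and references ... */
            if (close_cached) {
                    close_cached = false;
                    nfsd_close_cached_files(ndentry);
                    dput(ndentry);
                    goto retry;
            }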
Signed-off-by: Jeff Layton Signed-off-by: Lance Shelton Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- Documentation/filesystems/nfs/exporting.rst | 13 +++++++++++++ fs/nfs/export.c | 2 +- fs/nfsd/vfs.c | 16 +++++++++------- include/linux/exportfs.h | 5 +++-- 4 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst index 960be64446cb..0e98edd353b5 100644 --- a/Documentation/filesystems/nfs/exporting.rst +++ b/Documentation/filesystems/nfs/exporting.rst @@ -202,3 +202,16 @@ following flags are defined: This flag exempts the filesystem from subtree checking and causes exportfs to get back an error if it tries to enable subtree checking on it. + + EXPORT_OP_CLOSE_BEFORE_UNLINK - always close cached files before unlinking + On some exportable filesystems (such as NFS) unlinking a file that + is still open can cause a fair bit of extra work. For instance, + the NFS client will do a "sillyrename" to ensure that the file + sticks around while it's still open. When reexporting, that open + file is held by nfsd so we usually end up doing a sillyrename, and + then immediately deleting the sillyrenamed file just afterward when + the link count actually goes to zero. Sometimes this delete can race + with other operations (for instance an rmdir of the parent directory). + This flag causes nfsd to close any open files for this inode _before_ + calling into the vfs to do an unlink or a rename that would replace + an existing file. diff --git a/fs/nfs/export.c b/fs/nfs/export.c index b9ba306bf912..5428713af5fe 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -171,5 +171,5 @@ const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, - .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK, + .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|EXPORT_OP_CLOSE_BEFORE_UNLINK, }; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 1ecaceebee13..79cba942087e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1724,7 +1724,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct inode *fdir, *tdir; __be32 err; int host_err; - bool has_cached = false; + bool close_cached = false; err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) @@ -1783,8 +1783,9 @@ retry: if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - if (nfsd_has_cached_files(ndentry)) { - has_cached = true; + if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && + nfsd_has_cached_files(ndentry)) { + close_cached = true; goto out_dput_old; } else { host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); @@ -1805,7 +1806,7 @@ retry: * as that would do the wrong thing if the two directories * were the same, so again we do it by hand. */ - if (!has_cached) { + if (!close_cached) { fill_post_wcc(ffhp); fill_post_wcc(tfhp); } @@ -1819,8 +1820,8 @@ retry: * shouldn't be done with locks held however, so we delay it until this * point and then reattempt the whole shebang. 
*/ - if (has_cached) { - has_cached = false; + if (close_cached) { + close_cached = false; nfsd_close_cached_files(ndentry); dput(ndentry); goto retry; @@ -1872,7 +1873,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, type = d_inode(rdentry)->i_mode & S_IFMT; if (type != S_IFDIR) { - nfsd_close_cached_files(rdentry); + if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) + nfsd_close_cached_files(rdentry); host_err = vfs_unlink(dirp, rdentry, NULL); } else { host_err = vfs_rmdir(dirp, rdentry); diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 2fcbab0f6b61..d829403ffd3b 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -213,8 +213,9 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); -#define EXPORT_OP_NOWCC (0x1) /* Don't collect wcc data for NFSv3 replies */ -#define EXPORT_OP_NOSUBTREECHK (0x2) /* Subtree checking is not supported! */ +#define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ +#define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ +#define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ unsigned long flags; }; -- cgit From d045465fc6cbfa4acfb5a7d817a7c1a57a078109 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 30 Nov 2020 17:03:17 -0500 Subject: exportfs: Add a function to return the raw output from fh_to_dentry() In order to allow nfsd to accept return values that are not acceptable to overlayfs and others, add a new function. Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- fs/exportfs/expfs.c | 32 ++++++++++++++++++++++++-------- include/linux/exportfs.h | 5 +++++ 2 files changed, 29 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 2dd55b172d57..0106eba46d5a 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -417,9 +417,11 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, } EXPORT_SYMBOL_GPL(exportfs_encode_fh); -struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, - int fh_len, int fileid_type, - int (*acceptable)(void *, struct dentry *), void *context) +struct dentry * +exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, + int fileid_type, + int (*acceptable)(void *, struct dentry *), + void *context) { const struct export_operations *nop = mnt->mnt_sb->s_export_op; struct dentry *result, *alias; @@ -432,10 +434,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (!nop || !nop->fh_to_dentry) return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); - if (PTR_ERR(result) == -ENOMEM) - return ERR_CAST(result); if (IS_ERR_OR_NULL(result)) - return ERR_PTR(-ESTALE); + return result; /* * If no acceptance criteria was specified by caller, a disconnected @@ -561,10 +561,26 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, err_result: dput(result); - if (err != -ENOMEM) - err = -ESTALE; return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(exportfs_decode_fh_raw); + +struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, + int fh_len, int fileid_type, + int (*acceptable)(void *, struct dentry *), + void *context) +{ + struct dentry *ret; + + ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, + acceptable, context); + if (IS_ERR_OR_NULL(ret)) { + if (ret == 
ERR_PTR(-ENOMEM)) + return ret; + return ERR_PTR(-ESTALE); + } + return ret; +} EXPORT_SYMBOL_GPL(exportfs_decode_fh); MODULE_LICENSE("GPL"); diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index d829403ffd3b..846df3c96730 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -223,6 +223,11 @@ extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len, struct inode *parent); extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, int connectable); +extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt, + struct fid *fid, int fh_len, + int fileid_type, + int (*acceptable)(void *, struct dentry *), + void *context); extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *), void *context); -- cgit From 01cbf3853959feec40ec9b9a399e12a021cd4d81 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 30 Nov 2020 17:03:19 -0500 Subject: nfsd: Set PF_LOCAL_THROTTLE on local filesystems only Don't set PF_LOCAL_THROTTLE on remote filesystems like NFS, since they aren't expected to ever be subject to double buffering. Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- fs/nfs/export.c | 3 ++- fs/nfsd/vfs.c | 13 +++++++++++-- include/linux/exportfs.h | 1 + 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 5428713af5fe..48b879cfe6e3 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -171,5 +171,6 @@ const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, - .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|EXPORT_OP_CLOSE_BEFORE_UNLINK, + .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| + EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS, }; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 79cba942087e..04937e51de56 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -978,18 +978,25 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, __be32 *verf) { struct file *file = nf->nf_file; + struct super_block *sb = file_inode(file)->i_sb; struct svc_export *exp; struct iov_iter iter; __be32 nfserr; int host_err; int use_wgather; loff_t pos = offset; + unsigned long exp_op_flags = 0; unsigned int pflags = current->flags; rwf_t flags = 0; + bool restore_flags = false; trace_nfsd_write_opened(rqstp, fhp, offset, *cnt); - if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + if (sb->s_export_op) + exp_op_flags = sb->s_export_op->flags; + + if (test_bit(RQ_LOCAL, &rqstp->rq_flags) && + !(exp_op_flags & EXPORT_OP_REMOTE_FS)) { /* * We want throttling in balance_dirty_pages() * and shrink_inactive_list() to only consider @@ -998,6 +1005,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, * the client's dirty pages or its congested queue. 
*/ current->flags |= PF_LOCAL_THROTTLE; + restore_flags = true; + } exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); @@ -1049,7 +1058,7 @@ out_nfserr: trace_nfsd_write_err(rqstp, fhp, offset, host_err); nfserr = nfserrno(host_err); } - if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + if (restore_flags) current_restore_flags(pflags, PF_LOCAL_THROTTLE); return nfserr; } diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 846df3c96730..d93e8a6737bb 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -216,6 +216,7 @@ struct export_operations { #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ +#define EXPORT_OP_REMOTE_FS (0x8) /* Filesystem is remote */ unsigned long flags; }; -- cgit From 716a8bc7f706eeef80ab42c99d9f210eda845c81 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 30 Nov 2020 23:14:27 -0500 Subject: nfsd: Record NFSv4 pre/post-op attributes as non-atomic For the case of NFSv4, specify to the client that the pre/post-op attributes were not recorded atomically with the main operation. Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- fs/nfs/export.c | 3 ++- fs/nfsd/nfsfh.c | 4 ++++ fs/nfsd/nfsfh.h | 5 +++++ fs/nfsd/xdr4.h | 2 +- include/linux/exportfs.h | 3 +++ 5 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 48b879cfe6e3..7412bb164fa7 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -172,5 +172,6 @@ const struct export_operations nfs_export_ops = { .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| - EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS, + EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| + EXPORT_OP_NOATOMIC_ATTR, }; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index e80a7525561d..66f2ef67792a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -301,6 +301,10 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) fhp->fh_export = exp; switch (rqstp->rq_vers) { + case 4: + if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR) + fhp->fh_no_atomic_attr = true; + break; case 3: if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOWCC) fhp->fh_no_wcc = true; diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 347d10aa6265..cb20c2cd3469 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -36,6 +36,11 @@ typedef struct svc_fh { bool fh_locked; /* inode locked by us */ bool fh_want_write; /* remount protection taken */ bool fh_no_wcc; /* no wcc data needed */ + bool fh_no_atomic_attr; + /* + * wcc data is not atomic with + * operation + */ int fh_flags; /* FH flags */ #ifdef CONFIG_NFSD_V3 bool fh_post_saved; /* post-op attrs saved */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index b4556e86e97c..a60ff5ce1a37 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -748,7 +748,7 @@ static inline void set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { BUG_ON(!fhp->fh_pre_saved); - cinfo->atomic = (u32)fhp->fh_post_saved; + cinfo->atomic = (u32)(fhp->fh_post_saved && !fhp->fh_no_atomic_attr); cinfo->before_change = fhp->fh_pre_change; cinfo->after_change = fhp->fh_post_change; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index d93e8a6737bb..9f4d4bcbf251 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h 
@@ -217,6 +217,9 @@ struct export_operations { #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ #define EXPORT_OP_REMOTE_FS (0x8) /* Filesystem is remote */ +#define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply + atomic attribute updates + */ unsigned long flags; }; -- cgit From 0f9368b5bf6db0c04afc5454b1be79022a681615 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:10:32 -0600 Subject: rwsem: Implement down_read_killable_nested In preparation for converting exec_update_mutex to a rwsem so that multiple readers can execute in parallel and not deadlock, add down_read_killable_nested. This is needed so that kcmp_lock can be converted from working on mutexes to working on rw_semaphores. Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/87o8jabqh3.fsf@x220.int.ebiederm.org --- include/linux/rwsem.h | 2 ++ kernel/locking/rwsem.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 25e3fde85617..13021b08b2ed 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -171,6 +171,7 @@ extern void downgrade_write(struct rw_semaphore *sem); * See Documentation/locking/lockdep-design.rst for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); +extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); @@ -191,6 +192,7 @@ extern void down_read_non_owner(struct rw_semaphore *sem); extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) +# define down_read_killable_nested(sem, subclass) down_read_killable(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) # define down_write_killable_nested(sem, subclass) down_write_killable(sem) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f11b9bd3431d..54d11cb97551 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1605,6 +1605,20 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) } EXPORT_SYMBOL(down_read_nested); +int down_read_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_killable_nested); + void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); -- cgit From 31784cff7ee073b34d6eddabb95e3be2880a425c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:11:13 -0600 Subject: rwsem: Implement down_read_interruptible In preparation for converting exec_update_mutex to a rwsem so that multiple readers can execute in parallel and not deadlock, add down_read_interruptible. This is needed for perf_event_open to be converted (with no semantic changes) from working on a mutex to working on a rwsem. Signed-off-by: Eric W.
Biederman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/87k0tybqfy.fsf@x220.int.ebiederm.org --- include/linux/rwsem.h | 1 + kernel/locking/rwsem.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 13021b08b2ed..4c715be48717 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -123,6 +123,7 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) * lock for reading */ extern void down_read(struct rw_semaphore *sem); +extern int __must_check down_read_interruptible(struct rw_semaphore *sem); extern int __must_check down_read_killable(struct rw_semaphore *sem); /* diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 54d11cb97551..a163542d178e 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1345,6 +1345,18 @@ static inline void __down_read(struct rw_semaphore *sem) } } +static inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + if (!rwsem_read_trylock(sem)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_INTERRUPTIBLE))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + } else { + rwsem_set_reader_owned(sem); + } + return 0; +} + static inline int __down_read_killable(struct rw_semaphore *sem) { if (!rwsem_read_trylock(sem)) { @@ -1495,6 +1507,20 @@ void __sched down_read(struct rw_semaphore *sem) } EXPORT_SYMBOL(down_read); +int __sched down_read_interruptible(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_interruptible); + int __sched down_read_killable(struct rw_semaphore *sem) { might_sleep(); -- cgit From 66bcfcdf89d00f2409f4b5da0f8c20c08318dc72 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sun, 6 Dec 2020 17:21:42 +0100 Subject: seqlock: Prefix internal seqcount_t-only macros with a "do_" When the seqcount_LOCKNAME_t group of data types were introduced, two classes of seqlock.h sequence counter macros were added: - An external public API which can either take a plain seqcount_t or any of the seqcount_LOCKNAME_t variants. - An internal API which takes only a plain seqcount_t. To distinguish between the two groups, the "*_seqcount_t_*" pattern was used for the latter. This confused a number of mm/ call-site developers, and Linus also commented that it was not a standard practice for marking seqlock.h internal APIs. Distinguish the latter group of macros by prefixing a "do_". Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/CAHk-=wikhGExmprXgaW+MVXG1zsGpztBbVwOb23vetk41EtTBQ@mail.gmail.com --- include/linux/seqlock.h | 66 ++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index d89134c74fba..235cbc65fd71 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -425,9 +425,9 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ - __read_seqcount_t_retry(seqprop_ptr(s), start) + do___read_seqcount_retry(seqprop_ptr(s), start) -static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) +static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) { kcsan_atomic_next(0); return unlikely(READ_ONCE(s->sequence) != start); @@ -445,12 +445,12 @@ static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ - read_seqcount_t_retry(seqprop_ptr(s), start) + do_read_seqcount_retry(seqprop_ptr(s), start) -static inline int read_seqcount_t_retry(const seqcount_t *s, unsigned start) +static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); - return __read_seqcount_t_retry(s, start); + return do___read_seqcount_retry(s, start); } /** @@ -462,10 +462,10 @@ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ - raw_write_seqcount_t_begin(seqprop_ptr(s)); \ + do_raw_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) -static inline void raw_write_seqcount_t_begin(seqcount_t *s) +static inline void do_raw_write_seqcount_begin(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; @@ -478,13 +478,13 @@ static inline void raw_write_seqcount_t_begin(seqcount_t *s) */ #define raw_write_seqcount_end(s) \ do { \ - raw_write_seqcount_t_end(seqprop_ptr(s)); \ + do_raw_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) -static inline void raw_write_seqcount_t_end(seqcount_t *s) +static inline void do_raw_write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; @@ -506,12 +506,12 @@ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ - write_seqcount_t_begin_nested(seqprop_ptr(s), subclass); \ + do_write_seqcount_begin_nested(seqprop_ptr(s), subclass); \ } while (0) -static inline void write_seqcount_t_begin_nested(seqcount_t *s, int subclass) +static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass) { - raw_write_seqcount_t_begin(s); + do_raw_write_seqcount_begin(s); seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); } @@ -533,12 +533,12 @@ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ - write_seqcount_t_begin(seqprop_ptr(s)); \ + do_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) -static inline void write_seqcount_t_begin(seqcount_t *s) +static inline void do_write_seqcount_begin(seqcount_t *s) { - write_seqcount_t_begin_nested(s, 0); + do_write_seqcount_begin_nested(s, 0); } /** @@ -549,16 +549,16 @@ static inline void write_seqcount_t_begin(seqcount_t *s) */ #define write_seqcount_end(s) \ do { \ - write_seqcount_t_end(seqprop_ptr(s)); \ + do_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) -static inline void 
write_seqcount_t_end(seqcount_t *s) +static inline void do_write_seqcount_end(seqcount_t *s) { seqcount_release(&s->dep_map, _RET_IP_); - raw_write_seqcount_t_end(s); + do_raw_write_seqcount_end(s); } /** @@ -603,9 +603,9 @@ static inline void write_seqcount_t_end(seqcount_t *s) * } */ #define raw_write_seqcount_barrier(s) \ - raw_write_seqcount_t_barrier(seqprop_ptr(s)) + do_raw_write_seqcount_barrier(seqprop_ptr(s)) -static inline void raw_write_seqcount_t_barrier(seqcount_t *s) +static inline void do_raw_write_seqcount_barrier(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; @@ -623,9 +623,9 @@ static inline void raw_write_seqcount_t_barrier(seqcount_t *s) * will complete successfully and see data older than this. */ #define write_seqcount_invalidate(s) \ - write_seqcount_t_invalidate(seqprop_ptr(s)) + do_write_seqcount_invalidate(seqprop_ptr(s)) -static inline void write_seqcount_t_invalidate(seqcount_t *s) +static inline void do_write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); kcsan_nestable_atomic_begin(); @@ -865,9 +865,9 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) } /* - * For all seqlock_t write side functions, use write_seqcount_*t*_begin() - * instead of the generic write_seqcount_begin(). This way, no redundant - * lockdep_assert_held() checks are added. + * For all seqlock_t write side functions, use the the internal + * do_write_seqcount_begin() instead of generic write_seqcount_begin(). + * This way, no redundant lockdep_assert_held() checks are added. */ /** @@ -886,7 +886,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); - write_seqcount_t_begin(&sl->seqcount.seqcount); + do_write_seqcount_begin(&sl->seqcount.seqcount); } /** @@ -898,7 +898,7 @@ static inline void write_seqlock(seqlock_t *sl) */ static inline void write_sequnlock(seqlock_t *sl) { - write_seqcount_t_end(&sl->seqcount.seqcount); + do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock(&sl->lock); } @@ -912,7 +912,7 @@ static inline void write_sequnlock(seqlock_t *sl) static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); - write_seqcount_t_begin(&sl->seqcount.seqcount); + do_write_seqcount_begin(&sl->seqcount.seqcount); } /** @@ -925,7 +925,7 @@ static inline void write_seqlock_bh(seqlock_t *sl) */ static inline void write_sequnlock_bh(seqlock_t *sl) { - write_seqcount_t_end(&sl->seqcount.seqcount); + do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_bh(&sl->lock); } @@ -939,7 +939,7 @@ static inline void write_sequnlock_bh(seqlock_t *sl) static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); - write_seqcount_t_begin(&sl->seqcount.seqcount); + do_write_seqcount_begin(&sl->seqcount.seqcount); } /** @@ -951,7 +951,7 @@ static inline void write_seqlock_irq(seqlock_t *sl) */ static inline void write_sequnlock_irq(seqlock_t *sl) { - write_seqcount_t_end(&sl->seqcount.seqcount); + do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irq(&sl->lock); } @@ -960,7 +960,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) unsigned long flags; spin_lock_irqsave(&sl->lock, flags); - write_seqcount_t_begin(&sl->seqcount.seqcount); + do_write_seqcount_begin(&sl->seqcount.seqcount); return flags; } @@ -989,7 +989,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { - 
write_seqcount_t_end(&sl->seqcount.seqcount); + do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irqrestore(&sl->lock, flags); } -- cgit From cb262935a166bdef0ccfe6e2adffa00c0f2d038a Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sun, 6 Dec 2020 17:21:43 +0100 Subject: seqlock: kernel-doc: Specify when preemption is automatically altered The kernel-doc annotations for sequence counters write side functions are incomplete: they do not specify when preemption is automatically disabled and re-enabled. This has confused a number of call-site developers. Fix it. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/CAHk-=wikhGExmprXgaW+MVXG1zsGpztBbVwOb23vetk41EtTBQ@mail.gmail.com --- include/linux/seqlock.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 235cbc65fd71..2f7bb92b4c9e 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -456,6 +456,8 @@ static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) /** * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Context: check write_seqcount_begin() */ #define raw_write_seqcount_begin(s) \ do { \ @@ -475,6 +477,8 @@ static inline void do_raw_write_seqcount_begin(seqcount_t *s) /** * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Context: check write_seqcount_end() */ #define raw_write_seqcount_end(s) \ do { \ @@ -498,6 +502,7 @@ static inline void do_raw_write_seqcount_end(seqcount_t *s) * @subclass: lockdep nesting level * * See Documentation/locking/lockdep-design.rst + * Context: check write_seqcount_begin() */ #define write_seqcount_begin_nested(s, subclass) \ do { \ @@ -519,11 +524,10 @@ static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass) * write_seqcount_begin() - start a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * - * write_seqcount_begin opens a write side critical section of the given - * seqcount_t. - * - * Context: seqcount_t write side critical sections must be serialized and - * non-preemptible. If readers can be invoked from hardirq or softirq + * Context: sequence counter write side sections must be serialized and + * non-preemptible. Preemption will be automatically disabled if and + * only if the seqcount write serialization lock is associated, and + * preemptible. If readers can be invoked from hardirq or softirq * context, interrupts or bottom halves must be respectively disabled. */ #define write_seqcount_begin(s) \ @@ -545,7 +549,8 @@ static inline void do_write_seqcount_begin(seqcount_t *s) * write_seqcount_end() - end a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * - * The write section must've been opened with write_seqcount_begin(). + * Context: Preemption will be automatically re-enabled if and only if + * the seqcount write serialization lock is associated, and preemptible. 
*/ #define write_seqcount_end(s) \ do { \ -- cgit From c95d64012ad7de2747923b0caf80e195e940606c Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:16 -0800 Subject: Revert "driver core: Avoid deferred probe due to fw_devlink_pause/resume()" This reverts commit 2451e746478a6a6e981cfa66b62b791ca93b90c8. fw_devlink_pause/resume() was an incomplete attempt at boot time optimization. That's going to get replaced by a much better optimization at the end of the series. Since fw_devlink_pause/resume() is going away, changes made for that can also go away. Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-2-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 21 --------------------- include/linux/device.h | 3 +-- 2 files changed, 1 insertion(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index ba5a3cac6571..58c7d17820a7 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -51,7 +51,6 @@ static DEFINE_MUTEX(wfs_lock); static LIST_HEAD(deferred_sync); static unsigned int defer_sync_state_count = 1; static unsigned int defer_fw_devlink_count; -static LIST_HEAD(deferred_fw_devlink); static DEFINE_MUTEX(defer_fw_devlink_lock); static bool fw_devlink_is_permissive(void); @@ -1468,12 +1467,6 @@ static void fw_devlink_link_device(struct device *dev) fw_ret = -EAGAIN; } else { fw_ret = -ENODEV; - /* - * defer_hook is not used to add device to deferred_sync list - * until device is bound. Since deferred fw devlink also blocks - * probing, same list hook can be used for deferred_fw_devlink. - */ - list_add_tail(&dev->links.defer_hook, &deferred_fw_devlink); } if (fw_ret == -ENODEV) @@ -1542,9 +1535,6 @@ void fw_devlink_pause(void) */ void fw_devlink_resume(void) { - struct device *dev, *tmp; - LIST_HEAD(probe_list); - mutex_lock(&defer_fw_devlink_lock); if (!defer_fw_devlink_count) { WARN(true, "Unmatched fw_devlink pause/resume!"); @@ -1556,19 +1546,8 @@ void fw_devlink_resume(void) goto out; device_link_add_missing_supplier_links(); - list_splice_tail_init(&deferred_fw_devlink, &probe_list); out: mutex_unlock(&defer_fw_devlink_lock); - - /* - * bus_probe_device() can cause new devices to get added and they'll - * try to grab defer_fw_devlink_lock. So, this needs to be done outside - * the defer_fw_devlink_lock. - */ - list_for_each_entry_safe(dev, tmp, &probe_list, links.defer_hook) { - list_del_init(&dev->links.defer_hook); - bus_probe_device(dev); - } } /* Device links support end. */ diff --git a/include/linux/device.h b/include/linux/device.h index 5ed101be7b2e..da00f8e449bb 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -352,8 +352,7 @@ enum dl_dev_state { * @suppliers: List of links to supplier devices. * @consumers: List of links to consumer devices. * @needs_suppliers: Hook to global list of devices waiting for suppliers. - * @defer_hook: Hook to global list of devices that have deferred sync_state or - * deferred fw_devlink. + * @defer_hook: Hook to global list of devices that have deferred sync_state. * @need_for_probe: If needs_suppliers is on a list, this indicates if the * suppliers are needed for probe or not. * @status: Driver status information. -- cgit From 3b052a3e30f2eb92dcae9fd89af48d5a13045737 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:17 -0800 Subject: Revert "driver core: Rename dev_links_info.defer_sync to defer_hook" This reverts commit ec7bd78498f29680f536451fbdf9464e851273ed. 
This field rename was done to reuse the defer_sync list head for multiple lists. That's not needed anymore and this list head will only be used for defer sync. So revert this patch to avoid conflicts with the other reverts coming after this. Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-3-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 22 +++++++++++----------- include/linux/device.h | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 58c7d17820a7..fc4b0bc59935 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -959,11 +959,11 @@ static void __device_links_queue_sync_state(struct device *dev, */ dev->state_synced = true; - if (WARN_ON(!list_empty(&dev->links.defer_hook))) + if (WARN_ON(!list_empty(&dev->links.defer_sync))) return; get_device(dev); - list_add_tail(&dev->links.defer_hook, list); + list_add_tail(&dev->links.defer_sync, list); } /** @@ -981,8 +981,8 @@ static void device_links_flush_sync_list(struct list_head *list, { struct device *dev, *tmp; - list_for_each_entry_safe(dev, tmp, list, links.defer_hook) { - list_del_init(&dev->links.defer_hook); + list_for_each_entry_safe(dev, tmp, list, links.defer_sync) { + list_del_init(&dev->links.defer_sync); if (dev != dont_lock_dev) device_lock(dev); @@ -1020,12 +1020,12 @@ void device_links_supplier_sync_state_resume(void) if (defer_sync_state_count) goto out; - list_for_each_entry_safe(dev, tmp, &deferred_sync, links.defer_hook) { + list_for_each_entry_safe(dev, tmp, &deferred_sync, links.defer_sync) { /* * Delete from deferred_sync list before queuing it to - * sync_list because defer_hook is used for both lists. + * sync_list because defer_sync is used for both lists. */ - list_del_init(&dev->links.defer_hook); + list_del_init(&dev->links.defer_sync); __device_links_queue_sync_state(dev, &sync_list); } out: @@ -1043,8 +1043,8 @@ late_initcall(sync_state_resume_initcall); static void __device_links_supplier_defer_sync(struct device *sup) { - if (list_empty(&sup->links.defer_hook) && dev_has_sync_state(sup)) - list_add_tail(&sup->links.defer_hook, &deferred_sync); + if (list_empty(&sup->links.defer_sync) && dev_has_sync_state(sup)) + list_add_tail(&sup->links.defer_sync, &deferred_sync); } static void device_link_drop_managed(struct device_link *link) @@ -1272,7 +1272,7 @@ void device_links_driver_cleanup(struct device *dev) WRITE_ONCE(link->status, DL_STATE_DORMANT); } - list_del_init(&dev->links.defer_hook); + list_del_init(&dev->links.defer_sync); __device_links_no_driver(dev); device_links_write_unlock(); @@ -2405,7 +2405,7 @@ void device_initialize(struct device *dev) INIT_LIST_HEAD(&dev->links.consumers); INIT_LIST_HEAD(&dev->links.suppliers); INIT_LIST_HEAD(&dev->links.needs_suppliers); - INIT_LIST_HEAD(&dev->links.defer_hook); + INIT_LIST_HEAD(&dev->links.defer_sync); dev->links.status = DL_DEV_NO_DRIVER; } EXPORT_SYMBOL_GPL(device_initialize); diff --git a/include/linux/device.h b/include/linux/device.h index da00f8e449bb..1e771ea4dca6 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -352,7 +352,7 @@ enum dl_dev_state { * @suppliers: List of links to supplier devices. * @consumers: List of links to consumer devices. * @needs_suppliers: Hook to global list of devices waiting for suppliers. - * @defer_hook: Hook to global list of devices that have deferred sync_state.
+ * @defer_sync: Hook to global list of devices that have deferred sync_state. * @need_for_probe: If needs_suppliers is on a list, this indicates if the * suppliers are needed for probe or not. * @status: Driver status information. @@ -361,7 +361,7 @@ struct dev_links_info { struct list_head suppliers; struct list_head consumers; struct list_head needs_suppliers; - struct list_head defer_hook; + struct list_head defer_sync; bool need_for_probe; enum dl_dev_state status; }; -- cgit From c84b90909e475a2eb4934b4d92fdd10e73e75805 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:21 -0800 Subject: Revert "driver core: fw_devlink: Add support for batching fwnode parsing" This reverts commit 716a7a25969003d82ab738179c3f1068a120ed11. The fw_devlink_pause/resume() APIs added by the commit being reverted were a first cut attempt at optimizing boot time. But these APIs don't fully solve the problem and are very fragile (can only be used for the top level devices being added). This series replaces them with a much better optimization that works for all device additions and also has the benefit of reducing the complexity of the firmware (DT, EFI) specific code and abstracting out common code to driver core. Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-7-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 1 - drivers/base/core.c | 116 +++---------------------------------------------- drivers/base/dd.c | 8 ---- include/linux/fwnode.h | 2 - 4 files changed, 7 insertions(+), 120 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index c3562adf4789..91cfb8405abd 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -156,7 +156,6 @@ extern char *make_class_name(const char *name, struct kobject *kobj); extern int devres_release_all(struct device *dev); extern void device_block_probing(void); extern void device_unblock_probing(void); -extern void driver_deferred_probe_force_trigger(void); /* /sys/devices directory */ extern struct kset *devices_kset; diff --git a/drivers/base/core.c b/drivers/base/core.c index b1189697c29b..670aa452bfe7 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -50,9 +50,6 @@ static LIST_HEAD(wait_for_suppliers); static DEFINE_MUTEX(wfs_lock); static LIST_HEAD(deferred_sync); static unsigned int defer_sync_state_count = 1; -static unsigned int defer_fw_devlink_count; -static DEFINE_MUTEX(defer_fw_devlink_lock); -static bool fw_devlink_is_permissive(void); #ifdef CONFIG_SRCU static DEFINE_MUTEX(device_links_lock); @@ -758,7 +755,7 @@ static void device_link_add_missing_supplier_links(void) int ret = fwnode_call_int_op(dev->fwnode, add_links, dev); if (!ret) list_del_init(&dev->links.needs_suppliers); - else if (ret != -ENODEV || fw_devlink_is_permissive()) + else if (ret != -ENODEV) dev->links.need_for_probe = false; } mutex_unlock(&wfs_lock); @@ -1440,116 +1437,17 @@ static void fw_devlink_link_device(struct device *dev) { int fw_ret; - if (!fw_devlink_flags) - return; - - mutex_lock(&defer_fw_devlink_lock); - if (!defer_fw_devlink_count) - device_link_add_missing_supplier_links(); - - /* - * The device's fwnode not having add_links() doesn't affect if other - * consumers can find this device as a supplier. So, this check is - * intentionally placed after device_link_add_missing_supplier_links(). 
- */ - if (!fwnode_has_op(dev->fwnode, add_links)) - goto out; + device_link_add_missing_supplier_links(); - /* - * If fw_devlink is being deferred, assume all devices have mandatory - * suppliers they need to link to later. Then, when the fw_devlink is - * resumed, all these devices will get a chance to try and link to any - * suppliers they have. - */ - if (!defer_fw_devlink_count) { + if (fw_devlink_flags && fwnode_has_op(dev->fwnode, add_links)) { fw_ret = fwnode_call_int_op(dev->fwnode, add_links, dev); - if (fw_ret == -ENODEV && fw_devlink_is_permissive()) - fw_ret = -EAGAIN; - } else { - fw_ret = -ENODEV; + if (fw_ret == -ENODEV && !fw_devlink_is_permissive()) + device_link_wait_for_mandatory_supplier(dev); + else if (fw_ret) + device_link_wait_for_optional_supplier(dev); } - - if (fw_ret == -ENODEV) - device_link_wait_for_mandatory_supplier(dev); - else if (fw_ret) - device_link_wait_for_optional_supplier(dev); - -out: - mutex_unlock(&defer_fw_devlink_lock); } -/** - * fw_devlink_pause - Pause parsing of fwnode to create device links - * - * Calling this function defers any fwnode parsing to create device links until - * fw_devlink_resume() is called. Both these functions are ref counted and the - * caller needs to match the calls. - * - * While fw_devlink is paused: - * - Any device that is added won't have its fwnode parsed to create device - * links. - * - The probe of the device will also be deferred during this period. - * - Any devices that were already added, but waiting for suppliers won't be - * able to link to newly added devices. - * - * Once fw_devlink_resume(): - * - All the fwnodes that was not parsed will be parsed. - * - All the devices that were deferred probing will be reattempted if they - * aren't waiting for any more suppliers. - * - * This pair of functions, is mainly meant to optimize the parsing of fwnodes - * when a lot of devices that need to link to each other are added in a short - * interval of time. For example, adding all the top level devices in a system. - * - * For example, if N devices are added and: - * - All the consumers are added before their suppliers - * - All the suppliers of the N devices are part of the N devices - * - * Then: - * - * - With the use of fw_devlink_pause() and fw_devlink_resume(), each device - * will only need one parsing of its fwnode because it is guaranteed to find - * all the supplier devices already registered and ready to link to. It won't - * have to do another pass later to find one or more suppliers it couldn't - * find in the first parse of the fwnode. So, we'll only need O(N) fwnode - * parses. - * - * - Without the use of fw_devlink_pause() and fw_devlink_resume(), we would - * end up doing O(N^2) parses of fwnodes because every device that's added is - * guaranteed to trigger a parse of the fwnode of every device added before - * it. This O(N^2) parse is made worse by the fact that when a fwnode of a - * device is parsed, all it descendant devices might need to have their - * fwnodes parsed too (even if the devices themselves aren't added). - */ -void fw_devlink_pause(void) -{ - mutex_lock(&defer_fw_devlink_lock); - defer_fw_devlink_count++; - mutex_unlock(&defer_fw_devlink_lock); -} - -/** fw_devlink_resume - Resume parsing of fwnode to create device links - * - * This function is used in conjunction with fw_devlink_pause() and is ref - * counted. See documentation for fw_devlink_pause() for more details. 
- */ -void fw_devlink_resume(void) -{ - mutex_lock(&defer_fw_devlink_lock); - if (!defer_fw_devlink_count) { - WARN(true, "Unmatched fw_devlink pause/resume!"); - goto out; - } - - defer_fw_devlink_count--; - if (defer_fw_devlink_count) - goto out; - - device_link_add_missing_supplier_links(); - driver_deferred_probe_force_trigger(); -out: - mutex_unlock(&defer_fw_devlink_lock); -} /* Device links support end. */ int (*platform_notify)(struct device *dev) = NULL; diff --git a/drivers/base/dd.c b/drivers/base/dd.c index b4be35fa7fda..148e81969e04 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -163,14 +163,6 @@ static bool driver_deferred_probe_enable = false; * again. */ static void driver_deferred_probe_trigger(void) -{ - if (!driver_deferred_probe_enable) - return; - - driver_deferred_probe_force_trigger(); -} - -void driver_deferred_probe_force_trigger(void) { if (!driver_deferred_probe_enable) return; diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 9506f8ec0974..e0abafbb17f8 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -171,7 +171,5 @@ struct fwnode_operations { #define get_dev_from_fwnode(fwnode) get_device((fwnode)->dev) extern u32 fw_devlink_get_flags(void); -void fw_devlink_pause(void); -void fw_devlink_resume(void); #endif -- cgit From 01bb86b380a306bd937c96da36f66429f3362137 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:22 -0800 Subject: driver core: Add fwnode_init() There are multiple locations in the kernel where a struct fwnode_handle is initialized. Add fwnode_init() so that we have one way of initializing a fwnode_handle. Acked-by: Rob Herring Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-8-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/property.c | 2 +- drivers/acpi/scan.c | 2 +- drivers/base/swnode.c | 2 +- drivers/firmware/efi/efi-init.c | 8 ++++---- include/linux/fwnode.h | 6 ++++++ include/linux/of.h | 2 +- kernel/irq/irqdomain.c | 2 +- 7 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index d04de10a63e4..24e87b630573 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -76,7 +76,7 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc, return false; dn->name = link->package.elements[0].string.pointer; - dn->fwnode.ops = &acpi_data_fwnode_ops; + fwnode_init(&dn->fwnode, &acpi_data_fwnode_ops); dn->parent = parent; INIT_LIST_HEAD(&dn->data.properties); INIT_LIST_HEAD(&dn->data.subnodes); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index a896e5e87c93..0ac19f9274b8 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1589,7 +1589,7 @@ void acpi_init_device_object(struct acpi_device *device, acpi_handle handle, device->device_type = type; device->handle = handle; device->parent = acpi_bus_get_parent(handle); - device->fwnode.ops = &acpi_device_fwnode_ops; + fwnode_init(&device->fwnode, &acpi_device_fwnode_ops); acpi_set_device_status(device, sta); acpi_device_get_busid(device); acpi_set_pnp_ids(handle, &device->pnp, type); diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 010828fc785b..4a4b2008fbc2 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -653,7 +653,7 @@ swnode_register(const struct software_node *node, struct swnode *parent, swnode->parent = parent; swnode->allocated = allocated; swnode->kobj.kset = swnode_kset; - swnode->fwnode.ops = 
&software_node_ops; + fwnode_init(&swnode->fwnode, &software_node_ops); ida_init(&swnode->child_ids); INIT_LIST_HEAD(&swnode->entry); diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c index f55a92ff12c0..b148f1459fb3 100644 --- a/drivers/firmware/efi/efi-init.c +++ b/drivers/firmware/efi/efi-init.c @@ -359,9 +359,7 @@ static const struct fwnode_operations efifb_fwnode_ops = { .add_links = efifb_add_links, }; -static struct fwnode_handle efifb_fwnode = { - .ops = &efifb_fwnode_ops, -}; +static struct fwnode_handle efifb_fwnode; static int __init register_gop_device(void) { @@ -375,8 +373,10 @@ static int __init register_gop_device(void) if (!pd) return -ENOMEM; - if (IS_ENABLED(CONFIG_PCI)) + if (IS_ENABLED(CONFIG_PCI)) { + fwnode_init(&efifb_fwnode, &efifb_fwnode_ops); pd->dev.fwnode = &efifb_fwnode; + } err = platform_device_add_data(pd, &screen_info, sizeof(screen_info)); if (err) diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index e0abafbb17f8..5589799708b5 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -170,6 +170,12 @@ struct fwnode_operations { } while (false) #define get_dev_from_fwnode(fwnode) get_device((fwnode)->dev) +static inline void fwnode_init(struct fwnode_handle *fwnode, + const struct fwnode_operations *ops) +{ + fwnode->ops = ops; +} + extern u32 fw_devlink_get_flags(void); #endif diff --git a/include/linux/of.h b/include/linux/of.h index af655d264f10..df71a3a1bb8d 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -108,7 +108,7 @@ static inline void of_node_init(struct device_node *node) #if defined(CONFIG_OF_KOBJ) kobject_init(&node->kobj, &of_node_ktype); #endif - node->fwnode.ops = &of_fwnode_ops; + fwnode_init(&node->fwnode, &of_fwnode_ops); } #if defined(CONFIG_OF_KOBJ) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf8b374b892d..06fce7e39033 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -91,7 +91,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, fwid->type = type; fwid->name = n; fwid->pa = pa; - fwid->fwnode.ops = &irqchip_fwnode_ops; + fwnode_init(&fwid->fwnode, &irqchip_fwnode_ops); return &fwid->fwnode; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); -- cgit From 7b337cb3ebde384cba7405b61dfb84200bf623bf Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:23 -0800 Subject: driver core: Add fwnode link support Add support for creating supplier-consumer links between fwnodes. It is intended for internal use by the driver core and generic firmware support code (e.g. Device Tree, ACPI), so it is simple by design and the API provided is limited. Acked-by: Rob Herring Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-9-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/of/dynamic.c | 1 + include/linux/fwnode.h | 14 ++++++++ 3 files changed, 113 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 670aa452bfe7..972d42dedfc8 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -50,6 +50,104 @@ static LIST_HEAD(wait_for_suppliers); static DEFINE_MUTEX(wfs_lock); static LIST_HEAD(deferred_sync); static unsigned int defer_sync_state_count = 1; +static DEFINE_MUTEX(fwnode_link_lock); + +/** + * fwnode_link_add - Create a link between two fwnode_handles. + * @con: Consumer end of the link. + * @sup: Supplier end of the link.
+ * + * Create a fwnode link between fwnode handles @con and @sup. The fwnode link + * represents the detail that the firmware lists @sup fwnode as supplying a + * resource to @con. + * + * The driver core will use the fwnode link to create a device link between the + * two device objects corresponding to @con and @sup when they are created. The + * driver core will automatically delete the fwnode link between @con and @sup + * after doing that. + * + * Attempts to create duplicate links between the same pair of fwnode handles + * are ignored and there is no reference counting. + */ +int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup) +{ + struct fwnode_link *link; + int ret = 0; + + mutex_lock(&fwnode_link_lock); + + list_for_each_entry(link, &sup->consumers, s_hook) + if (link->consumer == con) + goto out; + + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + ret = -ENOMEM; + goto out; + } + + link->supplier = sup; + INIT_LIST_HEAD(&link->s_hook); + link->consumer = con; + INIT_LIST_HEAD(&link->c_hook); + + list_add(&link->s_hook, &sup->consumers); + list_add(&link->c_hook, &con->suppliers); +out: + mutex_unlock(&fwnode_link_lock); + + return ret; +} + +/** + * fwnode_links_purge_suppliers - Delete all supplier links of fwnode_handle. + * @fwnode: fwnode whose supplier links need to be deleted + * + * Deletes all supplier links connecting directly to @fwnode. + */ +static void fwnode_links_purge_suppliers(struct fwnode_handle *fwnode) +{ + struct fwnode_link *link, *tmp; + + mutex_lock(&fwnode_link_lock); + list_for_each_entry_safe(link, tmp, &fwnode->suppliers, c_hook) { + list_del(&link->s_hook); + list_del(&link->c_hook); + kfree(link); + } + mutex_unlock(&fwnode_link_lock); +} + +/** + * fwnode_links_purge_consumers - Delete all consumer links of fwnode_handle. + * @fwnode: fwnode whose consumer links need to be deleted + * + * Deletes all consumer links connecting directly to @fwnode. + */ +static void fwnode_links_purge_consumers(struct fwnode_handle *fwnode) +{ + struct fwnode_link *link, *tmp; + + mutex_lock(&fwnode_link_lock); + list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook) { + list_del(&link->s_hook); + list_del(&link->c_hook); + kfree(link); + } + mutex_unlock(&fwnode_link_lock); +} + +/** + * fwnode_links_purge - Delete all links connected to a fwnode_handle. + * @fwnode: fwnode whose links needs to be deleted + * + * Deletes all links connecting directly to a fwnode. 
+ */ +void fwnode_links_purge(struct fwnode_handle *fwnode) +{ + fwnode_links_purge_suppliers(fwnode); + fwnode_links_purge_consumers(fwnode); +} #ifdef CONFIG_SRCU static DEFINE_MUTEX(device_links_lock); diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c index fe64430b438a..9a824decf61f 100644 --- a/drivers/of/dynamic.c +++ b/drivers/of/dynamic.c @@ -356,6 +356,7 @@ void of_node_release(struct kobject *kobj) property_list_free(node->properties); property_list_free(node->deadprops); + fwnode_links_purge(of_fwnode_handle(node)); kfree(node->full_name); kfree(node->data); diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 5589799708b5..b88365187347 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -10,6 +10,7 @@ #define _LINUX_FWNODE_H_ #include <linux/types.h> +#include <linux/list.h> struct fwnode_operations; struct device; @@ -18,6 +19,15 @@ struct fwnode_handle { struct fwnode_handle *secondary; const struct fwnode_operations *ops; struct device *dev; + struct list_head suppliers; + struct list_head consumers; +}; + +struct fwnode_link { + struct fwnode_handle *supplier; + struct list_head s_hook; + struct fwnode_handle *consumer; + struct list_head c_hook; }; /** @@ -174,8 +184,12 @@ static inline void fwnode_init(struct fwnode_handle *fwnode, const struct fwnode_operations *ops) { fwnode->ops = ops; + INIT_LIST_HEAD(&fwnode->consumers); + INIT_LIST_HEAD(&fwnode->suppliers); } extern u32 fw_devlink_get_flags(void); +int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup); +void fwnode_links_purge(struct fwnode_handle *fwnode); #endif -- cgit From b5d3e2fbcb10957521af14c4256cd0e5f68b9234 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:25 -0800 Subject: device property: Add fwnode_is_ancestor_of() and fwnode_get_next_parent_dev() Add fwnode_is_ancestor_of() helper function to check if a fwnode is an ancestor of another fwnode. Add fwnode_get_next_parent_dev() helper function that takes as input a fwnode and finds the closest ancestor fwnode that has a corresponding struct device and returns that struct device. Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-11-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/property.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/property.h | 3 +++ 2 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index 4c43d30145c6..35b95c6ac0c6 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -614,6 +614,31 @@ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(fwnode_get_next_parent); +/** + * fwnode_get_next_parent_dev - Find device of closest ancestor fwnode + * @fwnode: firmware node + * + * Given a firmware node (@fwnode), this function finds its closest ancestor + * firmware node that has a corresponding struct device and returns that struct + * device. + * + * The caller of this function is expected to call put_device() on the returned + * device when they are done.
+ */ +struct device *fwnode_get_next_parent_dev(struct fwnode_handle *fwnode) +{ + struct device *dev = NULL; + + fwnode_handle_get(fwnode); + do { + fwnode = fwnode_get_next_parent(fwnode); + if (fwnode) + dev = get_dev_from_fwnode(fwnode); + } while (fwnode && !dev); + fwnode_handle_put(fwnode); + return dev; +} + /** * fwnode_count_parents - Return the number of parents a node has * @fwnode: The node the parents of which are to be counted @@ -660,6 +685,33 @@ struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode, } EXPORT_SYMBOL_GPL(fwnode_get_nth_parent); +/** + * fwnode_is_ancestor_of - Test if @test_ancestor is ancestor of @test_child + * @test_ancestor: Firmware which is tested for being an ancestor + * @test_child: Firmware which is tested for being the child + * + * A node is considered an ancestor of itself too. + * + * Returns true if @test_ancestor is an ancestor of @test_child. + * Otherwise, returns false. + */ +bool fwnode_is_ancestor_of(struct fwnode_handle *test_ancestor, + struct fwnode_handle *test_child) +{ + if (!test_ancestor) + return false; + + fwnode_handle_get(test_child); + while (test_child) { + if (test_child == test_ancestor) { + fwnode_handle_put(test_child); + return true; + } + test_child = fwnode_get_next_parent(test_child); + } + return false; +} + /** * fwnode_get_next_child_node - Return the next child node handle for a node * @fwnode: Firmware node to find the next child node for. diff --git a/include/linux/property.h b/include/linux/property.h index 2d4542629d80..0a9001fe7aea 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -85,9 +85,12 @@ const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_next_parent( struct fwnode_handle *fwnode); +struct device *fwnode_get_next_parent_dev(struct fwnode_handle *fwnode); unsigned int fwnode_count_parents(const struct fwnode_handle *fwn); struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwn, unsigned int depth); +bool fwnode_is_ancestor_of(struct fwnode_handle *test_ancestor, + struct fwnode_handle *test_child); struct fwnode_handle *fwnode_get_next_child_node( const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_available_child_node( -- cgit From 04f63c213b671d89db35f4239f57fa1edeb736a8 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:26 -0800 Subject: driver core: Redefine the meaning of fwnode_operations.add_links() Change the meaning of fwnode_operations.add_links() to just create fwnode links by parsing the properties of a given fwnode. This patch doesn't actually make any code changes. To keep things more digestible, the actual functional changes come in later patches in this series.
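As a rough sketch of the new contract (hypothetical "myfw" backend; myfw_parse_supplier() is a made-up stand-in for firmware-specific property parsing), an add_links() implementation now only records fwnode-level dependencies and leaves device-link creation to the driver core:

	static int myfw_add_links(struct fwnode_handle *fwnode, struct device *dev)
	{
		struct fwnode_handle *sup;

		/* resolve one supplier reference from the node's properties */
		sup = myfw_parse_supplier(fwnode, "clocks");
		if (sup)
			fwnode_link_add(fwnode, sup);	/* @fwnode is the consumer */
		return 0;
	}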
Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-12-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/efi-init.c | 2 +- include/linux/fwnode.h | 42 +++-------------------------------------- 2 files changed, 4 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c index b148f1459fb3..65bb97c391b0 100644 --- a/drivers/firmware/efi/efi-init.c +++ b/drivers/firmware/efi/efi-init.c @@ -316,7 +316,7 @@ static struct device_node *find_pci_overlap_node(void) * resource reservation conflict on the memory window that the efifb * framebuffer steals from the PCIe host bridge. */ -static int efifb_add_links(const struct fwnode_handle *fwnode, +static int efifb_add_links(struct fwnode_handle *fwnode, struct device *dev) { struct device_node *sup_np; diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index b88365187347..942a6bb18201 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -78,44 +78,8 @@ struct fwnode_reference_args { * endpoint node. * @graph_get_port_parent: Return the parent node of a port node. * @graph_parse_endpoint: Parse endpoint for port and endpoint id. - * @add_links: Called after the device corresponding to the fwnode is added - * using device_add(). The function is expected to create device - * links to all the suppliers of the device that are available at - * the time this function is called. The function must NOT stop - * at the first failed device link if other unlinked supplier - * devices are present in the system. This is necessary for the - * driver/bus sync_state() callbacks to work correctly. - * - * For example, say Device-C depends on suppliers Device-S1 and - * Device-S2 and the dependency is listed in that order in the - * firmware. Say, S1 gets populated from the firmware after - * late_initcall_sync(). Say S2 is populated and probed way - * before that in device_initcall(). When C is populated, if this - * add_links() function doesn't continue past a "failed linking to - * S1" and continue linking C to S2, then S2 will get a - * sync_state() callback before C is probed. This is because from - * the perspective of S2, C was never a consumer when its - * sync_state() evaluation is done. To avoid this, the add_links() - * function has to go through all available suppliers of the - * device (that corresponds to this fwnode) and link to them - * before returning. - * - * If some suppliers are not yet available (indicated by an error - * return value), this function will be called again when other - * devices are added to allow creating device links to any newly - * available suppliers. - * - * Return 0 if device links have been successfully created to all - * the known suppliers of this device or if the supplier - * information is not known. - * - * Return -ENODEV if the suppliers needed for probing this device - * have not been registered yet (because device links can only be - * created to devices registered with the driver core). - * - * Return -EAGAIN if some of the suppliers of this device have not - * been registered yet, but none of those suppliers are necessary - * for probing the device. + * @add_links: Create fwnode links to all the suppliers of the fwnode. Return + * zero on success, a negative error code otherwise. 
*/ struct fwnode_operations { struct fwnode_handle *(*get)(struct fwnode_handle *fwnode); @@ -155,7 +119,7 @@ struct fwnode_operations { (*graph_get_port_parent)(struct fwnode_handle *fwnode); int (*graph_parse_endpoint)(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint); - int (*add_links)(const struct fwnode_handle *fwnode, + int (*add_links)(struct fwnode_handle *fwnode, struct device *dev); }; -- cgit From c2c724c868c42c5166bf7aa644dd0a0c8d30b47a Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:27 -0800 Subject: driver core: Add fw_devlink_parse_fwtree() This function is a wrapper around fwnode_operations.add_links(). This function parses each node in a fwnode tree and creates fwnode links for each of those nodes. The information for creating the fwnode links (the supplier and consumer fwnode) is obtained by parsing the properties in each of the fwnodes. This function also ensures that no fwnode is parsed more than once by marking the fwnodes as parsed. Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-13-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 19 +++++++++++++++++++ include/linux/fwnode.h | 8 ++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index edde79fc3d33..92a2dc355d13 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1542,6 +1542,25 @@ static bool fw_devlink_is_permissive(void) return fw_devlink_flags == DL_FLAG_SYNC_STATE_ONLY; } +static void fw_devlink_parse_fwnode(struct fwnode_handle *fwnode) +{ + if (fwnode->flags & FWNODE_FLAG_LINKS_ADDED) + return; + + fwnode_call_int_op(fwnode, add_links, NULL); + fwnode->flags |= FWNODE_FLAG_LINKS_ADDED; +} + +static void fw_devlink_parse_fwtree(struct fwnode_handle *fwnode) +{ + struct fwnode_handle *child = NULL; + + fw_devlink_parse_fwnode(fwnode); + + while ((child = fwnode_get_next_available_child_node(fwnode, child))) + fw_devlink_parse_fwtree(child); +} + static void fw_devlink_link_device(struct device *dev) { int fw_ret; diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 942a6bb18201..ffa9129182a6 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -15,12 +15,20 @@ struct fwnode_operations; struct device; +/* + * fwnode link flags + * + * LINKS_ADDED: The fwnode has already been parsed to add fwnode links. + */ +#define FWNODE_FLAG_LINKS_ADDED BIT(0) + struct fwnode_handle { struct fwnode_handle *secondary; const struct fwnode_operations *ops; struct device *dev; struct list_head suppliers; struct list_head consumers; + u8 flags; }; struct fwnode_link { -- cgit From f9aa460672c9c56896cdc12a521159e3e67000ba Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:31 -0800 Subject: driver core: Refactor fw_devlink feature The current implementation of fw_devlink is very inefficient because it tries to get away without creating fwnode links in the name of saving memory usage. Past attempts to optimize runtime at the cost of memory usage were blocked with requests for data showing that the optimization made significant improvement for real world scenarios. We have those scenarios now. There have been several reports of boot time increase in the order of seconds in this thread [1]. Several OEMs and SoC manufacturers have also privately reported significant (350-400ms) increase in boot time due to all the parsing done by fw_devlink.
So this patch uses all the setup done by the previous patches in this series to refactor fw_devlink to be more efficient. Most of the code has been moved out of firmware specific (DT mostly) code into driver core. This brings the following benefits: - Instead of parsing the device tree multiple times during bootup, fw_devlink parses each fwnode node/property only once and creates fwnode links. The rest of the fw_devlink code then just looks at these fwnode links to do rest of the work. - Makes it much easier to debug probe issue due to fw_devlink in the future. fw_devlink=on blocks the probing of devices if they depend on a device that hasn't been added yet. With this refactor, it'll be very easy to tell what that device is because we now have a reference to the fwnode of the device. - Much easier to add fw_devlink support to ACPI and other firmware types. A refactor to move the common bits from DT specific code to driver core was in my TODO list as a prerequisite to adding ACPI support to fw_devlink. This series gets that done. [1] - https://lore.kernel.org/linux-omap/ea02f57e-871d-cd16-4418-c1da4bbc4696@ti.com/ Tested-by: Laurent Pinchart Tested-by: Grygorii Strashko Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-17-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 325 ++++++++++++++++++++++++++++++++++++------------- include/linux/device.h | 5 - 2 files changed, 238 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 8a973d9601d3..9edf9084fc98 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -46,8 +46,6 @@ early_param("sysfs.deprecated", sysfs_deprecated_setup); #endif /* Device links support. */ -static LIST_HEAD(wait_for_suppliers); -static DEFINE_MUTEX(wfs_lock); static LIST_HEAD(deferred_sync); static unsigned int defer_sync_state_count = 1; static DEFINE_MUTEX(fwnode_link_lock); @@ -803,74 +801,6 @@ out: } EXPORT_SYMBOL_GPL(device_link_add); -/** - * device_link_wait_for_supplier - Add device to wait_for_suppliers list - * @consumer: Consumer device - * - * Marks the @consumer device as waiting for suppliers to become available by - * adding it to the wait_for_suppliers list. The consumer device will never be - * probed until it's removed from the wait_for_suppliers list. - * - * The caller is responsible for adding the links to the supplier devices once - * they are available and removing the @consumer device from the - * wait_for_suppliers list once links to all the suppliers have been created. - * - * This function is NOT meant to be called from the probe function of the - * consumer but rather from code that creates/adds the consumer device. 
- */ -static void device_link_wait_for_supplier(struct device *consumer, - bool need_for_probe) -{ - mutex_lock(&wfs_lock); - list_add_tail(&consumer->links.needs_suppliers, &wait_for_suppliers); - consumer->links.need_for_probe = need_for_probe; - mutex_unlock(&wfs_lock); -} - -static void device_link_wait_for_mandatory_supplier(struct device *consumer) -{ - device_link_wait_for_supplier(consumer, true); -} - -static void device_link_wait_for_optional_supplier(struct device *consumer) -{ - device_link_wait_for_supplier(consumer, false); -} - -/** - * device_link_add_missing_supplier_links - Add links from consumer devices to - * supplier devices, leaving any - * consumer with inactive suppliers on - * the wait_for_suppliers list - * - * Loops through all consumers waiting on suppliers and tries to add all their - * supplier links. If that succeeds, the consumer device is removed from - * wait_for_suppliers list. Otherwise, they are left in the wait_for_suppliers - * list. Devices left on the wait_for_suppliers list will not be probed. - * - * The fwnode add_links callback is expected to return 0 if it has found and - * added all the supplier links for the consumer device. It should return an - * error if it isn't able to do so. - * - * The caller of device_link_wait_for_supplier() is expected to call this once - * it's aware of potential suppliers becoming available. - */ -static void device_link_add_missing_supplier_links(void) -{ - struct device *dev, *tmp; - - mutex_lock(&wfs_lock); - list_for_each_entry_safe(dev, tmp, &wait_for_suppliers, - links.needs_suppliers) { - int ret = fwnode_call_int_op(dev->fwnode, add_links, dev); - if (!ret) - list_del_init(&dev->links.needs_suppliers); - else if (ret != -ENODEV) - dev->links.need_for_probe = false; - } - mutex_unlock(&wfs_lock); -} - #ifdef CONFIG_SRCU static void __device_link_del(struct kref *kref) { @@ -1195,9 +1125,8 @@ void device_links_driver_bound(struct device *dev) * the device links it needs to or make new device links as it needs * them. So, it no longer needs to wait on any suppliers. */ - mutex_lock(&wfs_lock); - list_del_init(&dev->links.needs_suppliers); - mutex_unlock(&wfs_lock); + if (dev->fwnode && dev->fwnode->dev == dev) + fwnode_links_purge_suppliers(dev->fwnode); device_remove_file(dev, &dev_attr_waiting_for_supplier); device_links_write_lock(); @@ -1488,10 +1417,6 @@ static void device_links_purge(struct device *dev) if (dev->class == &devlink_class) return; - mutex_lock(&wfs_lock); - list_del_init(&dev->links.needs_suppliers); - mutex_unlock(&wfs_lock); - /* * Delete all of the remaining links from this device to any other * devices (either consumers or suppliers). @@ -1561,19 +1486,246 @@ static void fw_devlink_parse_fwtree(struct fwnode_handle *fwnode) fw_devlink_parse_fwtree(child); } -static void fw_devlink_link_device(struct device *dev) +/** + * fw_devlink_create_devlink - Create a device link from a consumer to fwnode + * @con - Consumer device for the device link + * @sup_handle - fwnode handle of supplier + * + * This function will try to create a device link between the consumer device + * @con and the supplier device represented by @sup_handle. + * + * The supplier has to be provided as a fwnode because incorrect cycles in + * fwnode links can sometimes cause the supplier device to never be created. + * This function detects such cases and returns an error if it cannot create a + * device link from the consumer to a missing supplier. 
+ * + * Returns, + * 0 on successfully creating a device link + * -EINVAL if the device link cannot be created as expected + * -EAGAIN if the device link cannot be created right now, but it may be + * possible to do that in the future + */ +static int fw_devlink_create_devlink(struct device *con, + struct fwnode_handle *sup_handle, u32 flags) +{ + struct device *sup_dev; + int ret = 0; + + sup_dev = get_dev_from_fwnode(sup_handle); + if (sup_dev) { + /* + * If this fails, it is due to cycles in device links. Just + * give up on this link and treat it as invalid. + */ + if (!device_link_add(con, sup_dev, flags)) + ret = -EINVAL; + + goto out; + } + + /* + * DL_FLAG_SYNC_STATE_ONLY doesn't block probing and supports + * cycles. So cycle detection isn't necessary and shouldn't be + * done. + */ + if (flags & DL_FLAG_SYNC_STATE_ONLY) + return -EAGAIN; + + /* + * If we can't find the supplier device from its fwnode, it might be + * due to a cyclic dependency between fwnodes. Some of these cycles can + * be broken by applying logic. Check for these types of cycles and + * break them so that devices in the cycle probe properly. + * + * If the supplier's parent is dependent on the consumer, then + * the consumer-supplier dependency is a false dependency. So, + * treat it as an invalid link. + */ + sup_dev = fwnode_get_next_parent_dev(sup_handle); + if (sup_dev && device_is_dependent(con, sup_dev)) { + dev_dbg(con, "Not linking to %pfwP - False link\n", + sup_handle); + ret = -EINVAL; + } else { + /* + * Can't check for cycles or no cycles. So let's try + * again later. + */ + ret = -EAGAIN; + } + +out: + put_device(sup_dev); + return ret; +} + +/** + * __fw_devlink_link_to_consumers - Create device links to consumers of a device + * @dev - Device that needs to be linked to its consumers + * + * This function looks at all the consumer fwnodes of @dev and creates device + * links between the consumer device and @dev (supplier). + * + * If the consumer device has not been added yet, then this function creates a + * SYNC_STATE_ONLY link between @dev (supplier) and the closest ancestor device + * of the consumer fwnode. This is necessary to make sure @dev doesn't get a + * sync_state() callback before the real consumer device gets to be added and + * then probed. + * + * Once device links are created from the real consumer to @dev (supplier), the + * fwnode links are deleted. + */ +static void __fw_devlink_link_to_consumers(struct device *dev) +{ + struct fwnode_handle *fwnode = dev->fwnode; + struct fwnode_link *link, *tmp; + + list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook) { + u32 dl_flags = fw_devlink_get_flags(); + struct device *con_dev; + bool own_link = true; + int ret; + + con_dev = get_dev_from_fwnode(link->consumer); + /* + * If consumer device is not available yet, make a "proxy" + * SYNC_STATE_ONLY link from the consumer's parent device to + * the supplier device. This is necessary to make sure the + * supplier doesn't get a sync_state() callback before the real + * consumer can create a device link to the supplier. + * + * This proxy link step is needed to handle the case where the + * consumer's parent device is added before the supplier. + */ + if (!con_dev) { + con_dev = fwnode_get_next_parent_dev(link->consumer); + /* + * However, if the consumer's parent device is also the + * parent of the supplier, don't create a + * consumer-supplier link from the parent to its child + * device. Such a dependency is impossible. 
+ */ + if (con_dev && + fwnode_is_ancestor_of(con_dev->fwnode, fwnode)) { + put_device(con_dev); + con_dev = NULL; + } else { + own_link = false; + dl_flags = DL_FLAG_SYNC_STATE_ONLY; + } + } + + if (!con_dev) + continue; + + ret = fw_devlink_create_devlink(con_dev, fwnode, dl_flags); + put_device(con_dev); + if (!own_link || ret == -EAGAIN) + continue; + + list_del(&link->s_hook); + list_del(&link->c_hook); + kfree(link); + } +} + +/** + * __fw_devlink_link_to_suppliers - Create device links to suppliers of a device + * @dev - The consumer device that needs to be linked to its suppliers + * @fwnode - Root of the fwnode tree that is used to create device links + * + * This function looks at all the supplier fwnodes of the fwnode tree rooted at + * @fwnode and creates device links between @dev (consumer) and all the + * supplier devices of the entire fwnode tree at @fwnode. + * + * The function creates normal (non-SYNC_STATE_ONLY) device links between @dev + * and the real suppliers of @dev. Once these device links are created, the + * fwnode links are deleted. When such device links are successfully created, + * this function is called recursively on those supplier devices. This is + * needed to detect and break some invalid cycles in fwnode links. See + * fw_devlink_create_devlink() for more details. + * + * In addition, it also looks at all the suppliers of the entire fwnode tree + * because some of the child devices of @dev that have not been added yet + * (because @dev hasn't probed) might already have their suppliers added to + * driver core. So, this function creates SYNC_STATE_ONLY device links between + * @dev (consumer) and these suppliers to make sure they don't execute their + * sync_state() callbacks before these child devices have a chance to create + * their device links. The fwnode links that correspond to the child devices + * aren't deleted because they are needed later to create the device links + * between the real consumer and supplier devices. + */ +static void __fw_devlink_link_to_suppliers(struct device *dev, + struct fwnode_handle *fwnode) { - int fw_ret; + bool own_link = (dev->fwnode == fwnode); + struct fwnode_link *link, *tmp; + struct fwnode_handle *child = NULL; + u32 dl_flags; + + if (own_link) + dl_flags = fw_devlink_get_flags(); + else + dl_flags = DL_FLAG_SYNC_STATE_ONLY; - device_link_add_missing_supplier_links(); + list_for_each_entry_safe(link, tmp, &fwnode->suppliers, c_hook) { + int ret; + struct device *sup_dev; + struct fwnode_handle *sup = link->supplier; + + ret = fw_devlink_create_devlink(dev, sup, dl_flags); + if (!own_link || ret == -EAGAIN) + continue; - if (fw_devlink_flags && fwnode_has_op(dev->fwnode, add_links)) { - fw_ret = fwnode_call_int_op(dev->fwnode, add_links, dev); - if (fw_ret == -ENODEV && !fw_devlink_is_permissive()) - device_link_wait_for_mandatory_supplier(dev); - else if (fw_ret) - device_link_wait_for_optional_supplier(dev); + list_del(&link->s_hook); + list_del(&link->c_hook); + kfree(link); + + /* If no device link was created, nothing more to do. */ + if (ret) + continue; + + /* + * If a device link was successfully created to a supplier, we + * now need to try and link the supplier to all its suppliers. + * + * This is needed to detect and delete false dependencies in + * fwnode links that haven't been converted to a device link + * yet. See comments in fw_devlink_create_devlink() for more + * details on the false dependency.
+ * + * Without deleting these false dependencies, some devices will + * never probe because they'll keep waiting for their false + * dependency fwnode links to be converted to device links. + */ + sup_dev = get_dev_from_fwnode(sup); + __fw_devlink_link_to_suppliers(sup_dev, sup_dev->fwnode); + put_device(sup_dev); } + + /* + * Make "proxy" SYNC_STATE_ONLY device links to represent the needs of + * all the descendants. This proxy link step is needed to handle the + * case where the supplier is added before the consumer's parent device + * (@dev). + */ + while ((child = fwnode_get_next_available_child_node(fwnode, child))) + __fw_devlink_link_to_suppliers(dev, child); +} + +static void fw_devlink_link_device(struct device *dev) +{ + struct fwnode_handle *fwnode = dev->fwnode; + + if (!fw_devlink_flags) + return; + + fw_devlink_parse_fwtree(fwnode); + + mutex_lock(&fwnode_link_lock); + __fw_devlink_link_to_consumers(dev); + __fw_devlink_link_to_suppliers(dev, fwnode); + mutex_unlock(&fwnode_link_lock); } /* Device links support end. */ @@ -2431,7 +2583,6 @@ void device_initialize(struct device *dev) #endif INIT_LIST_HEAD(&dev->links.consumers); INIT_LIST_HEAD(&dev->links.suppliers); - INIT_LIST_HEAD(&dev->links.needs_suppliers); INIT_LIST_HEAD(&dev->links.defer_sync); dev->links.status = DL_DEV_NO_DRIVER; } diff --git a/include/linux/device.h b/include/linux/device.h index 1e771ea4dca6..89bb8b84173e 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -351,18 +351,13 @@ enum dl_dev_state { * struct dev_links_info - Device data related to device links. * @suppliers: List of links to supplier devices. * @consumers: List of links to consumer devices. - * @needs_suppliers: Hook to global list of devices waiting for suppliers. * @defer_sync: Hook to global list of devices that have deferred sync_state. - * @need_for_probe: If needs_suppliers is on a list, this indicates if the - * suppliers are needed for probe or not. * @status: Driver status information. */ struct dev_links_info { struct list_head suppliers; struct list_head consumers; - struct list_head needs_suppliers; struct list_head defer_sync; - bool need_for_probe; enum dl_dev_state status; }; -- cgit From 2d09e6eb4a6f20273959f4905ccf009da8c64c7a Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:32 -0800 Subject: driver core: Delete pointless parameter in fwnode_operations.add_links The struct device input to add_links() is not used for anything. So delete it. 
Acked-by: Rob Herring Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-18-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 2 +- drivers/firmware/efi/efi-init.c | 3 +-- drivers/of/property.c | 3 +-- include/linux/fwnode.h | 3 +-- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 9edf9084fc98..63edb8bd9d7d 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1472,7 +1472,7 @@ static void fw_devlink_parse_fwnode(struct fwnode_handle *fwnode) if (fwnode->flags & FWNODE_FLAG_LINKS_ADDED) return; - fwnode_call_int_op(fwnode, add_links, NULL); + fwnode_call_int_op(fwnode, add_links); fwnode->flags |= FWNODE_FLAG_LINKS_ADDED; } diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c index c0c3d4c3837a..a552a08a1741 100644 --- a/drivers/firmware/efi/efi-init.c +++ b/drivers/firmware/efi/efi-init.c @@ -316,8 +316,7 @@ static struct device_node *find_pci_overlap_node(void) * resource reservation conflict on the memory window that the efifb * framebuffer steals from the PCIe host bridge. */ -static int efifb_add_links(struct fwnode_handle *fwnode, - struct device *dev) +static int efifb_add_links(struct fwnode_handle *fwnode) { struct device_node *sup_np; diff --git a/drivers/of/property.c b/drivers/of/property.c index 620d29fdace8..5f9eed79a8aa 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -1343,8 +1343,7 @@ static int of_link_property(struct device_node *con_np, const char *prop_name) return ret; } -static int of_fwnode_add_links(struct fwnode_handle *fwnode, - struct device *dev) +static int of_fwnode_add_links(struct fwnode_handle *fwnode) { struct property *p; struct device_node *con_np = to_of_node(fwnode); diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index ffa9129182a6..fde4ad97564c 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -127,8 +127,7 @@ struct fwnode_operations { (*graph_get_port_parent)(struct fwnode_handle *fwnode); int (*graph_parse_endpoint)(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint); - int (*add_links)(struct fwnode_handle *fwnode, - struct device *dev); + int (*add_links)(struct fwnode_handle *fwnode); }; #define fwnode_has_op(fwnode, op) \ -- cgit From 30b79eb1f92ed5974885d374a4107c94e2dd3e03 Mon Sep 17 00:00:00 2001 From: Michael Tretter Date: Mon, 9 Nov 2020 14:48:17 +0100 Subject: soc: xilinx: vcu: use vcu-settings syscon registers Switch the "logicoreip" registers to the new xlnx,vcu-settings binding to be able to read the settings if the settings are specified in a separate device tree node that is shared with other drivers. If the driver is not able to find a node with the new binding, fall back to check for the logicore register bank to be backwards compatible. 
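Since the settings now live behind a syscon, any other driver can share the same register block instead of remapping it. A minimal sketch of a hypothetical second consumer, assuming the xlnx,vcu-settings node is present and using the VCU_CORE_CLK offset from the new header:

#include <linux/err.h>
#include <linux/mfd/syscon.h>
#include <linux/mfd/syscon/xlnx-vcu.h>
#include <linux/regmap.h>

/* Hedged sketch, not part of the patch: read the core clock setting
 * (stored in MHz) through the shared vcu-settings regmap. */
static int example_read_core_clk(u32 *mhz)
{
	struct regmap *map;

	map = syscon_regmap_lookup_by_compatible("xlnx,vcu-settings");
	if (IS_ERR(map))
		return PTR_ERR(map);

	return regmap_read(map, VCU_CORE_CLK, mhz);
}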
Signed-off-by: Michael Tretter Reviewed-by: Hyun Kwon Link: https://lore.kernel.org/r/20201109134818.4159342-4-m.tretter@pengutronix.de Signed-off-by: Michal Simek --- drivers/soc/xilinx/Kconfig | 1 + drivers/soc/xilinx/xlnx_vcu.c | 94 ++++++++++++++++++------------------- include/linux/mfd/syscon/xlnx-vcu.h | 38 +++++++++++++++ 3 files changed, 86 insertions(+), 47 deletions(-) create mode 100644 include/linux/mfd/syscon/xlnx-vcu.h (limited to 'include/linux') diff --git a/drivers/soc/xilinx/Kconfig b/drivers/soc/xilinx/Kconfig index 646512d7276f..0b1708dae361 100644 --- a/drivers/soc/xilinx/Kconfig +++ b/drivers/soc/xilinx/Kconfig @@ -4,6 +4,7 @@ menu "Xilinx SoC drivers" config XILINX_VCU tristate "Xilinx VCU logicoreIP Init" depends on HAS_IOMEM + select REGMAP_MMIO help Provides the driver to enable and disable the isolation between the processing system and programmable logic part by using the logicoreIP diff --git a/drivers/soc/xilinx/xlnx_vcu.c b/drivers/soc/xilinx/xlnx_vcu.c index dcd8e7824b06..14daad4efc58 100644 --- a/drivers/soc/xilinx/xlnx_vcu.c +++ b/drivers/soc/xilinx/xlnx_vcu.c @@ -10,39 +10,12 @@ #include #include #include +#include +#include #include #include #include - -/* Address map for different registers implemented in the VCU LogiCORE IP. */ -#define VCU_ECODER_ENABLE 0x00 -#define VCU_DECODER_ENABLE 0x04 -#define VCU_MEMORY_DEPTH 0x08 -#define VCU_ENC_COLOR_DEPTH 0x0c -#define VCU_ENC_VERTICAL_RANGE 0x10 -#define VCU_ENC_FRAME_SIZE_X 0x14 -#define VCU_ENC_FRAME_SIZE_Y 0x18 -#define VCU_ENC_COLOR_FORMAT 0x1c -#define VCU_ENC_FPS 0x20 -#define VCU_MCU_CLK 0x24 -#define VCU_CORE_CLK 0x28 -#define VCU_PLL_BYPASS 0x2c -#define VCU_ENC_CLK 0x30 -#define VCU_PLL_CLK 0x34 -#define VCU_ENC_VIDEO_STANDARD 0x38 -#define VCU_STATUS 0x3c -#define VCU_AXI_ENC_CLK 0x40 -#define VCU_AXI_DEC_CLK 0x44 -#define VCU_AXI_MCU_CLK 0x48 -#define VCU_DEC_VIDEO_STANDARD 0x4c -#define VCU_DEC_FRAME_SIZE_X 0x50 -#define VCU_DEC_FRAME_SIZE_Y 0x54 -#define VCU_DEC_FPS 0x58 -#define VCU_BUFFER_B_FRAME 0x5c -#define VCU_WPP_EN 0x60 -#define VCU_PLL_CLK_DEC 0x64 -#define VCU_GASKET_INIT 0x74 -#define VCU_GASKET_VALUE 0x03 +#include /* vcu slcr registers, bitmask and shift */ #define VCU_PLL_CTRL 0x24 @@ -106,11 +79,20 @@ struct xvcu_device { struct device *dev; struct clk *pll_ref; struct clk *aclk; - void __iomem *logicore_reg_ba; + struct regmap *logicore_reg_ba; void __iomem *vcu_slcr_ba; u32 coreclk; }; +static struct regmap_config vcu_settings_regmap_config = { + .name = "regmap", + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .max_register = 0xfff, + .cache_type = REGCACHE_NONE, +}; + /** * struct xvcu_pll_cfg - Helper data * @fbdiv: The integer portion of the feedback divider to the PLL @@ -300,10 +282,12 @@ static int xvcu_set_vcu_pll_info(struct xvcu_device *xvcu) int ret, i; const struct xvcu_pll_cfg *found = NULL; - inte = xvcu_read(xvcu->logicore_reg_ba, VCU_PLL_CLK); - deci = xvcu_read(xvcu->logicore_reg_ba, VCU_PLL_CLK_DEC); - coreclk = xvcu_read(xvcu->logicore_reg_ba, VCU_CORE_CLK) * MHZ; - mcuclk = xvcu_read(xvcu->logicore_reg_ba, VCU_MCU_CLK) * MHZ; + regmap_read(xvcu->logicore_reg_ba, VCU_PLL_CLK, &inte); + regmap_read(xvcu->logicore_reg_ba, VCU_PLL_CLK_DEC, &deci); + regmap_read(xvcu->logicore_reg_ba, VCU_CORE_CLK, &coreclk); + coreclk *= MHZ; + regmap_read(xvcu->logicore_reg_ba, VCU_MCU_CLK, &mcuclk); + mcuclk *= MHZ; if (!mcuclk || !coreclk) { dev_err(xvcu->dev, "Invalid mcu and core clock data\n"); return -EINVAL; @@ -498,6 +482,7 @@ static int xvcu_probe(struct 
platform_device *pdev) { struct resource *res; struct xvcu_device *xvcu; + void __iomem *regs; int ret; xvcu = devm_kzalloc(&pdev->dev, sizeof(*xvcu), GFP_KERNEL); @@ -518,17 +503,32 @@ static int xvcu_probe(struct platform_device *pdev) return -ENOMEM; } - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "logicore"); - if (!res) { - dev_err(&pdev->dev, "get logicore memory resource failed.\n"); - return -ENODEV; - } + xvcu->logicore_reg_ba = + syscon_regmap_lookup_by_compatible("xlnx,vcu-settings"); + if (IS_ERR(xvcu->logicore_reg_ba)) { + dev_info(&pdev->dev, + "could not find xlnx,vcu-settings: trying direct register access\n"); + + res = platform_get_resource_byname(pdev, + IORESOURCE_MEM, "logicore"); + if (!res) { + dev_err(&pdev->dev, "get logicore memory resource failed.\n"); + return -ENODEV; + } - xvcu->logicore_reg_ba = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!xvcu->logicore_reg_ba) { - dev_err(&pdev->dev, "logicore register mapping failed.\n"); - return -ENOMEM; + regs = devm_ioremap(&pdev->dev, res->start, resource_size(res)); + if (!regs) { + dev_err(&pdev->dev, "logicore register mapping failed.\n"); + return -ENOMEM; + } + + xvcu->logicore_reg_ba = + devm_regmap_init_mmio(&pdev->dev, regs, + &vcu_settings_regmap_config); + if (IS_ERR(xvcu->logicore_reg_ba)) { + dev_err(&pdev->dev, "failed to init regmap\n"); + return PTR_ERR(xvcu->logicore_reg_ba); + } } xvcu->aclk = devm_clk_get(&pdev->dev, "aclk"); @@ -560,7 +560,7 @@ static int xvcu_probe(struct platform_device *pdev) * Bit 0 : Gasket isolation * Bit 1 : put VCU out of reset */ - xvcu_write(xvcu->logicore_reg_ba, VCU_GASKET_INIT, VCU_GASKET_VALUE); + regmap_write(xvcu->logicore_reg_ba, VCU_GASKET_INIT, VCU_GASKET_VALUE); /* Do the PLL Settings based on the ref clk, core and mcu clk freq */ ret = xvcu_set_pll(xvcu); @@ -597,7 +597,7 @@ static int xvcu_remove(struct platform_device *pdev) return -ENODEV; /* Add the Gasket isolation and put the VCU in reset.
*/ - xvcu_write(xvcu->logicore_reg_ba, VCU_GASKET_INIT, 0); + regmap_write(xvcu->logicore_reg_ba, VCU_GASKET_INIT, 0); clk_disable_unprepare(xvcu->pll_ref); clk_disable_unprepare(xvcu->aclk); diff --git a/include/linux/mfd/syscon/xlnx-vcu.h b/include/linux/mfd/syscon/xlnx-vcu.h new file mode 100644 index 000000000000..d3edec4b7b1d --- /dev/null +++ b/include/linux/mfd/syscon/xlnx-vcu.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Pengutronix, Michael Tretter + */ + +#ifndef __XLNX_VCU_H +#define __XLNX_VCU_H + +#define VCU_ECODER_ENABLE 0x00 +#define VCU_DECODER_ENABLE 0x04 +#define VCU_MEMORY_DEPTH 0x08 +#define VCU_ENC_COLOR_DEPTH 0x0c +#define VCU_ENC_VERTICAL_RANGE 0x10 +#define VCU_ENC_FRAME_SIZE_X 0x14 +#define VCU_ENC_FRAME_SIZE_Y 0x18 +#define VCU_ENC_COLOR_FORMAT 0x1c +#define VCU_ENC_FPS 0x20 +#define VCU_MCU_CLK 0x24 +#define VCU_CORE_CLK 0x28 +#define VCU_PLL_BYPASS 0x2c +#define VCU_ENC_CLK 0x30 +#define VCU_PLL_CLK 0x34 +#define VCU_ENC_VIDEO_STANDARD 0x38 +#define VCU_STATUS 0x3c +#define VCU_AXI_ENC_CLK 0x40 +#define VCU_AXI_DEC_CLK 0x44 +#define VCU_AXI_MCU_CLK 0x48 +#define VCU_DEC_VIDEO_STANDARD 0x4c +#define VCU_DEC_FRAME_SIZE_X 0x50 +#define VCU_DEC_FRAME_SIZE_Y 0x54 +#define VCU_DEC_FPS 0x58 +#define VCU_BUFFER_B_FRAME 0x5c +#define VCU_WPP_EN 0x60 +#define VCU_PLL_CLK_DEC 0x64 +#define VCU_GASKET_INIT 0x74 +#define VCU_GASKET_VALUE 0x03 + +#endif /* __XLNX_VCU_H */ -- cgit From 7b1c9b8441aa94a549a90fa3d42687ccbad3eade Mon Sep 17 00:00:00 2001 From: Michael Tretter Date: Mon, 9 Nov 2020 14:48:18 +0100 Subject: soc: xilinx: vcu: add missing register NUM_CORE The H.264/H.265 Video Codec Unit v1.2 documentation describes this register as follows: Number of encoders core used for the provided configuration This is required for configuring the VCU encoder buffer. Signed-off-by: Michael Tretter Reviewed-by: Hyun Kwon Link: https://lore.kernel.org/r/20201109134818.4159342-5-m.tretter@pengutronix.de Signed-off-by: Michal Simek --- include/linux/mfd/syscon/xlnx-vcu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/syscon/xlnx-vcu.h b/include/linux/mfd/syscon/xlnx-vcu.h index d3edec4b7b1d..ff7bc3656f6e 100644 --- a/include/linux/mfd/syscon/xlnx-vcu.h +++ b/include/linux/mfd/syscon/xlnx-vcu.h @@ -32,6 +32,7 @@ #define VCU_BUFFER_B_FRAME 0x5c #define VCU_WPP_EN 0x60 #define VCU_PLL_CLK_DEC 0x64 +#define VCU_NUM_CORE 0x6c #define VCU_GASKET_INIT 0x74 #define VCU_GASKET_VALUE 0x03 -- cgit From 463edf5a59fd8f0fe0135101d67bfca81d1e3771 Mon Sep 17 00:00:00 2001 From: Wendy Liang Date: Tue, 24 Nov 2020 00:18:18 -0800 Subject: firmware: xlnx-zynqmp: fix compilation warning Fix compilation warning when ZYNQMP_FIRMWARE is not defined. 
include/linux/firmware/xlnx-zynqmp.h: In function 'zynqmp_pm_get_eemi_ops': include/linux/firmware/xlnx-zynqmp.h:363:9: error: implicit declaration of function 'ERR_PTR' [-Werror=implicit-function-declaration] 363 | return ERR_PTR(-ENODEV); include/linux/firmware/xlnx-zynqmp.h:363:18: note: each undeclared identifier is reported only once for each function it appears in include/linux/firmware/xlnx-zynqmp.h: In function 'zynqmp_pm_get_api_version': include/linux/firmware/xlnx-zynqmp.h:367:10: error: 'ENODEV' undeclared (first use in this function) 367 | return -ENODEV; | ^~~~~~ Signed-off-by: Wendy Liang Link: https://lore.kernel.org/r/1606205898-12642-1-git-send-email-wendy.liang@xilinx.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-zynqmp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 5968df82b991..f84244ea633b 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -13,6 +13,8 @@ #ifndef __FIRMWARE_ZYNQMP_H__ #define __FIRMWARE_ZYNQMP_H__ +#include + #define ZYNQMP_PM_VERSION_MAJOR 1 #define ZYNQMP_PM_VERSION_MINOR 0 -- cgit From 1f6a11a01059f9c65f8461987cc0bab4c0b58338 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 2 Dec 2020 08:38:48 +0100 Subject: firmware: xilinx: Remove additional newline This additional newline is useless and also reported by checkpatch --strict. Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/d927f3f2c97910958dd77a22828cd0bf8d89c9de.1606894725.git.michal.simek@xilinx.com --- include/linux/firmware/xlnx-zynqmp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index f84244ea633b..0db9005782d6 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -316,7 +316,6 @@ struct zynqmp_pm_query_data { u32 arg3; }; - int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 *ret_payload); -- cgit From a80cefec2c2783166727324bde724c39aa8a12df Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 2 Dec 2020 08:38:49 +0100 Subject: firmware: xilinx: Add a blank line after function declaration Fix all these issues which are also reported by checkpatch --strict. 
Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/7b6007e05f6c01214861a37f198cd5bee62a4d3e.1606894725.git.michal.simek@xilinx.com --- include/linux/firmware/xlnx-zynqmp.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 0db9005782d6..0e7e72650ed3 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -363,107 +363,132 @@ static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { return ERR_PTR(-ENODEV); } + static inline int zynqmp_pm_get_api_version(u32 *version) { return -ENODEV; } + static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) { return -ENODEV; } + static inline int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out) { return -ENODEV; } + static inline int zynqmp_pm_clock_enable(u32 clock_id) { return -ENODEV; } + static inline int zynqmp_pm_clock_disable(u32 clock_id) { return -ENODEV; } + static inline int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state) { return -ENODEV; } + static inline int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider) { return -ENODEV; } + static inline int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider) { return -ENODEV; } + static inline int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate) { return -ENODEV; } + static inline int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate) { return -ENODEV; } + static inline int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id) { return -ENODEV; } + static inline int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id) { return -ENODEV; } + static inline int zynqmp_pm_set_pll_frac_mode(u32 clk_id, u32 mode) { return -ENODEV; } + static inline int zynqmp_pm_get_pll_frac_mode(u32 clk_id, u32 *mode) { return -ENODEV; } + static inline int zynqmp_pm_set_pll_frac_data(u32 clk_id, u32 data) { return -ENODEV; } + static inline int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data) { return -ENODEV; } + static inline int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value) { return -ENODEV; } + static inline int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type) { return -ENODEV; } + static inline int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, const enum zynqmp_pm_reset_action assert_flag) { return -ENODEV; } + static inline int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status) { return -ENODEV; } + static inline int zynqmp_pm_init_finalize(void) { return -ENODEV; } + static inline int zynqmp_pm_set_suspend_mode(u32 mode) { return -ENODEV; } + static inline int zynqmp_pm_request_node(const u32 node, const u32 capabilities, const u32 qos, const enum zynqmp_pm_request_ack ack) { return -ENODEV; } + static inline int zynqmp_pm_release_node(const u32 node) { return -ENODEV; } + static inline int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities, const u32 qos, @@ -471,39 +496,48 @@ static inline int zynqmp_pm_set_requirement(const u32 node, { return -ENODEV; } + static inline int zynqmp_pm_aes_engine(const u64 address, u32 *out) { return -ENODEV; } + static inline int zynqmp_pm_fpga_load(const u64 address, const u32 size, const u32 flags) { return -ENODEV; } + static inline int zynqmp_pm_fpga_get_status(u32 *value) { return -ENODEV; } + static inline int zynqmp_pm_write_ggs(u32 index, u32 value) { return -ENODEV; } + static inline int zynqmp_pm_read_ggs(u32 index, u32 *value) { return -ENODEV; } + static inline int 
zynqmp_pm_write_pggs(u32 index, u32 value) { return -ENODEV; } + static inline int zynqmp_pm_read_pggs(u32 index, u32 *value) { return -ENODEV; } + static inline int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype) { return -ENODEV; } + static inline int zynqmp_pm_set_boot_health_status(u32 value) { return -ENODEV; -- cgit From 311c2520de21cb2f44291ad3d984b42191126628 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 2 Dec 2020 08:38:50 +0100 Subject: firmware: xilinx: Properly align function parameter Fix parameters alignment reported by checkpatch --strict. Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/00ed9fcb94a6c22eff1fe8afdea46b2764a8687d.1606894725.git.michal.simek@xilinx.com --- include/linux/firmware/xlnx-zynqmp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 0e7e72650ed3..edc2977b26d9 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -456,7 +456,7 @@ static inline int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type) } static inline int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, - const enum zynqmp_pm_reset_action assert_flag) + const enum zynqmp_pm_reset_action assert_flag) { return -ENODEV; } @@ -490,9 +490,9 @@ static inline int zynqmp_pm_release_node(const u32 node) } static inline int zynqmp_pm_set_requirement(const u32 node, - const u32 capabilities, - const u32 qos, - const enum zynqmp_pm_request_ack ack) + const u32 capabilities, + const u32 qos, + const enum zynqmp_pm_request_ack ack) { return -ENODEV; } -- cgit From 5b4258f6721f41b092c63f6ee71be76e9616718b Mon Sep 17 00:00:00 2001 From: Ricky Wu Date: Wed, 2 Dec 2020 14:58:57 +0800 Subject: misc: rtsx: rts5249 support runtime PM rtsx_pcr: add callback functions to support runtime PM add delay_work to put device to D3 after idle over 10 sec rts5249: add extra init flow for rtd3 and set rtd3_en from config setting rtsx_pci_sdmmc: child device support autosuspend Signed-off-by: Ricky Wu Link: https://lore.kernel.org/r/20201202065857.19412-1-ricky_wu@realtek.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rts5249.c | 26 +++++++-- drivers/misc/cardreader/rtsx_pcr.c | 106 ++++++++++++++++++++++++++++++++++++- drivers/misc/cardreader/rtsx_pcr.h | 1 + drivers/mmc/host/rtsx_pci_sdmmc.c | 16 ++++++ include/linux/rtsx_pci.h | 2 + 5 files changed, 145 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cardreader/rts5249.c b/drivers/misc/cardreader/rts5249.c index b85279f1fc5e..b2676e7f5027 100644 --- a/drivers/misc/cardreader/rts5249.c +++ b/drivers/misc/cardreader/rts5249.c @@ -73,6 +73,9 @@ static void rtsx_base_fetch_vendor_settings(struct rtsx_pcr *pcr) pci_read_config_dword(pdev, PCR_SETTING_REG2, ®); pcr_dbg(pcr, "Cfg 0x%x: 0x%x\n", PCR_SETTING_REG2, reg); + + pcr->rtd3_en = rtsx_reg_to_rtd3_uhsii(reg); + if (rtsx_check_mmc_support(reg)) pcr->extra_caps |= EXTRA_CAPS_NO_MMC; pcr->sd30_drive_sel_3v3 = rtsx_reg_to_sd30_drive_sel_3v3(reg); @@ -278,15 +281,28 @@ static int rts5249_extra_init_hw(struct rtsx_pcr *pcr) rtsx_pci_send_cmd(pcr, CMD_TIMEOUT_DEF); - if (CHK_PCI_PID(pcr, PID_524A) || CHK_PCI_PID(pcr, PID_525A)) { + if (CHK_PCI_PID(pcr, PID_524A) || CHK_PCI_PID(pcr, PID_525A)) rtsx_pci_write_register(pcr, REG_VREF, PWD_SUSPND_EN, PWD_SUSPND_EN); - rtsx_pci_write_register(pcr, RTS524A_PM_CTRL3, 0x01, 0x00); - rtsx_pci_write_register(pcr, 
RTS524A_PME_FORCE_CTL, 0x30, 0x20); + + if (pcr->rtd3_en) { + if (CHK_PCI_PID(pcr, PID_524A) || CHK_PCI_PID(pcr, PID_525A)) { + rtsx_pci_write_register(pcr, RTS524A_PM_CTRL3, 0x01, 0x01); + rtsx_pci_write_register(pcr, RTS524A_PME_FORCE_CTL, 0x30, 0x30); + } else { + rtsx_pci_write_register(pcr, PM_CTRL3, 0x01, 0x01); + rtsx_pci_write_register(pcr, PME_FORCE_CTL, 0xFF, 0x33); + } } else { - rtsx_pci_write_register(pcr, PME_FORCE_CTL, 0xFF, 0x30); - rtsx_pci_write_register(pcr, PM_CTRL3, 0x01, 0x00); + if (CHK_PCI_PID(pcr, PID_524A) || CHK_PCI_PID(pcr, PID_525A)) { + rtsx_pci_write_register(pcr, RTS524A_PM_CTRL3, 0x01, 0x00); + rtsx_pci_write_register(pcr, RTS524A_PME_FORCE_CTL, 0x30, 0x20); + } else { + rtsx_pci_write_register(pcr, PME_FORCE_CTL, 0xFF, 0x30); + rtsx_pci_write_register(pcr, PM_CTRL3, 0x01, 0x00); + } } + /* * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced * to drive low, and we forcibly request clock. diff --git a/drivers/misc/cardreader/rtsx_pcr.c b/drivers/misc/cardreader/rtsx_pcr.c index 3612063cab09..2700d1997750 100644 --- a/drivers/misc/cardreader/rtsx_pcr.c +++ b/drivers/misc/cardreader/rtsx_pcr.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "rtsx_pcr.h" #include "rts5261.h" @@ -150,6 +152,12 @@ void rtsx_pci_start_run(struct rtsx_pcr *pcr) if (pcr->remove_pci) return; + if (pcr->rtd3_en) + if (pcr->is_runtime_suspended) { + pm_runtime_get(&(pcr->pci->dev)); + pcr->is_runtime_suspended = false; + } + if (pcr->state != PDEV_STAT_RUN) { pcr->state = PDEV_STAT_RUN; if (pcr->ops->enable_auto_blink) @@ -1081,6 +1089,16 @@ static void rtsx_pm_power_saving(struct rtsx_pcr *pcr) rtsx_comm_pm_power_saving(pcr); } +static void rtsx_pci_rtd3_work(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct rtsx_pcr *pcr = container_of(dwork, struct rtsx_pcr, rtd3_work); + + pcr_dbg(pcr, "--> %s\n", __func__); + if (!pcr->is_runtime_suspended) + pm_runtime_put(&(pcr->pci->dev)); +} + static void rtsx_pci_idle_work(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); @@ -1100,6 +1118,9 @@ static void rtsx_pci_idle_work(struct work_struct *work) rtsx_pm_power_saving(pcr); mutex_unlock(&pcr->pcr_mutex); + + if (pcr->rtd3_en) + mod_delayed_work(system_wq, &pcr->rtd3_work, msecs_to_jiffies(10000)); } static void rtsx_base_force_power_down(struct rtsx_pcr *pcr, u8 pm_state) @@ -1579,6 +1600,15 @@ static int rtsx_pci_probe(struct pci_dev *pcidev, rtsx_pcr_cells[i].platform_data = handle; rtsx_pcr_cells[i].pdata_size = sizeof(*handle); } + + if (pcr->rtd3_en) { + INIT_DELAYED_WORK(&pcr->rtd3_work, rtsx_pci_rtd3_work); + pm_runtime_allow(&pcidev->dev); + pm_runtime_enable(&pcidev->dev); + pcr->is_runtime_suspended = false; + } + + ret = mfd_add_devices(&pcidev->dev, pcr->id, rtsx_pcr_cells, ARRAY_SIZE(rtsx_pcr_cells), NULL, 0, NULL); if (ret < 0) @@ -1616,6 +1646,9 @@ static void rtsx_pci_remove(struct pci_dev *pcidev) struct pcr_handle *handle = pci_get_drvdata(pcidev); struct rtsx_pcr *pcr = handle->pcr; + if (pcr->rtd3_en) + pm_runtime_get_noresume(&pcr->pci->dev); + pcr->remove_pci = true; /* Disable interrupts at the pcr level */ @@ -1626,6 +1659,8 @@ static void rtsx_pci_remove(struct pci_dev *pcidev) cancel_delayed_work_sync(&pcr->carddet_work); cancel_delayed_work_sync(&pcr->idle_work); + if (pcr->rtd3_en) + cancel_delayed_work_sync(&pcr->rtd3_work); mfd_remove_devices(&pcidev->dev); @@ -1643,6 +1678,11 @@ static void rtsx_pci_remove(struct pci_dev *pcidev) idr_remove(&rtsx_pci_idr, 
pcr->id); spin_unlock(&rtsx_pci_lock); + if (pcr->rtd3_en) { + pm_runtime_disable(&pcr->pci->dev); + pm_runtime_put_noidle(&pcr->pci->dev); + } + kfree(pcr->slots); kfree(pcr); kfree(handle); @@ -1724,13 +1764,77 @@ static void rtsx_pci_shutdown(struct pci_dev *pcidev) pci_disable_msi(pcr->pci); } +static int rtsx_pci_runtime_suspend(struct device *device) +{ + struct pci_dev *pcidev = to_pci_dev(device); + struct pcr_handle *handle; + struct rtsx_pcr *pcr; + + handle = pci_get_drvdata(pcidev); + pcr = handle->pcr; + dev_dbg(&(pcidev->dev), "--> %s\n", __func__); + + cancel_delayed_work(&pcr->carddet_work); + cancel_delayed_work(&pcr->rtd3_work); + cancel_delayed_work(&pcr->idle_work); + + mutex_lock(&pcr->pcr_mutex); + rtsx_pci_power_off(pcr, HOST_ENTER_S3); + + free_irq(pcr->irq, (void *)pcr); + + mutex_unlock(&pcr->pcr_mutex); + + pcr->is_runtime_suspended = true; + + return 0; +} + +static int rtsx_pci_runtime_resume(struct device *device) +{ + struct pci_dev *pcidev = to_pci_dev(device); + struct pcr_handle *handle; + struct rtsx_pcr *pcr; + int ret = 0; + + handle = pci_get_drvdata(pcidev); + pcr = handle->pcr; + dev_dbg(&(pcidev->dev), "--> %s\n", __func__); + + mutex_lock(&pcr->pcr_mutex); + + rtsx_pci_write_register(pcr, HOST_SLEEP_STATE, 0x03, 0x00); + rtsx_pci_acquire_irq(pcr); + synchronize_irq(pcr->irq); + + if (pcr->ops->fetch_vendor_settings) + pcr->ops->fetch_vendor_settings(pcr); + + rtsx_pci_init_hw(pcr); + + if (pcr->slots[RTSX_SD_CARD].p_dev != NULL) { + pcr->slots[RTSX_SD_CARD].card_event( + pcr->slots[RTSX_SD_CARD].p_dev); + } + + schedule_delayed_work(&pcr->idle_work, msecs_to_jiffies(200)); + + mutex_unlock(&pcr->pcr_mutex); + return ret; +} + #else /* CONFIG_PM */ #define rtsx_pci_shutdown NULL +#define rtsx_pci_runtime_suspend NULL +#define rtsx_pci_runtime_resume NULL #endif /* CONFIG_PM */ -static SIMPLE_DEV_PM_OPS(rtsx_pci_pm_ops, rtsx_pci_suspend, rtsx_pci_resume); +static const struct dev_pm_ops rtsx_pci_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(rtsx_pci_suspend, rtsx_pci_resume) + SET_RUNTIME_PM_OPS(rtsx_pci_runtime_suspend, rtsx_pci_runtime_resume, NULL) +}; static struct pci_driver rtsx_pci_driver = { .name = DRV_NAME_RTSX_PCI, diff --git a/drivers/misc/cardreader/rtsx_pcr.h b/drivers/misc/cardreader/rtsx_pcr.h index fe5f4ca0f937..daf057c4eea6 100644 --- a/drivers/misc/cardreader/rtsx_pcr.h +++ b/drivers/misc/cardreader/rtsx_pcr.h @@ -90,6 +90,7 @@ static inline u8 map_sd_drive(int idx) #define rtsx_check_mmc_support(reg) ((reg) & 0x10) #define rtsx_reg_to_rtd3(reg) ((reg) & 0x02) +#define rtsx_reg_to_rtd3_uhsii(reg) ((reg) & 0x04) #define rtsx_reg_to_aspm(reg) (((reg) >> 28) & 0x03) #define rtsx_reg_to_sd30_drive_sel_1v8(reg) (((reg) >> 26) & 0x03) #define rtsx_reg_to_sd30_drive_sel_3v3(reg) (((reg) >> 5) & 0x03) diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c index eb395e144207..a7b5ad17bcf5 100644 --- a/drivers/mmc/host/rtsx_pci_sdmmc.c +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c @@ -20,6 +20,7 @@ #include #include #include +#include struct realtek_pci_sdmmc { struct platform_device *pdev; @@ -1343,6 +1344,7 @@ static void init_extra_caps(struct realtek_pci_sdmmc *host) static void realtek_init_host(struct realtek_pci_sdmmc *host) { struct mmc_host *mmc = host->mmc; + struct rtsx_pcr *pcr = host->pcr; mmc->f_min = 250000; mmc->f_max = 208000000; @@ -1350,6 +1352,8 @@ static void realtek_init_host(struct realtek_pci_sdmmc *host) mmc->caps = MMC_CAP_4_BIT_DATA | MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED |
MMC_CAP_BUS_WIDTH_TEST | MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25; + if (pcr->rtd3_en) + mmc->caps = mmc->caps | MMC_CAP_AGGRESSIVE_PM; mmc->caps2 = MMC_CAP2_NO_PRESCAN_POWERUP | MMC_CAP2_FULL_PWR_CYCLE; mmc->max_current_330 = 400; mmc->max_current_180 = 800; @@ -1407,6 +1411,13 @@ static int rtsx_pci_sdmmc_drv_probe(struct platform_device *pdev) realtek_init_host(host); + if (pcr->rtd3_en) { + pm_runtime_set_autosuspend_delay(&pdev->dev, 5000); + pm_runtime_use_autosuspend(&pdev->dev); + pm_runtime_enable(&pdev->dev); + } + + mmc_add_host(mmc); return 0; @@ -1426,6 +1437,11 @@ static int rtsx_pci_sdmmc_drv_remove(struct platform_device *pdev) pcr->slots[RTSX_SD_CARD].card_event = NULL; mmc = host->mmc; + if (pcr->rtd3_en) { + pm_runtime_dont_use_autosuspend(&pdev->dev); + pm_runtime_disable(&pdev->dev); + } + cancel_work_sync(&host->work); mutex_lock(&host->host_mutex); diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index 745f5e73f99a..f895ccabbe29 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -1174,6 +1174,7 @@ struct rtsx_pcr { struct delayed_work carddet_work; struct delayed_work idle_work; + struct delayed_work rtd3_work; spinlock_t lock; struct mutex pcr_mutex; @@ -1183,6 +1184,7 @@ struct rtsx_pcr { unsigned int cur_clock; bool remove_pci; bool msi_en; + bool is_runtime_suspended; #define EXTRA_CAPS_SD_SDR50 (1 << 0) #define EXTRA_CAPS_SD_SDR104 (1 << 1) -- cgit From 8010622c86ca5bb44bc98492f5968726fc7c7a21 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 9 Dec 2020 16:26:39 +0100 Subject: USB: UAS: introduce a quirk to set no_write_same UAS does not share the pessimistic assumption storage is making that devices cannot deal with WRITE_SAME. A few devices supported by UAS, are reported to not deal well with WRITE_SAME. Those need a quirk. Add it to the device that needs it. Reported-by: David C. Partridge Signed-off-by: Oliver Neukum Cc: stable Link: https://lore.kernel.org/r/20201209152639.9195-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/kernel-parameters.txt | 1 + drivers/usb/storage/uas.c | 3 +++ drivers/usb/storage/unusual_uas.h | 7 +++++-- drivers/usb/storage/usb.c | 3 +++ include/linux/usb_usual.h | 2 ++ 5 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 44fde25bb221..f6a1513dfb76 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5663,6 +5663,7 @@ device); j = NO_REPORT_LUNS (don't use report luns command, uas only); + k = NO_SAME (do not use WRITE_SAME, uas only) l = NOT_LOCKABLE (don't try to lock and unlock ejectable media, not on uas); m = MAX_SECTORS_64 (don't transfer more diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index 56422c4b4ff3..bef89c6bd1d7 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -868,6 +868,9 @@ static int uas_slave_configure(struct scsi_device *sdev) if (devinfo->flags & US_FL_NO_READ_CAPACITY_16) sdev->no_read_capacity_16 = 1; + /* Some disks cannot handle WRITE_SAME */ + if (devinfo->flags & US_FL_NO_SAME) + sdev->no_write_same = 1; /* * Some disks return the total number of blocks in response * to READ CAPACITY rather than the highest block number. 
diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index 711ab240058c..870e9cf3d5dc 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -35,12 +35,15 @@ UNUSUAL_DEV(0x054c, 0x087d, 0x0000, 0x9999, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_NO_REPORT_OPCODES), -/* Reported-by: Julian Groß */ +/* + * Initially Reported-by: Julian Groß + * Further reports David C. Partridge + */ UNUSUAL_DEV(0x059f, 0x105f, 0x0000, 0x9999, "LaCie", "2Big Quadra USB3", USB_SC_DEVICE, USB_PR_DEVICE, NULL, - US_FL_NO_REPORT_OPCODES), + US_FL_NO_REPORT_OPCODES | US_FL_NO_SAME), /* * Apricorn USB3 dongle sometimes returns "USBSUSBSUSBS" in response to SCSI diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c index 94a64729dc27..90aa9c12ffac 100644 --- a/drivers/usb/storage/usb.c +++ b/drivers/usb/storage/usb.c @@ -541,6 +541,9 @@ void usb_stor_adjust_quirks(struct usb_device *udev, unsigned long *fflags) case 'j': f |= US_FL_NO_REPORT_LUNS; break; + case 'k': + f |= US_FL_NO_SAME; + break; case 'l': f |= US_FL_NOT_LOCKABLE; break; diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h index 4a19ac3f24d0..6b03fdd69d27 100644 --- a/include/linux/usb_usual.h +++ b/include/linux/usb_usual.h @@ -84,6 +84,8 @@ /* Cannot handle REPORT_LUNS */ \ US_FLAG(ALWAYS_SYNC, 0x20000000) \ /* lies about caching, so always sync */ \ + US_FLAG(NO_SAME, 0x40000000) \ + /* Cannot handle WRITE_SAME */ \ #define US_FLAG(name, value) US_FL_##name = value , enum { US_DO_ALL_FLAGS }; -- cgit From c73ebb685fb6dfb513d394cbea64fb81ba3d994f Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Tue, 3 Nov 2020 10:54:37 +0800 Subject: io_uring: add timeout support for io_uring_enter() Now users who want to get woken when waiting for events should submit a timeout command first. It is not safe for applications that split SQ and CQ handling between two threads, such as mysql. Users should synchronize the two threads explicitly to protect SQ and that will impact the performance. This patch adds support for timeout to existing io_uring_enter(). To avoid overloading arguments, it introduces a new parameter structure which contains sigmask and timeout. I have tested the workloads with one thread submitting nop requests while the other reaping the cqe with timeout. It shows 1.8~2x faster when the iodepth is 16. Signed-off-by: Jiufei Xue Signed-off-by: Hao Xu [axboe: various cleanups/fixes, and name change to EXT_ARG] Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 ++++++++++++++++++++++++++++++++++++++----- include/linux/syscalls.h | 2 +- include/uapi/linux/io_uring.h | 9 ++++++ 3 files changed, 72 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 11ce97d6259c..ee25c70527aa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7118,7 +7118,8 @@ static int io_run_task_work_sig(void) * application must reap them itself, as they reside on the shared cq ring.
*/ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz) + const sigset_t __user *sig, size_t sigsz, + struct __kernel_timespec __user *uts) { struct io_wait_queue iowq = { .wq = { @@ -7130,6 +7131,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, .to_wait = min_events, }; struct io_rings *rings = ctx->rings; + struct timespec64 ts; + signed long timeout = 0; int ret = 0; do { @@ -7152,6 +7155,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } + if (uts) { + if (get_timespec64(&ts, uts)) + return -EFAULT; + timeout = timespec64_to_jiffies(&ts); + } + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); trace_io_uring_cqring_wait(ctx, min_events); do { @@ -7165,7 +7174,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, break; if (io_should_wake(&iowq, false)) break; - schedule(); + if (uts) { + timeout = schedule_timeout(timeout); + if (timeout == 0) { + ret = -ETIME; + break; + } + } else { + schedule(); + } } while (1); finish_wait(&ctx->wait, &iowq.wq); @@ -9167,9 +9184,39 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) finish_wait(&ctx->sqo_sq_wait, &wait); } +static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, + struct __kernel_timespec __user **ts, + const sigset_t __user **sig) +{ + struct io_uring_getevents_arg arg; + + /* + * If EXT_ARG isn't set, then we have no timespec and the argp pointer + * is just a pointer to the sigset_t. + */ + if (!(flags & IORING_ENTER_EXT_ARG)) { + *sig = (const sigset_t __user *) argp; + *ts = NULL; + return 0; + } + + /* + * EXT_ARG is set - ensure we agree on the size of it and copy in our + * timespec and sigset_t pointers if good. + */ + if (*argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + *sig = u64_to_user_ptr(arg.sigmask); + *argsz = arg.sigmask_sz; + *ts = u64_to_user_ptr(arg.ts); + return 0; +} + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, - u32, min_complete, u32, flags, const sigset_t __user *, sig, - size_t, sigsz) + u32, min_complete, u32, flags, const void __user *, argp, + size_t, argsz) { struct io_ring_ctx *ctx; long ret = -EBADF; @@ -9179,7 +9226,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT)) + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)) return -EINVAL; f = fdget(fd); @@ -9225,6 +9272,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { + const sigset_t __user *sig; + struct __kernel_timespec __user *ts; + + ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + if (unlikely(ret)) + goto out; + min_complete = min(min_complete, ctx->cq_entries); /* @@ -9237,7 +9291,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, !(ctx->flags & IORING_SETUP_SQPOLL)) { ret = io_iopoll_check(ctx, min_complete); } else { - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); } } @@ -9600,7 +9654,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED; + IORING_FEAT_POLL_32BITS | 
IORING_FEAT_SQPOLL_NONFIXED |
+			IORING_FEAT_EXT_ARG;

 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 37bea07c12f2..8576e8bf92fe 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -317,7 +317,7 @@ asmlinkage long sys_io_uring_setup(u32 entries,
 				struct io_uring_params __user *p);
 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
 				u32 min_complete, u32 flags,
-				const sigset_t __user *sig, size_t sigsz);
+				const void __user *argp, size_t argsz);
 asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
 				void __user *arg, unsigned int nr_args);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 557e7eae497f..6bb8229de892 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -231,6 +231,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_GETEVENTS	(1U << 0)
 #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
 #define IORING_ENTER_SQ_WAIT	(1U << 2)
+#define IORING_ENTER_EXT_ARG	(1U << 3)

 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -259,6 +260,7 @@ struct io_uring_params {
 #define IORING_FEAT_FAST_POLL		(1U << 5)
 #define IORING_FEAT_POLL_32BITS		(1U << 6)
 #define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
+#define IORING_FEAT_EXT_ARG		(1U << 8)

 /*
  * io_uring_register(2) opcodes and arguments
@@ -335,4 +337,11 @@ enum {
 	IORING_RESTRICTION_LAST
 };

+struct io_uring_getevents_arg {
+	__u64	sigmask;
+	__u32	sigmask_sz;
+	__u32	pad;
+	__u64	ts;
+};
+
 #endif
-- cgit

From 5a6338cce9f4133c478d3b10b300f96dd644379a Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Tue, 17 Nov 2020 15:32:06 +0530
Subject: mailbox: arm_mhuv2: Add driver

This adds a driver for the ARM MHUv2 (Message Handling Unit) mailbox
controller. It is based on the accepted DT bindings of the controller
and supports any combination of the two transport protocols, i.e.
doorbell and data-transfer.

Transmitting and receiving data through the mailbox framework is done
through struct arm_mhuv2_mbox_msg.

Based on the initial work done by Morten Borup Petersen from ARM.

Co-developed-by: Tushar Khandelwal
Signed-off-by: Tushar Khandelwal
Tested-by: Usama Arif
Signed-off-by: Viresh Kumar
Signed-off-by: Jassi Brar
---
 MAINTAINERS                               |    9 +
 drivers/mailbox/Kconfig                   |    7 +
 drivers/mailbox/Makefile                  |    2 +
 drivers/mailbox/arm_mhuv2.c               | 1136 +++++++++++++++++++++++++++++
 include/linux/mailbox/arm_mhuv2_message.h |   20 +
 5 files changed, 1174 insertions(+)
 create mode 100644 drivers/mailbox/arm_mhuv2.c
 create mode 100644 include/linux/mailbox/arm_mhuv2_message.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 2daa6ee673f7..3917b7ef1da6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10418,6 +10418,15 @@ F:	drivers/mailbox/
 F:	include/linux/mailbox_client.h
 F:	include/linux/mailbox_controller.h

+MAILBOX ARM MHUv2
+M:	Viresh Kumar
+M:	Tushar Khandelwal
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	drivers/mailbox/arm_mhuv2.c
+F:	include/linux/mailbox/arm_mhuv2_message.h
+F:	Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml
+
 MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7
 M:	Michael Kerrisk
 L:	linux-man@vger.kernel.org
diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index abbf5d67ffa2..f4abe3529acd 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -16,6 +16,13 @@ config ARM_MHU
 	  The controller has 3 mailbox channels, the last of which can be
 	  used in Secure mode only.
+config ARM_MHU_V2 + tristate "ARM MHUv2 Mailbox" + depends on ARM_AMBA + help + Say Y here if you want to build the ARM MHUv2 controller driver, + which provides unidirectional mailboxes between processing elements. + config IMX_MBOX tristate "i.MX Mailbox" depends on ARCH_MXC || COMPILE_TEST diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile index 2e06e02b2e03..7194fa92c787 100644 --- a/drivers/mailbox/Makefile +++ b/drivers/mailbox/Makefile @@ -7,6 +7,8 @@ obj-$(CONFIG_MAILBOX_TEST) += mailbox-test.o obj-$(CONFIG_ARM_MHU) += arm_mhu.o arm_mhu_db.o +obj-$(CONFIG_ARM_MHU_V2) += arm_mhuv2.o + obj-$(CONFIG_IMX_MBOX) += imx-mailbox.o obj-$(CONFIG_ARMADA_37XX_RWTM_MBOX) += armada-37xx-rwtm-mailbox.o diff --git a/drivers/mailbox/arm_mhuv2.c b/drivers/mailbox/arm_mhuv2.c new file mode 100644 index 000000000000..67fb10885bb4 --- /dev/null +++ b/drivers/mailbox/arm_mhuv2.c @@ -0,0 +1,1136 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Message Handling Unit Version 2 (MHUv2) driver. + * + * Copyright (C) 2020 ARM Ltd. + * Copyright (C) 2020 Linaro Ltd. + * + * An MHUv2 mailbox controller can provide up to 124 channel windows (each 32 + * bit long) and the driver allows any combination of both the transport + * protocol modes: data-transfer and doorbell, to be used on those channel + * windows. + * + * The transport protocols should be specified in the device tree entry for the + * device. The transport protocols determine how the underlying hardware + * resources of the device are utilized when transmitting data. Refer to the + * device tree bindings of the ARM MHUv2 controller for more details. + * + * The number of registered mailbox channels is dependent on both the underlying + * hardware - mainly the number of channel windows implemented by the platform, + * as well as the selected transport protocols. + * + * The MHUv2 controller can work both as a sender and receiver, but the driver + * and the DT bindings support unidirectional transfers for better allocation of + * the channels. That is, this driver will be probed for two separate devices + * for each mailbox controller, a sender device and a receiver device. 
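+ *
+ * As a purely illustrative sketch (the pairs below are hypothetical, not
+ * taken from a real platform), a frame could describe its channel windows
+ * in the device tree as <protocol window-count> pairs:
+ *
+ *   arm,mhuv2-protocols = <1 2>, <0 1>;
+ *
+ * i.e. one data-transfer channel spanning two windows, followed by a
+ * single window providing 32 doorbell channels.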
+ */ + +#include +#include +#include +#include +#include +#include +#include + +/* ====== MHUv2 Registers ====== */ + +/* Maximum number of channel windows */ +#define MHUV2_CH_WN_MAX 124 +/* Number of combined interrupt status registers */ +#define MHUV2_CMB_INT_ST_REG_CNT 4 +#define MHUV2_STAT_BYTES (sizeof(u32)) +#define MHUV2_STAT_BITS (MHUV2_STAT_BYTES * __CHAR_BIT__) + +#define LSB_MASK(n) ((1 << (n * __CHAR_BIT__)) - 1) +#define MHUV2_PROTOCOL_PROP "arm,mhuv2-protocols" + +/* Register Message Handling Unit Configuration fields */ +struct mhu_cfg_t { + u32 num_ch : 7; + u32 pad : 25; +} __packed; + +/* register Interrupt Status fields */ +struct int_st_t { + u32 nr2r : 1; + u32 r2nr : 1; + u32 pad : 30; +} __packed; + +/* Register Interrupt Clear fields */ +struct int_clr_t { + u32 nr2r : 1; + u32 r2nr : 1; + u32 pad : 30; +} __packed; + +/* Register Interrupt Enable fields */ +struct int_en_t { + u32 r2nr : 1; + u32 nr2r : 1; + u32 chcomb : 1; + u32 pad : 29; +} __packed; + +/* Register Implementer Identification fields */ +struct iidr_t { + u32 implementer : 12; + u32 revision : 4; + u32 variant : 4; + u32 product_id : 12; +} __packed; + +/* Register Architecture Identification Register fields */ +struct aidr_t { + u32 arch_minor_rev : 4; + u32 arch_major_rev : 4; + u32 pad : 24; +} __packed; + +/* Sender Channel Window fields */ +struct mhu2_send_ch_wn_reg { + u32 stat; + u8 pad1[0x0C - 0x04]; + u32 stat_set; + u32 int_st; + u32 int_clr; + u32 int_en; + u8 pad2[0x20 - 0x1C]; +} __packed; + +/* Sender frame register fields */ +struct mhu2_send_frame_reg { + struct mhu2_send_ch_wn_reg ch_wn[MHUV2_CH_WN_MAX]; + struct mhu_cfg_t mhu_cfg; + u32 resp_cfg; + u32 access_request; + u32 access_ready; + struct int_st_t int_st; + struct int_clr_t int_clr; + struct int_en_t int_en; + u32 reserved0; + u32 chcomb_int_st[MHUV2_CMB_INT_ST_REG_CNT]; + u8 pad[0xFC8 - 0xFB0]; + struct iidr_t iidr; + struct aidr_t aidr; +} __packed; + +/* Receiver Channel Window fields */ +struct mhu2_recv_ch_wn_reg { + u32 stat; + u32 stat_masked; + u32 stat_clear; + u8 reserved0[0x10 - 0x0C]; + u32 mask; + u32 mask_set; + u32 mask_clear; + u8 pad[0x20 - 0x1C]; +} __packed; + +/* Receiver frame register fields */ +struct mhu2_recv_frame_reg { + struct mhu2_recv_ch_wn_reg ch_wn[MHUV2_CH_WN_MAX]; + struct mhu_cfg_t mhu_cfg; + u8 reserved0[0xF90 - 0xF84]; + struct int_st_t int_st; + struct int_clr_t int_clr; + struct int_en_t int_en; + u32 pad; + u32 chcomb_int_st[MHUV2_CMB_INT_ST_REG_CNT]; + u8 reserved2[0xFC8 - 0xFB0]; + struct iidr_t iidr; + struct aidr_t aidr; +} __packed; + + +/* ====== MHUv2 data structures ====== */ + +enum mhuv2_transport_protocol { + DOORBELL = 0, + DATA_TRANSFER = 1 +}; + +enum mhuv2_frame { + RECEIVER_FRAME, + SENDER_FRAME +}; + +/** + * struct mhuv2 - MHUv2 mailbox controller data + * + * @mbox: Mailbox controller belonging to the MHU frame. + * @send/recv: Base address of the register mapping region. + * @frame: Frame type: RECEIVER_FRAME or SENDER_FRAME. + * @irq: Interrupt. + * @windows: Channel windows implemented by the platform. + * @minor: Minor version of the controller. + * @length: Length of the protocols array in bytes. + * @protocols: Raw protocol information, derived from device tree. + * @doorbell_pending_lock: spinlock required for correct operation of Tx + * interrupt for doorbells. 
+ */
+struct mhuv2 {
+	struct mbox_controller mbox;
+	union {
+		struct mhu2_send_frame_reg __iomem *send;
+		struct mhu2_recv_frame_reg __iomem *recv;
+	};
+	enum mhuv2_frame frame;
+	unsigned int irq;
+	unsigned int windows;
+	unsigned int minor;
+	unsigned int length;
+	u32 *protocols;
+
+	spinlock_t doorbell_pending_lock;
+};
+
+#define mhu_from_mbox(_mbox) container_of(_mbox, struct mhuv2, mbox)
+
+/**
+ * struct mhuv2_protocol_ops - MHUv2 operations
+ *
+ * Each transport protocol must provide an implementation of the operations
+ * provided here.
+ *
+ * @rx_startup: Startup callback for receiver.
+ * @rx_shutdown: Shutdown callback for receiver.
+ * @read_data: Reads and clears newly available data.
+ * @tx_startup: Startup callback for transmitter.
+ * @tx_shutdown: Shutdown callback for transmitter.
+ * @last_tx_done: Report back if the last tx is completed or not.
+ * @send_data: Send data to the receiver.
+ */
+struct mhuv2_protocol_ops {
+	int (*rx_startup)(struct mhuv2 *mhu, struct mbox_chan *chan);
+	void (*rx_shutdown)(struct mhuv2 *mhu, struct mbox_chan *chan);
+	void *(*read_data)(struct mhuv2 *mhu, struct mbox_chan *chan);
+
+	void (*tx_startup)(struct mhuv2 *mhu, struct mbox_chan *chan);
+	void (*tx_shutdown)(struct mhuv2 *mhu, struct mbox_chan *chan);
+	int (*last_tx_done)(struct mhuv2 *mhu, struct mbox_chan *chan);
+	int (*send_data)(struct mhuv2 *mhu, struct mbox_chan *chan, void *arg);
+};
+
+/*
+ * MHUv2 mailbox channel's private information
+ *
+ * @ops: protocol specific ops for the channel.
+ * @ch_wn_idx: Channel window index allocated to the channel.
+ * @windows: Total number of windows consumed by the channel, only relevant
+ *	     in DATA_TRANSFER protocol.
+ * @doorbell: Doorbell bit number within the ch_wn_idx window, only relevant
+ *	      in DOORBELL protocol.
+ * @pending: Flag indicating pending doorbell interrupt, only relevant in
+ *	     DOORBELL protocol.
+ */ +struct mhuv2_mbox_chan_priv { + const struct mhuv2_protocol_ops *ops; + u32 ch_wn_idx; + union { + u32 windows; + struct { + u32 doorbell; + u32 pending; + }; + }; +}; + +/* Macro for reading a bitfield within a physically mapped packed struct */ +#define readl_relaxed_bitfield(_regptr, _field) \ + ({ \ + u32 _regval; \ + _regval = readl_relaxed((_regptr)); \ + (*(typeof((_regptr)))(&_regval))._field; \ + }) + +/* Macro for writing a bitfield within a physically mapped packed struct */ +#define writel_relaxed_bitfield(_value, _regptr, _field) \ + ({ \ + u32 _regval; \ + _regval = readl_relaxed(_regptr); \ + (*(typeof(_regptr))(&_regval))._field = _value; \ + writel_relaxed(_regval, _regptr); \ + }) + + +/* =================== Doorbell transport protocol operations =============== */ + +static int mhuv2_doorbell_rx_startup(struct mhuv2 *mhu, struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + writel_relaxed(BIT(priv->doorbell), + &mhu->recv->ch_wn[priv->ch_wn_idx].mask_clear); + return 0; +} + +static void mhuv2_doorbell_rx_shutdown(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + writel_relaxed(BIT(priv->doorbell), + &mhu->recv->ch_wn[priv->ch_wn_idx].mask_set); +} + +static void *mhuv2_doorbell_read_data(struct mhuv2 *mhu, struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + writel_relaxed(BIT(priv->doorbell), + &mhu->recv->ch_wn[priv->ch_wn_idx].stat_clear); + return NULL; +} + +static int mhuv2_doorbell_last_tx_done(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + return !(readl_relaxed(&mhu->send->ch_wn[priv->ch_wn_idx].stat) & + BIT(priv->doorbell)); +} + +static int mhuv2_doorbell_send_data(struct mhuv2 *mhu, struct mbox_chan *chan, + void *arg) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + unsigned long flags; + + spin_lock_irqsave(&mhu->doorbell_pending_lock, flags); + + priv->pending = 1; + writel_relaxed(BIT(priv->doorbell), + &mhu->send->ch_wn[priv->ch_wn_idx].stat_set); + + spin_unlock_irqrestore(&mhu->doorbell_pending_lock, flags); + + return 0; +} + +static const struct mhuv2_protocol_ops mhuv2_doorbell_ops = { + .rx_startup = mhuv2_doorbell_rx_startup, + .rx_shutdown = mhuv2_doorbell_rx_shutdown, + .read_data = mhuv2_doorbell_read_data, + .last_tx_done = mhuv2_doorbell_last_tx_done, + .send_data = mhuv2_doorbell_send_data, +}; +#define IS_PROTOCOL_DOORBELL(_priv) (_priv->ops == &mhuv2_doorbell_ops) + +/* ============= Data transfer transport protocol operations ================ */ + +static int mhuv2_data_transfer_rx_startup(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + /* + * The protocol mandates that all but the last status register must be + * masked. 
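+	 * Clearing the mask bits below therefore unmasks only the last
+	 * window of the channel; the remaining windows stay masked from
+	 * initialization.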
+ */ + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_clear); + return 0; +} + +static void mhuv2_data_transfer_rx_shutdown(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_set); +} + +static void *mhuv2_data_transfer_read_data(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + const int windows = priv->windows; + struct arm_mhuv2_mbox_msg *msg; + u32 *data; + int i, idx; + + msg = kzalloc(sizeof(*msg) + windows * MHUV2_STAT_BYTES, GFP_KERNEL); + if (!msg) + return ERR_PTR(-ENOMEM); + + data = msg->data = msg + 1; + msg->len = windows * MHUV2_STAT_BYTES; + + /* + * Messages are expected in order of most significant word to least + * significant word. Refer mhuv2_data_transfer_send_data() for more + * details. + * + * We also need to read the stat register instead of stat_masked, as we + * masked all but the last window. + * + * Last channel window must be cleared as the final operation. Upon + * clearing the last channel window register, which is unmasked in + * data-transfer protocol, the interrupt is de-asserted. + */ + for (i = 0; i < windows; i++) { + idx = priv->ch_wn_idx + i; + data[windows - 1 - i] = readl_relaxed(&mhu->recv->ch_wn[idx].stat); + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[idx].stat_clear); + } + + return msg; +} + +static void mhuv2_data_transfer_tx_startup(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + /* Enable interrupts only for the last window */ + if (mhu->minor) { + writel_relaxed(0x1, &mhu->send->ch_wn[i].int_clr); + writel_relaxed(0x1, &mhu->send->ch_wn[i].int_en); + } +} + +static void mhuv2_data_transfer_tx_shutdown(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + if (mhu->minor) + writel_relaxed(0x0, &mhu->send->ch_wn[i].int_en); +} + +static int mhuv2_data_transfer_last_tx_done(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + /* Just checking the last channel window should be enough */ + return !readl_relaxed(&mhu->send->ch_wn[i].stat); +} + +/* + * Message will be transmitted from most significant to least significant word. + * This is to allow for messages shorter than channel windows to still trigger + * the receiver interrupt which gets activated when the last stat register is + * written. As an example, a 6-word message is to be written on a 4-channel MHU + * connection: Registers marked with '*' are masked, and will not generate an + * interrupt on the receiver side once written. 
+ * + * u32 *data = [0x00000001], [0x00000002], [0x00000003], [0x00000004], + * [0x00000005], [0x00000006] + * + * ROUND 1: + * stat reg To write Write sequence + * [ stat 3 ] <- [0x00000001] 4 <- triggers interrupt on receiver + * [ stat 2 ] <- [0x00000002] 3 + * [ stat 1 ] <- [0x00000003] 2 + * [ stat 0 ] <- [0x00000004] 1 + * + * data += 4 // Increment data pointer by number of stat regs + * + * ROUND 2: + * stat reg To write Write sequence + * [ stat 3 ] <- [0x00000005] 2 <- triggers interrupt on receiver + * [ stat 2 ] <- [0x00000006] 1 + * [ stat 1 ] <- [0x00000000] + * [ stat 0 ] <- [0x00000000] + */ +static int mhuv2_data_transfer_send_data(struct mhuv2 *mhu, + struct mbox_chan *chan, void *arg) +{ + const struct arm_mhuv2_mbox_msg *msg = arg; + int bytes_left = msg->len, bytes_to_send, bytes_in_round, i; + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int windows = priv->windows; + u32 *data = msg->data, word; + + while (bytes_left) { + if (!data[0]) { + dev_err(mhu->mbox.dev, "Data aligned at first window can't be zero to guarantee interrupt generation at receiver"); + return -EINVAL; + } + + while(!mhuv2_data_transfer_last_tx_done(mhu, chan)) + continue; + + bytes_in_round = min(bytes_left, (int)(windows * MHUV2_STAT_BYTES)); + + for (i = windows - 1; i >= 0; i--) { + /* Data less than windows can transfer ? */ + if (unlikely(bytes_in_round <= i * MHUV2_STAT_BYTES)) + continue; + + word = data[i]; + bytes_to_send = bytes_in_round & (MHUV2_STAT_BYTES - 1); + if (unlikely(bytes_to_send)) + word &= LSB_MASK(bytes_to_send); + else + bytes_to_send = MHUV2_STAT_BYTES; + + writel_relaxed(word, &mhu->send->ch_wn[priv->ch_wn_idx + windows - 1 - i].stat_set); + bytes_left -= bytes_to_send; + bytes_in_round -= bytes_to_send; + } + + data += windows; + } + + return 0; +} + +static const struct mhuv2_protocol_ops mhuv2_data_transfer_ops = { + .rx_startup = mhuv2_data_transfer_rx_startup, + .rx_shutdown = mhuv2_data_transfer_rx_shutdown, + .read_data = mhuv2_data_transfer_read_data, + .tx_startup = mhuv2_data_transfer_tx_startup, + .tx_shutdown = mhuv2_data_transfer_tx_shutdown, + .last_tx_done = mhuv2_data_transfer_last_tx_done, + .send_data = mhuv2_data_transfer_send_data, +}; + +/* Interrupt handlers */ + +static struct mbox_chan *get_irq_chan_comb(struct mhuv2 *mhu, u32 *reg) +{ + struct mbox_chan *chans = mhu->mbox.chans; + int channel = 0, i, offset = 0, windows, protocol, ch_wn; + u32 stat; + + for (i = 0; i < MHUV2_CMB_INT_ST_REG_CNT; i++) { + stat = readl_relaxed(reg + i); + if (!stat) + continue; + + ch_wn = i * MHUV2_STAT_BITS + __builtin_ctz(stat); + + for (i = 0; i < mhu->length; i += 2) { + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (ch_wn >= offset + windows) { + if (protocol == DOORBELL) + channel += MHUV2_STAT_BITS * windows; + else + channel++; + + offset += windows; + continue; + } + + /* Return first chan of the window in doorbell mode */ + if (protocol == DOORBELL) + channel += MHUV2_STAT_BITS * (ch_wn - offset); + + return &chans[channel]; + } + } + + return ERR_PTR(-EIO); +} + +static irqreturn_t mhuv2_sender_interrupt(int irq, void *data) +{ + struct mhuv2 *mhu = data; + struct device *dev = mhu->mbox.dev; + struct mhuv2_mbox_chan_priv *priv; + struct mbox_chan *chan; + unsigned long flags; + int i, found = 0; + u32 stat; + + chan = get_irq_chan_comb(mhu, mhu->send->chcomb_int_st); + if (IS_ERR(chan)) { + dev_warn(dev, "Failed to find channel for the Tx interrupt\n"); + return IRQ_NONE; + } + priv = chan->con_priv; + + if 
(!IS_PROTOCOL_DOORBELL(priv)) {
+		writel_relaxed(1, &mhu->send->ch_wn[priv->ch_wn_idx + priv->windows - 1].int_clr);
+
+		if (chan->cl) {
+			mbox_chan_txdone(chan, 0);
+			return IRQ_HANDLED;
+		}
+
+		dev_warn(dev, "Tx interrupt received on channel (%u) not currently attached to a mailbox client\n",
+			 priv->ch_wn_idx);
+		return IRQ_NONE;
+	}
+
+	/* Clear the interrupt first, so we don't miss any doorbell later */
+	writel_relaxed(1, &mhu->send->ch_wn[priv->ch_wn_idx].int_clr);
+
+	/*
+	 * In Doorbell mode, make sure no new transitions happen while the
+	 * interrupt handler is trying to find the finished doorbell tx
+	 * operations, else we may think few of the transfers were complete
+	 * before they actually were.
+	 */
+	spin_lock_irqsave(&mhu->doorbell_pending_lock, flags);
+
+	/*
+	 * In case of doorbell mode, the first channel of the window is returned
+	 * by get_irq_chan_comb(). Find all the pending channels here.
+	 */
+	stat = readl_relaxed(&mhu->send->ch_wn[priv->ch_wn_idx].stat);
+
+	for (i = 0; i < MHUV2_STAT_BITS; i++) {
+		priv = chan[i].con_priv;
+
+		/* Find cases where pending was 1, but stat's bit is cleared */
+		if (priv->pending ^ ((stat >> i) & 0x1)) {
+			BUG_ON(!priv->pending);
+
+			if (!chan->cl) {
+				dev_warn(dev, "Tx interrupt received on doorbell (%u : %u) channel not currently attached to a mailbox client\n",
+					 priv->ch_wn_idx, i);
+				continue;
+			}
+
+			mbox_chan_txdone(&chan[i], 0);
+			priv->pending = 0;
+			found++;
+		}
+	}
+
+	spin_unlock_irqrestore(&mhu->doorbell_pending_lock, flags);
+
+	if (!found) {
+		/*
+		 * We may have already processed the doorbell in the previous
+		 * iteration if the interrupt came right after we cleared it but
+		 * before we read the stat register.
+		 */
+		dev_dbg(dev, "Couldn't find the doorbell (%u) for the Tx interrupt\n",
+			priv->ch_wn_idx);
+		return IRQ_NONE;
+	}
+
+	return IRQ_HANDLED;
+}
+
+static struct mbox_chan *get_irq_chan_comb_rx(struct mhuv2 *mhu)
+{
+	struct mhuv2_mbox_chan_priv *priv;
+	struct mbox_chan *chan;
+	u32 stat;
+
+	chan = get_irq_chan_comb(mhu, mhu->recv->chcomb_int_st);
+	if (IS_ERR(chan))
+		return chan;
+
+	priv = chan->con_priv;
+	if (!IS_PROTOCOL_DOORBELL(priv))
+		return chan;
+
+	/*
+	 * In case of doorbell mode, the first channel of the window is returned
+	 * by the routine. Find the exact channel here.
+	 */
+	stat = readl_relaxed(&mhu->recv->ch_wn[priv->ch_wn_idx].stat_masked);
+	BUG_ON(!stat);
+
+	return chan + __builtin_ctz(stat);
+}
+
+static struct mbox_chan *get_irq_chan_stat_rx(struct mhuv2 *mhu)
+{
+	struct mbox_chan *chans = mhu->mbox.chans;
+	struct mhuv2_mbox_chan_priv *priv;
+	u32 stat;
+	int i = 0;
+
+	while (i < mhu->mbox.num_chans) {
+		priv = chans[i].con_priv;
+		stat = readl_relaxed(&mhu->recv->ch_wn[priv->ch_wn_idx].stat_masked);
+
+		if (stat) {
+			if (IS_PROTOCOL_DOORBELL(priv))
+				i += __builtin_ctz(stat);
+			return &chans[i];
+		}
+
+		i += IS_PROTOCOL_DOORBELL(priv) ?
MHUV2_STAT_BITS : 1; + } + + return ERR_PTR(-EIO); +} + +static struct mbox_chan *get_irq_chan_rx(struct mhuv2 *mhu) +{ + if (!mhu->minor) + return get_irq_chan_stat_rx(mhu); + + return get_irq_chan_comb_rx(mhu); +} + +static irqreturn_t mhuv2_receiver_interrupt(int irq, void *arg) +{ + struct mhuv2 *mhu = arg; + struct mbox_chan *chan = get_irq_chan_rx(mhu); + struct device *dev = mhu->mbox.dev; + struct mhuv2_mbox_chan_priv *priv; + int ret = IRQ_NONE; + void *data; + + if (IS_ERR(chan)) { + dev_warn(dev, "Failed to find channel for the rx interrupt\n"); + return IRQ_NONE; + } + priv = chan->con_priv; + + /* Read and clear the data first */ + data = priv->ops->read_data(mhu, chan); + + if (!chan->cl) { + dev_warn(dev, "Received data on channel (%u) not currently attached to a mailbox client\n", + priv->ch_wn_idx); + } else if (IS_ERR(data)) { + dev_err(dev, "Failed to read data: %lu\n", PTR_ERR(data)); + } else { + mbox_chan_received_data(chan, data); + ret = IRQ_HANDLED; + } + + kfree(data); + return ret; +} + +/* Sender and receiver ops */ +static bool mhuv2_sender_last_tx_done(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + return priv->ops->last_tx_done(mhu, chan); +} + +static int mhuv2_sender_send_data(struct mbox_chan *chan, void *data) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + if (!priv->ops->last_tx_done(mhu, chan)) + return -EBUSY; + + return priv->ops->send_data(mhu, chan, data); +} + +static int mhuv2_sender_startup(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + if (priv->ops->tx_startup) + priv->ops->tx_startup(mhu, chan); + return 0; +} + +static void mhuv2_sender_shutdown(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + if (priv->ops->tx_shutdown) + priv->ops->tx_shutdown(mhu, chan); +} + +static const struct mbox_chan_ops mhuv2_sender_ops = { + .send_data = mhuv2_sender_send_data, + .startup = mhuv2_sender_startup, + .shutdown = mhuv2_sender_shutdown, + .last_tx_done = mhuv2_sender_last_tx_done, +}; + +static int mhuv2_receiver_startup(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + return priv->ops->rx_startup(mhu, chan); +} + +static void mhuv2_receiver_shutdown(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + priv->ops->rx_shutdown(mhu, chan); +} + +static int mhuv2_receiver_send_data(struct mbox_chan *chan, void *data) +{ + dev_err(chan->mbox->dev, + "Trying to transmit on a receiver MHU frame\n"); + return -EIO; +} + +static bool mhuv2_receiver_last_tx_done(struct mbox_chan *chan) +{ + dev_err(chan->mbox->dev, "Trying to Tx poll on a receiver MHU frame\n"); + return true; +} + +static const struct mbox_chan_ops mhuv2_receiver_ops = { + .send_data = mhuv2_receiver_send_data, + .startup = mhuv2_receiver_startup, + .shutdown = mhuv2_receiver_shutdown, + .last_tx_done = mhuv2_receiver_last_tx_done, +}; + +static struct mbox_chan *mhuv2_mbox_of_xlate(struct mbox_controller *mbox, + const struct of_phandle_args *pa) +{ + struct mhuv2 *mhu = mhu_from_mbox(mbox); + struct mbox_chan *chans = mbox->chans; + int channel = 0, i, offset, doorbell, protocol, windows; + + if (pa->args_count != 
2) + return ERR_PTR(-EINVAL); + + offset = pa->args[0]; + doorbell = pa->args[1]; + if (doorbell >= MHUV2_STAT_BITS) + goto out; + + for (i = 0; i < mhu->length; i += 2) { + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (protocol == DOORBELL) { + if (offset < windows) + return &chans[channel + MHUV2_STAT_BITS * offset + doorbell]; + + channel += MHUV2_STAT_BITS * windows; + offset -= windows; + } else { + if (offset == 0) { + if (doorbell) + goto out; + + return &chans[channel]; + } + + channel++; + offset--; + } + } + +out: + dev_err(mbox->dev, "Couldn't xlate to a valid channel (%d: %d)\n", + pa->args[0], doorbell); + return ERR_PTR(-ENODEV); +} + +static int mhuv2_verify_protocol(struct mhuv2 *mhu) +{ + struct device *dev = mhu->mbox.dev; + int protocol, windows, channels = 0, total_windows = 0, i; + + for (i = 0; i < mhu->length; i += 2) { + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (!windows) { + dev_err(dev, "Window size can't be zero (%d)\n", i); + return -EINVAL; + } + total_windows += windows; + + if (protocol == DOORBELL) { + channels += MHUV2_STAT_BITS * windows; + } else if (protocol == DATA_TRANSFER) { + channels++; + } else { + dev_err(dev, "Invalid protocol (%d) present in %s property at index %d\n", + protocol, MHUV2_PROTOCOL_PROP, i); + return -EINVAL; + } + } + + if (total_windows > mhu->windows) { + dev_err(dev, "Channel windows can't be more than what's implemented by the hardware ( %d: %d)\n", + total_windows, mhu->windows); + return -EINVAL; + } + + mhu->mbox.num_chans = channels; + return 0; +} + +static int mhuv2_allocate_channels(struct mhuv2 *mhu) +{ + struct mbox_controller *mbox = &mhu->mbox; + struct mhuv2_mbox_chan_priv *priv; + struct device *dev = mbox->dev; + struct mbox_chan *chans; + int protocol, windows = 0, next_window = 0, i, j, k; + + chans = devm_kcalloc(dev, mbox->num_chans, sizeof(*chans), GFP_KERNEL); + if (!chans) + return -ENOMEM; + + mbox->chans = chans; + + for (i = 0; i < mhu->length; i += 2) { + next_window += windows; + + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (protocol == DATA_TRANSFER) { + priv = devm_kmalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->ch_wn_idx = next_window; + priv->ops = &mhuv2_data_transfer_ops; + priv->windows = windows; + chans++->con_priv = priv; + continue; + } + + for (j = 0; j < windows; j++) { + for (k = 0; k < MHUV2_STAT_BITS; k++) { + priv = devm_kmalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->ch_wn_idx = next_window + j; + priv->ops = &mhuv2_doorbell_ops; + priv->doorbell = k; + chans++->con_priv = priv; + } + + /* + * Permanently enable interrupt as we can't + * control it per doorbell. 
+ */ + if (mhu->frame == SENDER_FRAME && mhu->minor) + writel_relaxed(0x1, &mhu->send->ch_wn[priv->ch_wn_idx].int_en); + } + } + + /* Make sure we have initialized all channels */ + BUG_ON(chans - mbox->chans != mbox->num_chans); + + return 0; +} + +static int mhuv2_parse_channels(struct mhuv2 *mhu) +{ + struct device *dev = mhu->mbox.dev; + const struct device_node *np = dev->of_node; + int ret, count; + u32 *protocols; + + count = of_property_count_u32_elems(np, MHUV2_PROTOCOL_PROP); + if (count <= 0 || count % 2) { + dev_err(dev, "Invalid %s property (%d)\n", MHUV2_PROTOCOL_PROP, + count); + return -EINVAL; + } + + protocols = devm_kmalloc_array(dev, count, sizeof(*protocols), GFP_KERNEL); + if (!protocols) + return -ENOMEM; + + ret = of_property_read_u32_array(np, MHUV2_PROTOCOL_PROP, protocols, count); + if (ret) { + dev_err(dev, "Failed to read %s property: %d\n", + MHUV2_PROTOCOL_PROP, ret); + return ret; + } + + mhu->protocols = protocols; + mhu->length = count; + + ret = mhuv2_verify_protocol(mhu); + if (ret) + return ret; + + return mhuv2_allocate_channels(mhu); +} + +static int mhuv2_tx_init(struct amba_device *adev, struct mhuv2 *mhu, + void __iomem *reg) +{ + struct device *dev = mhu->mbox.dev; + int ret, i; + + mhu->frame = SENDER_FRAME; + mhu->mbox.ops = &mhuv2_sender_ops; + mhu->send = reg; + + mhu->windows = readl_relaxed_bitfield(&mhu->send->mhu_cfg, num_ch); + mhu->minor = readl_relaxed_bitfield(&mhu->send->aidr, arch_minor_rev); + + spin_lock_init(&mhu->doorbell_pending_lock); + + /* + * For minor version 1 and forward, tx interrupt is provided by + * the controller. + */ + if (mhu->minor && adev->irq[0]) { + ret = devm_request_threaded_irq(dev, adev->irq[0], NULL, + mhuv2_sender_interrupt, + IRQF_ONESHOT, "mhuv2-tx", mhu); + if (ret) { + dev_err(dev, "Failed to request tx IRQ, fallback to polling mode: %d\n", + ret); + } else { + mhu->mbox.txdone_irq = true; + mhu->mbox.txdone_poll = false; + mhu->irq = adev->irq[0]; + + writel_relaxed_bitfield(1, &mhu->send->int_en, chcomb); + + /* Disable all channel interrupts */ + for (i = 0; i < mhu->windows; i++) + writel_relaxed(0x0, &mhu->send->ch_wn[i].int_en); + + goto out; + } + } + + mhu->mbox.txdone_irq = false; + mhu->mbox.txdone_poll = true; + mhu->mbox.txpoll_period = 1; + +out: + /* Wait for receiver to be ready */ + writel_relaxed(0x1, &mhu->send->access_request); + while (!readl_relaxed(&mhu->send->access_ready)) + continue; + + return 0; +} + +static int mhuv2_rx_init(struct amba_device *adev, struct mhuv2 *mhu, + void __iomem *reg) +{ + struct device *dev = mhu->mbox.dev; + int ret, i; + + mhu->frame = RECEIVER_FRAME; + mhu->mbox.ops = &mhuv2_receiver_ops; + mhu->recv = reg; + + mhu->windows = readl_relaxed_bitfield(&mhu->recv->mhu_cfg, num_ch); + mhu->minor = readl_relaxed_bitfield(&mhu->recv->aidr, arch_minor_rev); + + mhu->irq = adev->irq[0]; + if (!mhu->irq) { + dev_err(dev, "Missing receiver IRQ\n"); + return -EINVAL; + } + + ret = devm_request_threaded_irq(dev, mhu->irq, NULL, + mhuv2_receiver_interrupt, IRQF_ONESHOT, + "mhuv2-rx", mhu); + if (ret) { + dev_err(dev, "Failed to request rx IRQ\n"); + return ret; + } + + /* Mask all the channel windows */ + for (i = 0; i < mhu->windows; i++) + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_set); + + if (mhu->minor) + writel_relaxed_bitfield(1, &mhu->recv->int_en, chcomb); + + return 0; +} + +static int mhuv2_probe(struct amba_device *adev, const struct amba_id *id) +{ + struct device *dev = &adev->dev; + const struct device_node *np = dev->of_node; + struct 
mhuv2 *mhu; + void __iomem *reg; + int ret = -EINVAL; + + reg = devm_of_iomap(dev, dev->of_node, 0, NULL); + if (!reg) + return -ENOMEM; + + mhu = devm_kzalloc(dev, sizeof(*mhu), GFP_KERNEL); + if (!mhu) + return -ENOMEM; + + mhu->mbox.dev = dev; + mhu->mbox.of_xlate = mhuv2_mbox_of_xlate; + + if (of_device_is_compatible(np, "arm,mhuv2-tx")) + ret = mhuv2_tx_init(adev, mhu, reg); + else if (of_device_is_compatible(np, "arm,mhuv2-rx")) + ret = mhuv2_rx_init(adev, mhu, reg); + else + dev_err(dev, "Invalid compatible property\n"); + + if (ret) + return ret; + + /* Channel windows can't be 0 */ + BUG_ON(!mhu->windows); + + ret = mhuv2_parse_channels(mhu); + if (ret) + return ret; + + amba_set_drvdata(adev, mhu); + + ret = devm_mbox_controller_register(dev, &mhu->mbox); + if (ret) + dev_err(dev, "failed to register ARM MHUv2 driver %d\n", ret); + + return ret; +} + +static int mhuv2_remove(struct amba_device *adev) +{ + struct mhuv2 *mhu = amba_get_drvdata(adev); + + if (mhu->frame == SENDER_FRAME) + writel_relaxed(0x0, &mhu->send->access_request); + + return 0; +} + +static struct amba_id mhuv2_ids[] = { + { + /* 2.0 */ + .id = 0xbb0d1, + .mask = 0xfffff, + }, + { + /* 2.1 */ + .id = 0xbb076, + .mask = 0xfffff, + }, + { 0, 0 }, +}; +MODULE_DEVICE_TABLE(amba, mhuv2_ids); + +static struct amba_driver mhuv2_driver = { + .drv = { + .name = "arm-mhuv2", + }, + .id_table = mhuv2_ids, + .probe = mhuv2_probe, + .remove = mhuv2_remove, +}; +module_amba_driver(mhuv2_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("ARM MHUv2 Driver"); +MODULE_AUTHOR("Viresh Kumar "); +MODULE_AUTHOR("Tushar Khandelwal "); diff --git a/include/linux/mailbox/arm_mhuv2_message.h b/include/linux/mailbox/arm_mhuv2_message.h new file mode 100644 index 000000000000..821b9d96daa4 --- /dev/null +++ b/include/linux/mailbox/arm_mhuv2_message.h @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM MHUv2 Mailbox Message + * + * Copyright (C) 2020 Arm Ltd. + * Copyright (C) 2020 Linaro Ltd. + */ + +#ifndef _LINUX_ARM_MHUV2_MESSAGE_H_ +#define _LINUX_ARM_MHUV2_MESSAGE_H_ + +#include + +/* Data structure for data-transfer protocol */ +struct arm_mhuv2_mbox_msg { + void *data; + size_t len; +}; + +#endif /* _LINUX_ARM_MHUV2_MESSAGE_H_ */ -- cgit From d40c2d4ed62df64ce603c208bceff25245380157 Mon Sep 17 00:00:00 2001 From: Hsin-Hsiung Wang Date: Wed, 9 Dec 2020 18:33:43 -0800 Subject: spmi: Add driver shutdown support Add new shutdown() method. Use it in the standard driver model style. 
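As an illustration only, here is a minimal sketch of how a driver might
wire up the new callback. The foo_* names are hypothetical and not part
of this patch; only the .shutdown hook and its signature come from the
change below:

	#include <linux/spmi.h>

	static int foo_probe(struct spmi_device *sdev)
	{
		/* Normal devm-based setup would go here. */
		return 0;
	}

	static void foo_shutdown(struct spmi_device *sdev)
	{
		/* Quiesce the device so it is in a known state across reboot. */
	}

	static struct spmi_driver foo_driver = {
		.probe		= foo_probe,
		.shutdown	= foo_shutdown,
		.driver		= {
			.name	= "foo",
		},
	};
	module_spmi_driver(foo_driver);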
Link: https://lore.kernel.org/r/1603187810-30481-2-git-send-email-hsin-hsiung.wang@mediatek.com Signed-off-by: Hsin-Hsiung Wang Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20201210023344.2838141-4-sboyd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/spmi/spmi.c | 9 +++++++++ include/linux/spmi.h | 1 + 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/spmi/spmi.c b/drivers/spmi/spmi.c index 253340e10dab..51f5aeb65b3b 100644 --- a/drivers/spmi/spmi.c +++ b/drivers/spmi/spmi.c @@ -359,6 +359,14 @@ static int spmi_drv_remove(struct device *dev) return 0; } +static void spmi_drv_shutdown(struct device *dev) +{ + const struct spmi_driver *sdrv = to_spmi_driver(dev->driver); + + if (sdrv && sdrv->shutdown) + sdrv->shutdown(to_spmi_device(dev)); +} + static int spmi_drv_uevent(struct device *dev, struct kobj_uevent_env *env) { int ret; @@ -375,6 +383,7 @@ static struct bus_type spmi_bus_type = { .match = spmi_device_match, .probe = spmi_drv_probe, .remove = spmi_drv_remove, + .shutdown = spmi_drv_shutdown, .uevent = spmi_drv_uevent, }; diff --git a/include/linux/spmi.h b/include/linux/spmi.h index 394a3f68bad5..729bcbf9f5ad 100644 --- a/include/linux/spmi.h +++ b/include/linux/spmi.h @@ -138,6 +138,7 @@ struct spmi_driver { struct device_driver driver; int (*probe)(struct spmi_device *sdev); void (*remove)(struct spmi_device *sdev); + void (*shutdown)(struct spmi_device *sdev); }; static inline struct spmi_driver *to_spmi_driver(struct device_driver *d) -- cgit From 1c12c27086dcef853832a7cbebcb48bdac8104b6 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Wed, 25 Nov 2020 10:31:06 +0100 Subject: siox: Make remove callback return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver core ignores the return value of the remove callback, so don't give siox drivers the chance to provide a value. All siox drivers only allocate devm-managed resources in .probe, so there is no .remove callback to fix. 
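For reference, a hypothetical siox driver that does need a remove
callback would now declare it as returning void; the foo_* names below
are illustrative only, and only the callback signature comes from this
patch:

	#include <linux/siox.h>

	static void foo_remove(struct siox_device *sdevice)
	{
		/* Nothing to report back; the core ignores any status anyway. */
		struct foo_priv *priv = dev_get_drvdata(&sdevice->dev);

		foo_stop(priv);
	}

	static struct siox_driver foo_driver = {
		.probe	= foo_probe,
		.remove	= foo_remove,
		/* ... */
	};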
Tested-by: Thorsten Scherer
Acked-by: Thorsten Scherer
Signed-off-by: Uwe Kleine-König
Link: https://lore.kernel.org/r/20201125093106.240643-3-u.kleine-koenig@pengutronix.de
Signed-off-by: Greg Kroah-Hartman
---
 drivers/siox/siox-core.c | 5 ++---
 include/linux/siox.h     | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/siox/siox-core.c b/drivers/siox/siox-core.c
index b56cdcb52967..1794ff0106bc 100644
--- a/drivers/siox/siox-core.c
+++ b/drivers/siox/siox-core.c
@@ -525,12 +525,11 @@ static int siox_remove(struct device *dev)
 	struct siox_driver *sdriver =
 		container_of(dev->driver, struct siox_driver, driver);
 	struct siox_device *sdevice = to_siox_device(dev);
-	int ret = 0;

 	if (sdriver->remove)
-		ret = sdriver->remove(sdevice);
+		sdriver->remove(sdevice);

-	return ret;
+	return 0;
 }

 static void siox_shutdown(struct device *dev)
diff --git a/include/linux/siox.h b/include/linux/siox.h
index da7225bf1877..6bfbda3f634c 100644
--- a/include/linux/siox.h
+++ b/include/linux/siox.h
@@ -36,7 +36,7 @@ bool siox_device_connected(struct siox_device *sdevice);

 struct siox_driver {
 	int (*probe)(struct siox_device *sdevice);
-	int (*remove)(struct siox_device *sdevice);
+	void (*remove)(struct siox_device *sdevice);
 	void (*shutdown)(struct siox_device *sdevice);

 	/*
-- cgit

From 0aec2da436623abe19b80b21dd9fc5ec9300a152 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Wed, 9 Dec 2020 22:36:38 +0200
Subject: driver core: platform: Introduce platform_get_mem_or_io()

There are at least a few existing users of the proposed API, which
retrieves either a MEM or an IO resource from a platform device. Make
it a common helper that existing and new users can utilize.

Cc: Eric Auger
Cc: Alex Williamson
Cc: kvm@vger.kernel.org
Cc: linux-usb@vger.kernel.org
Cc: Peng Hao
Cc: Arnd Bergmann
Reviewed-by: Cornelia Huck
Signed-off-by: Andy Shevchenko
Link: https://lore.kernel.org/r/20201209203642.27648-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman
---
 drivers/base/platform.c         | 15 +++++++++++++++
 include/linux/platform_device.h |  3 +++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 8ad06daa2eaa..0358dc3ea3ad 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -63,6 +63,21 @@ struct resource *platform_get_resource(struct platform_device *dev,
 }
 EXPORT_SYMBOL_GPL(platform_get_resource);

+struct resource *platform_get_mem_or_io(struct platform_device *dev,
+					unsigned int num)
+{
+	u32 i;
+
+	for (i = 0; i < dev->num_resources; i++) {
+		struct resource *r = &dev->resource[i];
+
+		if ((resource_type(r) & (IORESOURCE_MEM|IORESOURCE_IO)) && num-- == 0)
+			return r;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(platform_get_mem_or_io);
+
 #ifdef CONFIG_HAS_IOMEM
 /**
  * devm_platform_get_and_ioremap_resource - call devm_ioremap_resource() for a
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 77a2aada106d..ee6a9f10c2c7 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -52,6 +52,9 @@ extern struct device platform_bus;
 extern struct resource *platform_get_resource(struct platform_device *,
 					      unsigned int, unsigned int);

+extern struct resource *platform_get_mem_or_io(struct platform_device *,
+					       unsigned int);
+
 extern struct device *
 platform_find_device_by_driver(struct device *start,
 			       const struct device_driver *drv);
-- cgit

From 1f702603e7125a390b5cdf5ce00539781cfcc86a Mon Sep 17 00:00:00 2001
From: "Eric W.
Biederman" Date: Fri, 20 Nov 2020 17:14:19 -0600 Subject: exec: Simplify unshare_files Now that exec no longer needs to return the unshared files to their previous value there is no reason to return displaced. Instead when unshare_fd creates a copy of the file table, call put_files_struct before returning from unshare_files. Acked-by: Christian Brauner v1: https://lkml.kernel.org/r/20200817220425.9389-2-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-2-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- fs/coredump.c | 5 +---- fs/exec.c | 5 +---- include/linux/fdtable.h | 2 +- kernel/fork.c | 12 ++++++------ 4 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/coredump.c b/fs/coredump.c index 0cd9056d79cc..abf807235262 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -585,7 +585,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) int ispipe; size_t *argv = NULL; int argc = 0; - struct files_struct *displaced; /* require nonrelative corefile path and be extra careful */ bool need_suid_safe = false; bool core_dumped = false; @@ -791,11 +790,9 @@ void do_coredump(const kernel_siginfo_t *siginfo) } /* get us an unshared descriptor table; almost always a no-op */ - retval = unshare_files(&displaced); + retval = unshare_files(); if (retval) goto close_fail; - if (displaced) - put_files_struct(displaced); if (!dump_interrupted()) { /* * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would diff --git a/fs/exec.c b/fs/exec.c index 0d6533ab1c97..48fa4fc1b116 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1238,7 +1238,6 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; - struct files_struct *displaced; int retval; /* Once we are committed compute the creds */ @@ -1259,11 +1258,9 @@ int begin_new_exec(struct linux_binprm * bprm) goto out; /* Ensure the files table is not shared. */ - retval = unshare_files(&displaced); + retval = unshare_files(); if (retval) goto out; - if (displaced) - put_files_struct(displaced); /* * Must be called _before_ exec_mmap() as bprm->mm is diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index a32bf47c593e..f46a084b60b2 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -109,7 +109,7 @@ struct task_struct; struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); -int unshare_files(struct files_struct **); +int unshare_files(void); struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..837b546528c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -3023,21 +3023,21 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) * the exec layer of the kernel. 
 */
-int unshare_files(struct files_struct **displaced)
+int unshare_files(void)
 {
 	struct task_struct *task = current;
-	struct files_struct *copy = NULL;
+	struct files_struct *old, *copy = NULL;
 	int error;

 	error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
-	if (error || !copy) {
-		*displaced = NULL;
+	if (error || !copy)
 		return error;
-	}
-	*displaced = task->files;
+
+	old = task->files;
 	task_lock(task);
 	task->files = copy;
 	task_unlock(task);
+	put_files_struct(old);
 	return 0;
 }
-- cgit

From 950db38ff2c01b7aabbd7ab4a50b7992750fa63d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Fri, 20 Nov 2020 17:14:20 -0600
Subject: exec: Remove reset_files_struct

Now that exec no longer needs to restore the previous value of
current->files on error, there are no more callers of
reset_files_struct, so remove it.

Acked-by: Christian Brauner
v1: https://lkml.kernel.org/r/20200817220425.9389-3-ebiederm@xmission.com
Link: https://lkml.kernel.org/r/20201120231441.29911-3-ebiederm@xmission.com
Signed-off-by: Eric W. Biederman
---
 fs/file.c               | 12 ------------
 include/linux/fdtable.h |  1 -
 2 files changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file.c b/fs/file.c
index 4559b5fec3bd..beae7c55c84c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -436,18 +436,6 @@ void put_files_struct(struct files_struct *files)
 	}
 }

-void reset_files_struct(struct files_struct *files)
-{
-	struct task_struct *tsk = current;
-	struct files_struct *old;
-
-	old = tsk->files;
-	task_lock(tsk);
-	tsk->files = files;
-	task_unlock(tsk);
-	put_files_struct(old);
-}
-
 void exit_files(struct task_struct *tsk)
 {
 	struct files_struct * files = tsk->files;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index f46a084b60b2..7cc9885044d9 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -108,7 +108,6 @@ struct task_struct;

 struct files_struct *get_files_struct(struct task_struct *);
 void put_files_struct(struct files_struct *fs);
-void reset_files_struct(struct files_struct *);
 int unshare_files(void);
 struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
 void do_close_on_exec(struct files_struct *);
 int iterate_fd(struct files_struct *, unsigned,
-- cgit

From bebf684bf330915e6c96313ad7db89a5480fc9c2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Fri, 20 Nov 2020 17:14:24 -0600
Subject: file: Rename __fcheck_files to files_lookup_fd_raw

The function fcheck, despite its comment, is poorly named, as it has no
callers that only check its return value. All of fcheck's callers use
the returned struct file. The same is true for fcheck_files and
__fcheck_files. A new, less confusing name is needed.

In addition, the names of these functions are confusing, as they do not
indicate which locks need to be held when these functions are called,
making them error prone to use.

To remedy this, I am making the base function name lookup_fd and will
add prefixes and suffixes to indicate the rest of the context.

Name the function (previously called __fcheck_files) that proceeds from
a struct files_struct, looks up the struct file of a file descriptor,
and requires its callers to verify that all of the appropriate locks
are held, files_lookup_fd_raw.

The need for better names became apparent in the last round of
discussion of this set of changes[1].

[1] https://lkml.kernel.org/r/CAHk-=wj8BQbgJFLa+J0e=iT-1qpmCRTbPAJ8gd6MJQ=kbRPqyQ@mail.gmail.com
Link: https://lkml.kernel.org/r/20201120231441.29911-7-ebiederm@xmission.com
Signed-off-by: Eric W.
Biederman --- fs/file.c | 2 +- include/linux/fdtable.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index beae7c55c84c..b5591efb87f5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -887,7 +887,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) struct file *file; if (atomic_read(&files->count) == 1) { - file = __fcheck_files(files, fd); + file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; return (unsigned long)file; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 7cc9885044d9..639933f37da9 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -80,7 +80,7 @@ struct dentry; /* * The caller must ensure that fd table isn't shared or hold rcu or file lock */ -static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd) +static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); @@ -96,7 +96,7 @@ static inline struct file *fcheck_files(struct files_struct *files, unsigned int RCU_LOCKDEP_WARN(!rcu_read_lock_held() && !lockdep_is_held(&files->file_lock), "suspicious rcu_dereference_check() usage"); - return __fcheck_files(files, fd); + return files_lookup_fd_raw(files, fd); } /* -- cgit From 120ce2b0cd52abe73e8b16c23461eb14df5a87d8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:25 -0600 Subject: file: Factor files_lookup_fd_locked out of fcheck_files To make it easy to tell where files->file_lock protection is being used when looking up a file create files_lookup_fd_locked. Only allow this function to be called with the file_lock held. Update the callers of fcheck and fcheck_files that are called with the files->file_lock held to call files_lookup_fd_locked instead. Hopefully this makes it easier to quickly understand what is going on. The need for better names became apparent in the last round of discussion of this set of changes[1]. [1] https://lkml.kernel.org/r/CAHk-=wj8BQbgJFLa+J0e=iT-1qpmCRTbPAJ8gd6MJQ=kbRPqyQ@mail.gmail.com Link: https://lkml.kernel.org/r/20201120231441.29911-8-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- fs/file.c | 2 +- fs/locks.c | 14 ++++++++------ fs/proc/fd.c | 2 +- include/linux/fdtable.h | 7 +++++++ 4 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index b5591efb87f5..9d0e91168be1 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1098,7 +1098,7 @@ static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) spin_lock(&files->file_lock); err = expand_files(files, newfd); - file = fcheck(oldfd); + file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { diff --git a/fs/locks.c b/fs/locks.c index 1f84a03601fe..148197c1b547 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2539,14 +2539,15 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, */ if (!error && file_lock->fl_type != F_UNLCK && !(file_lock->fl_flags & FL_OFDLCK)) { + struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in * close(). rcu_read_lock() wouldn't do. 
 */
-		spin_lock(&current->files->file_lock);
-		f = fcheck(fd);
-		spin_unlock(&current->files->file_lock);
+		spin_lock(&files->file_lock);
+		f = files_lookup_fd_locked(files, fd);
+		spin_unlock(&files->file_lock);
 		if (f != filp) {
 			file_lock->fl_type = F_UNLCK;
 			error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2670,14 +2671,15 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 	 */
 	if (!error && file_lock->fl_type != F_UNLCK &&
 	    !(file_lock->fl_flags & FL_OFDLCK)) {
+		struct files_struct *files = current->files;
 		/*
 		 * We need that spin_lock here - it prevents reordering between
 		 * update of i_flctx->flc_posix and check for it done in
 		 * close(). rcu_read_lock() wouldn't do.
 		 */
-		spin_lock(&current->files->file_lock);
-		f = fcheck(fd);
-		spin_unlock(&current->files->file_lock);
+		spin_lock(&files->file_lock);
+		f = files_lookup_fd_locked(files, fd);
+		spin_unlock(&files->file_lock);
 		if (f != filp) {
 			file_lock->fl_type = F_UNLCK;
 			error = do_lock_file_wait(filp, cmd, file_lock);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d58960f6ee52..2cca9bca3b3a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -35,7 +35,7 @@ static int seq_show(struct seq_file *m, void *v)
 	unsigned int fd = proc_fd(m->private);

 	spin_lock(&files->file_lock);
-	file = fcheck_files(files, fd);
+	file = files_lookup_fd_locked(files, fd);
 	if (file) {
 		struct fdtable *fdt = files_fdtable(files);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 639933f37da9..fda4b81dd735 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -91,6 +91,13 @@ static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsig
 	return NULL;
 }

+static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
+{
+	RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock),
+			 "suspicious rcu_dereference_check() usage");
+	return files_lookup_fd_raw(files, fd);
+}
+
 static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd)
 {
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
-- cgit

From f36c2943274199cb8aef32ac96531ffb7c4b43d0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Fri, 20 Nov 2020 17:14:26 -0600
Subject: file: Replace fcheck_files with files_lookup_fd_rcu

This change renames fcheck_files to files_lookup_fd_rcu. All of the
remaining callers take the rcu_read_lock before calling this function,
so the _rcu suffix is appropriate. This change also tightens up the
debug check to verify that all callers hold the rcu_read_lock.

All callers that used to call fcheck_files with the files->file_lock
held have now been changed to call files_lookup_fd_locked.

This change of name has helped remind me of which locks and which
guarantees are in place, helping me to catch bugs later in the
patchset.

The need for better names became apparent in the last round of
discussion of this set of changes[1].

[1] https://lkml.kernel.org/r/CAHk-=wj8BQbgJFLa+J0e=iT-1qpmCRTbPAJ8gd6MJQ=kbRPqyQ@mail.gmail.com
Link: https://lkml.kernel.org/r/20201120231441.29911-9-ebiederm@xmission.com
Signed-off-by: Eric W.
Biederman --- Documentation/filesystems/files.rst | 6 +++--- fs/file.c | 4 ++-- fs/proc/fd.c | 4 ++-- include/linux/fdtable.h | 7 +++---- kernel/bpf/task_iter.c | 2 +- kernel/kcmp.c | 2 +- 6 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst index cbf8e57376bf..ea75acdb632c 100644 --- a/Documentation/filesystems/files.rst +++ b/Documentation/filesystems/files.rst @@ -62,7 +62,7 @@ the fdtable structure - be held. 4. To look up the file structure given an fd, a reader - must use either fcheck() or fcheck_files() APIs. These + must use either fcheck() or files_lookup_fd_rcu() APIs. These take care of barrier requirements due to lock-free lookup. An example:: @@ -84,7 +84,7 @@ the fdtable structure - on ->f_count:: rcu_read_lock(); - file = fcheck_files(files, fd); + file = files_lookup_fd_rcu(files, fd); if (file) { if (atomic_long_inc_not_zero(&file->f_count)) *fput_needed = 1; @@ -104,7 +104,7 @@ the fdtable structure - lock-free, they must be installed using rcu_assign_pointer() API. If they are looked up lock-free, rcu_dereference() must be used. However it is advisable to use files_fdtable() - and fcheck()/fcheck_files() which take care of these issues. + and fcheck()/files_lookup_fd_rcu() which take care of these issues. 7. While updating, the fdtable pointer must be looked up while holding files->file_lock. If ->file_lock is dropped, then diff --git a/fs/file.c b/fs/file.c index 9d0e91168be1..5861c4f89419 100644 --- a/fs/file.c +++ b/fs/file.c @@ -814,7 +814,7 @@ static struct file *__fget_files(struct files_struct *files, unsigned int fd, rcu_read_lock(); loop: - file = fcheck_files(files, fd); + file = files_lookup_fd_rcu(files, fd); if (file) { /* File object ref couldn't be taken. 
 * dup2() atomicity guarantee is the reason
@@ -1127,7 +1127,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 	int retval = oldfd;

 	rcu_read_lock();
-	if (!fcheck_files(files, oldfd))
+	if (!files_lookup_fd_rcu(files, oldfd))
 		retval = -EBADF;
 	rcu_read_unlock();
 	return retval;
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 2cca9bca3b3a..3dec44d7c5c5 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -90,7 +90,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
 		return false;

 	rcu_read_lock();
-	file = fcheck_files(files, fd);
+	file = files_lookup_fd_rcu(files, fd);
 	if (file)
 		*mode = file->f_mode;
 	rcu_read_unlock();
@@ -243,7 +243,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 		char name[10 + 1];
 		unsigned int len;

-		f = fcheck_files(files, fd);
+		f = files_lookup_fd_rcu(files, fd);
 		if (!f)
 			continue;
 		data.mode = f->f_mode;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index fda4b81dd735..fa8c402a7790 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -98,10 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
 	return files_lookup_fd_raw(files, fd);
 }

-static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd)
+static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
 {
-	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
-			 !lockdep_is_held(&files->file_lock),
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
 			 "suspicious rcu_dereference_check() usage");
 	return files_lookup_fd_raw(files, fd);
 }
@@ -109,7 +108,7 @@ static inline struct file *fcheck_files(struct files_struct *files, unsigned int
 /*
  * Check whether the specified fd has an open file.
  */
-#define fcheck(fd)	fcheck_files(current->files, fd)
+#define fcheck(fd)	files_lookup_fd_rcu(current->files, fd)

 struct task_struct;
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 5b6af30bfbcd..5ab2ccfb96cb 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -183,7 +183,7 @@ again:
 	for (; curr_fd < max_fds; curr_fd++) {
 		struct file *f;

-		f = fcheck_files(curr_files, curr_fd);
+		f = files_lookup_fd_rcu(curr_files, curr_fd);
 		if (!f)
 			continue;
 		if (!get_file_rcu(f))
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 87c48c0104ad..990717c1aed3 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -67,7 +67,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)

 	rcu_read_lock();
 	if (task->files)
-		file = fcheck_files(task->files, idx);
+		file = files_lookup_fd_rcu(task->files, idx);
 	rcu_read_unlock();
 	task_unlock(task);
-- cgit

From 460b4f812a9d473d4b39d87d37844f9fc30a9eb3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Fri, 20 Nov 2020 17:14:27 -0600
Subject: file: Rename fcheck to lookup_fd_rcu

Also remove the confusing comment about checking if an fd exists. I
could not find one instance in the entire kernel that still matches
the description or the reason for the name fcheck.

The need for better names became apparent in the last round of
discussion of this set of changes[1].

[1] https://lkml.kernel.org/r/CAHk-=wj8BQbgJFLa+J0e=iT-1qpmCRTbPAJ8gd6MJQ=kbRPqyQ@mail.gmail.com
Link: https://lkml.kernel.org/r/20201120231441.29911-10-ebiederm@xmission.com
Signed-off-by: Eric W.
Biederman --- Documentation/filesystems/files.rst | 6 +++--- arch/powerpc/platforms/cell/spufs/coredump.c | 2 +- fs/notify/dnotify/dnotify.c | 2 +- include/linux/fdtable.h | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst index ea75acdb632c..bcf84459917f 100644 --- a/Documentation/filesystems/files.rst +++ b/Documentation/filesystems/files.rst @@ -62,7 +62,7 @@ the fdtable structure - be held. 4. To look up the file structure given an fd, a reader - must use either fcheck() or files_lookup_fd_rcu() APIs. These + must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These take care of barrier requirements due to lock-free lookup. An example:: @@ -70,7 +70,7 @@ the fdtable structure - struct file *file; rcu_read_lock(); - file = fcheck(fd); + file = lookup_fd_rcu(fd); if (file) { ... } @@ -104,7 +104,7 @@ the fdtable structure - lock-free, they must be installed using rcu_assign_pointer() API. If they are looked up lock-free, rcu_dereference() must be used. However it is advisable to use files_fdtable() - and fcheck()/files_lookup_fd_rcu() which take care of these issues. + and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues. 7. While updating, the fdtable pointer must be looked up while holding files->file_lock. If ->file_lock is dropped, then diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index 026c181a98c5..60b5583e9eaf 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -74,7 +74,7 @@ static struct spu_context *coredump_next_context(int *fd) *fd = n - 1; rcu_read_lock(); - file = fcheck(*fd); + file = lookup_fd_rcu(*fd); ctx = SPUFS_I(file_inode(file))->i_ctx; get_spu_context(ctx); rcu_read_unlock(); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5dcda8f20c04..5486aaca60b0 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -327,7 +327,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) } rcu_read_lock(); - f = fcheck(fd); + f = lookup_fd_rcu(fd); rcu_read_unlock(); /* if (f != filp) means that we lost a race and another task/thread diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index fa8c402a7790..2a4a8fed536e 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -105,10 +105,10 @@ static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsig return files_lookup_fd_raw(files, fd); } -/* - * Check whether the specified fd has an open file. - */ -#define fcheck(fd) files_lookup_fd_rcu(current->files, fd) +static inline struct file *lookup_fd_rcu(unsigned int fd) +{ + return files_lookup_fd_rcu(current->files, fd); +} struct task_struct; -- cgit From 3a879fb38082125cc0d8aa89b70c7f3a7cdf584b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:28 -0600 Subject: file: Implement task_lookup_fd_rcu As a companion to lookup_fd_rcu implement task_lookup_fd_rcu for querying an arbitrary process about a specific file. Acked-by: Christian Brauner v1: https://lkml.kernel.org/r/20200818103713.aw46m7vprsy4vlve@wittgenstein Link: https://lkml.kernel.org/r/20201120231441.29911-11-ebiederm@xmission.com Signed-off-by: Eric W. 
Biederman --- fs/file.c | 15 +++++++++++++++ include/linux/fdtable.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index 5861c4f89419..6448523ca29e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -865,6 +865,21 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } +struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) +{ + /* Must be called with rcu_read_lock held */ + struct files_struct *files; + struct file *file = NULL; + + task_lock(task); + files = task->files; + if (files) + file = files_lookup_fd_rcu(files, fd); + task_unlock(task); + + return file; +} + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 2a4a8fed536e..a0558ae9b40c 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -110,6 +110,8 @@ static inline struct file *lookup_fd_rcu(unsigned int fd) return files_lookup_fd_rcu(current->files, fd); } +struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd); + struct task_struct; struct files_struct *get_files_struct(struct task_struct *); -- cgit From e9a53aeb5e0a838f10fcea74235664e7ad5e6e1a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:31 -0600 Subject: file: Implement task_lookup_next_fd_rcu As a companion to fget_task and task_lookup_fd_rcu implement task_lookup_next_fd_rcu that will return the struct file for the first file descriptor number that is equal to or greater than the fd argument value, or NULL if there is no such struct file. This allows file descriptors of foreign processes to be iterated through safely, without needing to increment the count on files_struct. Some concern[1] has been expressed that this function takes the task_lock for each iteration and thus for each file descriptor. The place where this function will be called in a commonly used code path is for listing /proc/<pid>/fd. I did some small benchmarks and did not see any measurable performance differences. For ordinary users ls is likely to stat each of the directory entries and tid_fd_mode called from tid_fd_revalidate has always taken the task lock for each file descriptor. So this does not look like it will be a big change in practice. At some point it will probably be worth changing put_files_struct to free files_struct after an rcu grace period so that task_lock won't be needed at all. [1] https://lkml.kernel.org/r/20200817220425.9389-10-ebiederm@xmission.com v1: https://lkml.kernel.org/r/20200817220425.9389-9-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-14-ebiederm@xmission.com Signed-off-by: Eric W.
Biederman --- fs/file.c | 21 +++++++++++++++++++++ include/linux/fdtable.h | 1 + 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index 6448523ca29e..23b888a4acbe 100644 --- a/fs/file.c +++ b/fs/file.c @@ -880,6 +880,27 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) return file; } +struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd) +{ + /* Must be called with rcu_read_lock held */ + struct files_struct *files; + unsigned int fd = *ret_fd; + struct file *file = NULL; + + task_lock(task); + files = task->files; + if (files) { + for (; fd < files_fdtable(files)->max_fds; fd++) { + file = files_lookup_fd_rcu(files, fd); + if (file) + break; + } + } + task_unlock(task); + *ret_fd = fd; + return file; +} + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index a0558ae9b40c..cf6c52dae3a1 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -111,6 +111,7 @@ static inline struct file *lookup_fd_rcu(unsigned int fd) } struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd); +struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd); struct task_struct; -- cgit From d74ba04d919ebe30bf47406819c18c6b50003d92 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:35 -0600 Subject: file: Merge __fd_install into fd_install The function __fd_install was added to support binder[1]. With binder fixed[2] there are no more users. As fd_install just calls __fd_install with "files=current->files", merge them together by transforming the files parameter into a local variable initialized to current->files. [1] f869e8a7f753 ("expose a low-level variant of fd_install() for binder") [2] 44d8047f1d87 ("binder: use standard functions to allocate fds") Acked-by: Christian Brauner v1:https://lkml.kernel.org/r/20200817220425.9389-14-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-18-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- fs/file.c | 25 ++++++------------------- include/linux/fdtable.h | 2 -- 2 files changed, 6 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index 23b888a4acbe..0d4ec0fa23b3 100644 --- a/fs/file.c +++ b/fs/file.c @@ -158,7 +158,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); - /* make sure all __fd_install() have seen resize_in_progress + /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) @@ -181,7 +181,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); - /* coupled with smp_rmb() in __fd_install() */ + /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 1; } @@ -584,17 +584,13 @@ EXPORT_SYMBOL(put_unused_fd); * It should never happen - if we allow dup2() do it, _really_ bad things * will follow. * - * NOTE: __fd_install() variant is really, really low-level; don't - * use it unless you are forced to by truly lousy API shoved down - * your throat. 
'files' *MUST* be either current->files or obtained - * by get_files_struct(current) done by whoever had given it to you, - * or really bad things will happen. Normally you want to use - * fd_install() instead. + * This consumes the "file" refcount, so callers should treat it + * as if they had called fput(file). */ -void __fd_install(struct files_struct *files, unsigned int fd, - struct file *file) +void fd_install(unsigned int fd, struct file *file) { + struct files_struct *files = current->files; struct fdtable *fdt; rcu_read_lock_sched(); @@ -616,15 +612,6 @@ void __fd_install(struct files_struct *files, unsigned int fd, rcu_read_unlock_sched(); } -/* - * This consumes the "file" refcount, so callers should treat it - * as if they had called fput(file). - */ -void fd_install(unsigned int fd, struct file *file) -{ - __fd_install(current->files, fd, file); -} - EXPORT_SYMBOL(fd_install); static struct file *pick_file(struct files_struct *files, unsigned fd) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index cf6c52dae3a1..a5ec736d74a5 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -126,8 +126,6 @@ int iterate_fd(struct files_struct *, unsigned, extern int __alloc_fd(struct files_struct *files, unsigned start, unsigned end, unsigned flags); -extern void __fd_install(struct files_struct *files, - unsigned int fd, struct file *file); extern int __close_fd(struct files_struct *files, unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); -- cgit From aa384d10f3d06d4b85597ff5df41551262220e16 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:37 -0600 Subject: file: Merge __alloc_fd into alloc_fd The function __alloc_fd was added to support binder[1]. With binder fixed[2] there are no more users. As alloc_fd just calls __alloc_fd with "files=current->files", merge them together by transforming the files parameter into a local variable initialized to current->files. [1] dcfadfa4ec5a ("new helper: __alloc_fd()") [2] 44d8047f1d87 ("binder: use standard functions to allocate fds") Acked-by: Christian Brauner v1: https://lkml.kernel.org/r/20200817220425.9389-16-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-20-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- fs/file.c | 11 +++-------- include/linux/fdtable.h | 2 -- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index 07e25f1b9dfd..621563701bd9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -480,9 +480,9 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) /* * allocate a file descriptor, mark it busy. 
*/ -int __alloc_fd(struct files_struct *files, - unsigned start, unsigned end, unsigned flags) +static int alloc_fd(unsigned start, unsigned end, unsigned flags) { + struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; @@ -538,14 +538,9 @@ out: return error; } -static int alloc_fd(unsigned start, unsigned end, unsigned flags) -{ - return __alloc_fd(current->files, start, end, flags); -} - int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { - return __alloc_fd(current->files, 0, nofile, flags); + return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index a5ec736d74a5..dc476ae92f56 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -124,8 +124,6 @@ int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), const void *); -extern int __alloc_fd(struct files_struct *files, - unsigned start, unsigned end, unsigned flags); extern int __close_fd(struct files_struct *files, unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); -- cgit From 8760c909f54a82aaa6e76da19afe798a0c77c3c3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:38 -0600 Subject: file: Rename __close_fd to close_fd and remove the files parameter The function __close_fd was added to support binder[1]. Now that binder has been fixed to no longer need __close_fd[2] all calls to __close_fd pass current->files. Therefore transform the files parameter into a local variable initialized to current->files, and rename __close_fd to close_fd to reflect this change, and keep it in sync with the similar changes to __alloc_fd and __fd_install. This removes the need for callers to take the extra care that is required if anything except current->files is passed, by limiting the callers to operating only on current->files. [1] 483ce1d4b8c3 ("take descriptor-related part of close() to file.c") [2] 44d8047f1d87 ("binder: use standard functions to allocate fds") Acked-by: Christian Brauner v1: https://lkml.kernel.org/r/20200817220425.9389-17-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-21-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- fs/file.c | 10 ++++------ fs/open.c | 2 +- include/linux/fdtable.h | 3 +-- include/linux/syscalls.h | 6 +++--- 4 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index 621563701bd9..987ea51630b4 100644 --- a/fs/file.c +++ b/fs/file.c @@ -629,11 +629,9 @@ out_unlock: return file; } -/* - * The same warnings as for __alloc_fd()/__fd_install() apply here... - */ -int __close_fd(struct files_struct *files, unsigned fd) +int close_fd(unsigned fd) { + struct files_struct *files = current->files; struct file *file; file = pick_file(files, fd); @@ -642,7 +640,7 @@ return filp_close(file, files); } -EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ +EXPORT_SYMBOL(close_fd); /* for ksys_close() */ /** * __close_range() - Close all file descriptors in a given range.
@@ -1027,7 +1025,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) struct files_struct *files = current->files; if (!file) - return __close_fd(files, fd); + return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; diff --git a/fs/open.c b/fs/open.c index 9af548fb841b..581a674d7eee 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1292,7 +1292,7 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - int retval = __close_fd(current->files, fd); + int retval = close_fd(fd); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index dc476ae92f56..dad4a488ce60 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -124,8 +124,7 @@ int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), const void *); -extern int __close_fd(struct files_struct *files, - unsigned int fd); +extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern int __close_fd_get_file(unsigned int fd, struct file **res); extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 37bea07c12f2..9e055a0579f8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1295,16 +1295,16 @@ static inline long ksys_ftruncate(unsigned int fd, loff_t length) return do_sys_ftruncate(fd, length, 1); } -extern int __close_fd(struct files_struct *files, unsigned int fd); +extern int close_fd(unsigned int fd); /* * In contrast to sys_close(), this stub does not check whether the syscall * should or should not be restarted, but returns the raw error codes from - * __close_fd(). + * close_fd(). */ static inline int ksys_close(unsigned int fd) { - return __close_fd(current->files, fd); + return close_fd(fd); } extern long do_sys_truncate(const char __user *pathname, loff_t length); -- cgit From 1572bfdf21d4d50e51941498ffe0b56c2289f783 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:39 -0600 Subject: file: Replace ksys_close with close_fd Now that ksys_close is exactly identical to close_fd replace the one caller of ksys_close with close_fd. [1] https://lkml.kernel.org/r/20200818112020.GA17080@infradead.org Suggested-by: Christoph Hellwig Link: https://lkml.kernel.org/r/20201120231441.29911-22-ebiederm@xmission.com Signed-off-by: Eric W. 
Biederman --- fs/autofs/dev-ioctl.c | 5 +++-- include/linux/syscalls.h | 12 ------------ 2 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 322b7dfb4ea0..5bf781ea6d67 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -4,9 +4,10 @@ * Copyright 2008 Ian Kent */ +#include #include #include -#include +#include #include #include @@ -289,7 +290,7 @@ static int autofs_dev_ioctl_closemount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - return ksys_close(param->ioctlfd); + return close_fd(param->ioctlfd); } /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 9e055a0579f8..0f72f380db72 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1295,18 +1295,6 @@ static inline long ksys_ftruncate(unsigned int fd, loff_t length) return do_sys_ftruncate(fd, length, 1); } -extern int close_fd(unsigned int fd); - -/* - * In contrast to sys_close(), this stub does not check whether the syscall - * should or should not be restarted, but returns the raw error codes from - * close_fd(). - */ -static inline int ksys_close(unsigned int fd) -{ - return close_fd(fd); -} - extern long do_sys_truncate(const char __user *pathname, loff_t length); static inline long ksys_truncate(const char __user *pathname, loff_t length) -- cgit From 9fe83c43e71cdb8e5b9520bcb98706a2b3c680c8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:40 -0600 Subject: file: Rename __close_fd_get_file close_fd_get_file The function close_fd_get_file is explicitly a variant of __close_fd[1]. Now that __close_fd has been renamed close_fd, rename close_fd_get_file to be consistent with close_fd. When __alloc_fd, __close_fd and __fd_install were introduced the double underscore indicated that the function took a struct files_struct parameter. The function __close_fd_get_file never has so the naming has always been inconsistent. This just cleans things up so there are not any lingering mentions or references __close_fd left in the code. [1] 80cd795630d6 ("binder: fix use-after-free due to ksys_close() during fdget()") Link: https://lkml.kernel.org/r/20201120231441.29911-23-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- drivers/android/binder.c | 2 +- fs/file.c | 4 ++-- fs/io_uring.c | 2 +- include/linux/fdtable.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b5117576792b..5b2e3721ac27 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2226,7 +2226,7 @@ static void binder_deferred_fd_close(int fd) if (!twcb) return; init_task_work(&twcb->twork, binder_do_fd_close); - __close_fd_get_file(fd, &twcb->file); + close_fd_get_file(fd, &twcb->file); if (twcb->file) { filp_close(twcb->file, current->files); task_work_add(current, &twcb->twork, TWA_RESUME); diff --git a/fs/file.c b/fs/file.c index 987ea51630b4..947ac6d5602f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -721,11 +721,11 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) } /* - * variant of __close_fd that gets a ref on the file for later fput. + * variant of close_fd that gets a ref on the file for later fput. * The caller must ensure that filp_close() called on the file, and then * an fput(). 
*/ -int __close_fd_get_file(unsigned int fd, struct file **res) +int close_fd_get_file(unsigned int fd, struct file **res) { struct files_struct *files = current->files; struct file *file; diff --git a/fs/io_uring.c b/fs/io_uring.c index b42dfa0243bf..2e6e51292eff 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4206,7 +4206,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock, /* might be already done during nonblock submission */ if (!close->put_file) { - ret = __close_fd_get_file(close->fd, &close->put_file); + ret = close_fd_get_file(close->fd, &close->put_file); if (ret < 0) return (ret == -ENOENT) ? -EBADF : ret; } diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index dad4a488ce60..4ed3589f9294 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -126,7 +126,7 @@ int iterate_fd(struct files_struct *, unsigned, extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); -extern int __close_fd_get_file(unsigned int fd, struct file **res); +extern int close_fd_get_file(unsigned int fd, struct file **res); extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, struct files_struct **new_fdp); -- cgit From fa67bf885e5211c7dce9514ef2877212c0a5e09e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:41 -0600 Subject: file: Remove get_files_struct When discussing[1] exec and posix file locks it was realized that none of the callers of get_files_struct fundamentally needed to call get_files_struct, and that by switching them to helper functions instead it will both simplify their code and remove unnecessary increments of files_struct.count. Those unnecessary increments can result in exec unnecessarily unsharing files_struct, which breaks posix locks, and it can result in fget_light having to fall back to fget, reducing system performance. Now that get_files_struct has no more users and can no longer cause these problems for posix file locking and fget_light, remove get_files_struct so that it does not gain any new users. [1] https://lkml.kernel.org/r/20180915160423.GA31461@redhat.com Suggested-by: Oleg Nesterov Acked-by: Christian Brauner v1: https://lkml.kernel.org/r/20200817220425.9389-13-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-24-ebiederm@xmission.com Signed-off-by: Eric W.
Biederman --- fs/file.c | 13 ------------- include/linux/fdtable.h | 1 - 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index 947ac6d5602f..412033d8cfdf 100644 --- a/fs/file.c +++ b/fs/file.c @@ -411,19 +411,6 @@ static struct fdtable *close_files(struct files_struct * files) return fdt; } -struct files_struct *get_files_struct(struct task_struct *task) -{ - struct files_struct *files; - - task_lock(task); - files = task->files; - if (files) - atomic_inc(&files->count); - task_unlock(task); - - return files; -} - void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 4ed3589f9294..d0e78174874a 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -115,7 +115,6 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd) struct task_struct; -struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); int unshare_files(void); struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; -- cgit From f7cfd871ae0c5008d94b6f66834e7845caa93c15 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:12:00 -0600 Subject: exec: Transform exec_update_mutex into a rw_semaphore Recently syzbot reported[0] that there is a deadlock amongst the users of exec_update_mutex. The problematic lock ordering found by lockdep was: perf_event_open (exec_update_mutex -> ovl_i_mutex) chown (ovl_i_mutex -> sb_writers) sendfile (sb_writers -> p->lock) by reading from a proc file and writing to overlayfs proc_pid_syscall (p->lock -> exec_update_mutex) While looking at possible solutions it occurred to me that all of the users and possible users involved only wanted the state of the given process to remain the same. They are all readers. The only writer is exec. There is no reason for readers to block on each other. So fix this deadlock by transforming exec_update_mutex into a rw_semaphore named exec_update_lock that only exec takes for writing. Cc: Jann Horn Cc: Vasiliy Kulikov Cc: Al Viro Cc: Bernd Edlinger Cc: Oleg Nesterov Cc: Christopher Yeoh Cc: Cyrill Gorcunov Cc: Sargun Dhillon Cc: Christian Brauner Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Fixes: eea9673250db ("exec: Add exec_update_mutex to replace cred_guard_mutex") [0] https://lkml.kernel.org/r/00000000000063640c05ade8e3de@google.com Reported-by: syzbot+db9cdf3dd1f64252c6ef@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/87ft4mbqen.fsf@x220.int.ebiederm.org Signed-off-by: Eric W. Biederman --- fs/exec.c | 12 ++++++------ fs/proc/base.c | 10 +++++----- include/linux/sched/signal.h | 11 ++++++----- init/init_task.c | 2 +- kernel/events/core.c | 12 ++++++------ kernel/fork.c | 6 +++--- kernel/kcmp.c | 30 +++++++++++++++--------------- kernel/pid.c | 4 ++-- 8 files changed, 44 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 547a2390baf5..ca89e0e3ef10 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -965,8 +965,8 @@ EXPORT_SYMBOL(read_code); /* * Maps the mm_struct mm into the current task struct. - * On success, this function returns with the mutex - * exec_update_mutex locked. + * On success, this function returns with exec_update_lock + * held for writing.
*/ static int exec_mmap(struct mm_struct *mm) { @@ -981,7 +981,7 @@ static int exec_mmap(struct mm_struct *mm) if (old_mm) sync_mm_rss(old_mm); - ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); + ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; @@ -995,7 +995,7 @@ static int exec_mmap(struct mm_struct *mm) mmap_read_lock(old_mm); if (unlikely(old_mm->core_state)) { mmap_read_unlock(old_mm); - mutex_unlock(&tsk->signal->exec_update_mutex); + up_write(&tsk->signal->exec_update_lock); return -EINTR; } } @@ -1382,7 +1382,7 @@ int begin_new_exec(struct linux_binprm * bprm) return 0; out_unlock: - mutex_unlock(&me->signal->exec_update_mutex); + up_write(&me->signal->exec_update_lock); out: return retval; } @@ -1423,7 +1423,7 @@ void setup_new_exec(struct linux_binprm * bprm) * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; - mutex_unlock(&me->signal->exec_update_mutex); + up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); diff --git a/fs/proc/base.c b/fs/proc/base.c index b362523a9829..55ce0ee9c5c7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -405,11 +405,11 @@ print0: static int lock_trace(struct task_struct *task) { - int err = mutex_lock_killable(&task->signal->exec_update_mutex); + int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; @@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task) static void unlock_trace(struct task_struct *task) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE @@ -2930,7 +2930,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unsigned long flags; int result; - result = mutex_lock_killable(&task->signal->exec_update_mutex); + result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; @@ -2966,7 +2966,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return result; } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1bad18a1d8ba..4b6a8234d7fc 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -228,12 +228,13 @@ struct signal_struct { * credential calculations * (notably. ptrace) * Deprecated do not use in new code. - * Use exec_update_mutex instead. - */ - struct mutex exec_update_mutex; /* Held while task_struct is being - * updated during exec, and may have - * inconsistent permissions. + * Use exec_update_lock instead. */ + struct rw_semaphore exec_update_lock; /* Held while task_struct is + * being updated during exec, + * and may have inconsistent + * permissions. 
+ */ } __randomize_layout; /* diff --git a/init/init_task.c b/init/init_task.c index a56f0abb63e9..15f6eb93a04f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -26,7 +26,7 @@ static struct signal_struct init_signals = { .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), - .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex), + .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), .cputimer = { diff --git a/kernel/events/core.c b/kernel/events/core.c index dc568ca295bd..55b2330b556c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1325,7 +1325,7 @@ static void put_ctx(struct perf_event_context *ctx) * function. * * Lock order: - * exec_update_mutex + * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; @@ -11721,14 +11721,14 @@ SYSCALL_DEFINE5(perf_event_open, } if (task) { - err = mutex_lock_interruptible(&task->signal->exec_update_mutex); + err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_task; /* * Preserve ptrace permission check for backwards compatibility. * - * We must hold exec_update_mutex across this and any potential + * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). @@ -12017,7 +12017,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); if (task) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); put_task_struct(task); } @@ -12053,7 +12053,7 @@ err_alloc: free_event(event); err_cred: if (task) - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); err_task: if (task) put_task_struct(task); @@ -12358,7 +12358,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) /* * When a child task exits, feed back event values to parent events. * - * Can be called with exec_update_mutex held when called from + * Can be called with exec_update_lock held when called from * setup_new_exec(). 
*/ void perf_event_exit_task(struct task_struct *child) diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..e8cb80b266d2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1221,7 +1221,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) struct mm_struct *mm; int err; - err = mutex_lock_killable(&task->signal->exec_update_mutex); + err = down_read_killable(&task->signal->exec_update_lock); if (err) return ERR_PTR(err); @@ -1231,7 +1231,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mmput(mm); mm = ERR_PTR(-EACCES); } - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return mm; } @@ -1591,7 +1591,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); - mutex_init(&sig->exec_update_mutex); + init_rwsem(&sig->exec_update_lock); return 0; } diff --git a/kernel/kcmp.c b/kernel/kcmp.c index b3ff9288c6cc..c0d2ad9b4705 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -75,25 +75,25 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) return file; } -static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2) { - if (likely(m2 != m1)) - mutex_unlock(m2); - mutex_unlock(m1); + if (likely(l2 != l1)) + up_read(l2); + up_read(l1); } -static int kcmp_lock(struct mutex *m1, struct mutex *m2) +static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2) { int err; - if (m2 > m1) - swap(m1, m2); + if (l2 > l1) + swap(l1, l2); - err = mutex_lock_killable(m1); - if (!err && likely(m1 != m2)) { - err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + err = down_read_killable(l1); + if (!err && likely(l1 != l2)) { + err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING); if (err) - mutex_unlock(m1); + up_read(l1); } return err; @@ -173,8 +173,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, /* * One should have enough rights to inspect task details. 
*/ - ret = kcmp_lock(&task1->signal->exec_update_mutex, - &task2->signal->exec_update_mutex); + ret = kcmp_lock(&task1->signal->exec_update_lock, + &task2->signal->exec_update_lock); if (ret) goto err; if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || @@ -229,8 +229,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, } err_unlock: - kcmp_unlock(&task1->signal->exec_update_mutex, - &task2->signal->exec_update_mutex); + kcmp_unlock(&task1->signal->exec_update_lock, + &task2->signal->exec_update_lock); err: put_task_struct(task1); put_task_struct(task2); diff --git a/kernel/pid.c b/kernel/pid.c index a96bc4bf4f86..4856818c9de1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -628,7 +628,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) struct file *file; int ret; - ret = mutex_lock_killable(&task->signal->exec_update_mutex); + ret = down_read_killable(&task->signal->exec_update_lock); if (ret) return ERR_PTR(ret); @@ -637,7 +637,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) else file = ERR_PTR(-EPERM); - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return file ?: ERR_PTR(-EBADF); } -- cgit From adf60a870e9130c7883ec2ab798484e05f24db39 Mon Sep 17 00:00:00 2001 From: Siddharth Gupta Date: Thu, 19 Nov 2020 13:05:32 -0800 Subject: remoteproc: core: Add ops to enable custom coredump functionality Each remoteproc might have different requirements for coredumps and might want to choose the type of dumps it wants to collect. This change allows remoteproc drivers to specify their own custom dump function to be executed in place of rproc_coredump. If the coredump op is not specified by the remoteproc driver it will be set to rproc_coredump by default. 
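As an illustration of the new op (a minimal sketch, not taken from the patch; every name with a "my_" prefix is invented for this example), a driver opts in simply by pointing .coredump at its own routine when filling in its rproc_ops, here using the section-based helper introduced by the following patch:

#include <linux/remoteproc.h>

static int my_rproc_start(struct rproc *rproc)
{
	return 0;	/* bring the remote processor up */
}

static int my_rproc_stop(struct rproc *rproc)
{
	return 0;	/* shut the remote processor down */
}

/* Custom dump routine, invoked by rproc_trigger_recovery() via
 * rproc->ops->coredump(). */
static void my_rproc_coredump(struct rproc *rproc)
{
	rproc_coredump_using_sections(rproc);
}

static const struct rproc_ops my_rproc_ops = {
	.start		= my_rproc_start,
	.stop		= my_rproc_stop,
	.coredump	= my_rproc_coredump,	/* leave NULL to keep rproc_coredump() */
};
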
Reviewed-by: Bjorn Andersson Signed-off-by: Siddharth Gupta Link: https://lore.kernel.org/r/1605819935-10726-2-git-send-email-sidgup@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 6 +++++- include/linux/remoteproc.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 46c2937ebea9..2394eef383e3 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1704,7 +1704,7 @@ int rproc_trigger_recovery(struct rproc *rproc) goto unlock_mutex; /* generate coredump */ - rproc_coredump(rproc); + rproc->ops->coredump(rproc); /* load firmware */ ret = request_firmware(&firmware_p, rproc->firmware, dev); @@ -2189,6 +2189,10 @@ static int rproc_alloc_ops(struct rproc *rproc, const struct rproc_ops *ops) if (!rproc->ops) return -ENOMEM; + /* Default to rproc_coredump if no coredump function is specified */ + if (!rproc->ops->coredump) + rproc->ops->coredump = rproc_coredump; + if (rproc->ops->load) return 0; diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index e8ac041c64d9..121c55eff3d2 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -375,6 +375,7 @@ enum rsc_handling_status { * @get_boot_addr: get boot address to entry point specified in firmware * @panic: optional callback to react to system panic, core will delay * panic at least the returned number of milliseconds + * @coredump: collect firmware dump after the subsystem is shutdown */ struct rproc_ops { int (*prepare)(struct rproc *rproc); @@ -393,6 +394,7 @@ struct rproc_ops { int (*sanity_check)(struct rproc *rproc, const struct firmware *fw); u64 (*get_boot_addr)(struct rproc *rproc, const struct firmware *fw); unsigned long (*panic)(struct rproc *rproc); + void (*coredump)(struct rproc *rproc); }; /** -- cgit From abc72b646066075acf9121a2a68aad39f550813d Mon Sep 17 00:00:00 2001 From: Siddharth Gupta Date: Thu, 19 Nov 2020 13:05:33 -0800 Subject: remoteproc: coredump: Add minidump functionality This change adds a new kind of core dump mechanism which instead of dumping entire program segments of the firmware, dumps sections of the remoteproc memory which are sufficient to allow debugging the firmware. This function thus uses section headers instead of program headers during creation of the core dump elf. Reviewed-by: Bjorn Andersson Co-developed-by: Rishabh Bhatnagar Signed-off-by: Rishabh Bhatnagar Signed-off-by: Siddharth Gupta Link: https://lore.kernel.org/r/1605819935-10726-3-git-send-email-sidgup@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_coredump.c | 140 ++++++++++++++++++++++++++++ drivers/remoteproc/remoteproc_elf_helpers.h | 26 ++++++ include/linux/remoteproc.h | 1 + 3 files changed, 167 insertions(+) (limited to 'include/linux') diff --git a/drivers/remoteproc/remoteproc_coredump.c b/drivers/remoteproc/remoteproc_coredump.c index 34530dc20cb4..81ec154a6a5e 100644 --- a/drivers/remoteproc/remoteproc_coredump.c +++ b/drivers/remoteproc/remoteproc_coredump.c @@ -323,3 +323,143 @@ void rproc_coredump(struct rproc *rproc) */ wait_for_completion(&dump_state.dump_done); } + +/** + * rproc_coredump_using_sections() - perform coredump using section headers + * @rproc: rproc handle + * + * This function will generate an ELF header for the registered sections of + * segments and create a devcoredump device associated with rproc. 
Based on + * the coredump configuration this function will directly copy the segments + * from device memory to userspace or copy segments from device memory to + * a separate buffer, which can then be read by userspace. + * The first approach avoids using extra vmalloc memory. But it will stall + * recovery flow until dump is read by userspace. + */ +void rproc_coredump_using_sections(struct rproc *rproc) +{ + struct rproc_dump_segment *segment; + void *shdr; + void *ehdr; + size_t data_size; + size_t strtbl_size = 0; + size_t strtbl_index = 1; + size_t offset; + void *data; + u8 class = rproc->elf_class; + int shnum; + struct rproc_coredump_state dump_state; + unsigned int dump_conf = rproc->dump_conf; + char *str_tbl = "STR_TBL"; + + if (list_empty(&rproc->dump_segments) || + dump_conf == RPROC_COREDUMP_DISABLED) + return; + + if (class == ELFCLASSNONE) { + dev_err(&rproc->dev, "Elf class is not set\n"); + return; + } + + /* + * We allocate two extra section headers. The first one is null. + * Second section header is for the string table. Also space is + * allocated for string table. + */ + data_size = elf_size_of_hdr(class) + 2 * elf_size_of_shdr(class); + shnum = 2; + + /* the extra byte is for the null character at index 0 */ + strtbl_size += strlen(str_tbl) + 2; + + list_for_each_entry(segment, &rproc->dump_segments, node) { + data_size += elf_size_of_shdr(class); + strtbl_size += strlen(segment->priv) + 1; + if (dump_conf == RPROC_COREDUMP_ENABLED) + data_size += segment->size; + shnum++; + } + + data_size += strtbl_size; + + data = vmalloc(data_size); + if (!data) + return; + + ehdr = data; + memset(ehdr, 0, elf_size_of_hdr(class)); + /* e_ident field is common for both elf32 and elf64 */ + elf_hdr_init_ident(ehdr, class); + + elf_hdr_set_e_type(class, ehdr, ET_CORE); + elf_hdr_set_e_machine(class, ehdr, rproc->elf_machine); + elf_hdr_set_e_version(class, ehdr, EV_CURRENT); + elf_hdr_set_e_entry(class, ehdr, rproc->bootaddr); + elf_hdr_set_e_shoff(class, ehdr, elf_size_of_hdr(class)); + elf_hdr_set_e_ehsize(class, ehdr, elf_size_of_hdr(class)); + elf_hdr_set_e_shentsize(class, ehdr, elf_size_of_shdr(class)); + elf_hdr_set_e_shnum(class, ehdr, shnum); + elf_hdr_set_e_shstrndx(class, ehdr, 1); + + /* + * The zeroth index of the section header is reserved and is rarely used. + * Set the section header as null (SHN_UNDEF) and move to the next one. + */ + shdr = data + elf_hdr_get_e_shoff(class, ehdr); + memset(shdr, 0, elf_size_of_shdr(class)); + shdr += elf_size_of_shdr(class); + + /* Initialize the string table. */ + offset = elf_hdr_get_e_shoff(class, ehdr) + + elf_size_of_shdr(class) * elf_hdr_get_e_shnum(class, ehdr); + memset(data + offset, 0, strtbl_size); + + /* Fill in the string table section header. 
*/ + memset(shdr, 0, elf_size_of_shdr(class)); + elf_shdr_set_sh_type(class, shdr, SHT_STRTAB); + elf_shdr_set_sh_offset(class, shdr, offset); + elf_shdr_set_sh_size(class, shdr, strtbl_size); + elf_shdr_set_sh_entsize(class, shdr, 0); + elf_shdr_set_sh_flags(class, shdr, 0); + elf_shdr_set_sh_name(class, shdr, elf_strtbl_add(str_tbl, ehdr, class, &strtbl_index)); + offset += elf_shdr_get_sh_size(class, shdr); + shdr += elf_size_of_shdr(class); + + list_for_each_entry(segment, &rproc->dump_segments, node) { + memset(shdr, 0, elf_size_of_shdr(class)); + elf_shdr_set_sh_type(class, shdr, SHT_PROGBITS); + elf_shdr_set_sh_offset(class, shdr, offset); + elf_shdr_set_sh_addr(class, shdr, segment->da); + elf_shdr_set_sh_size(class, shdr, segment->size); + elf_shdr_set_sh_entsize(class, shdr, 0); + elf_shdr_set_sh_flags(class, shdr, SHF_WRITE); + elf_shdr_set_sh_name(class, shdr, + elf_strtbl_add(segment->priv, ehdr, class, &strtbl_index)); + + /* No need to copy segments for inline dumps */ + if (dump_conf == RPROC_COREDUMP_ENABLED) + rproc_copy_segment(rproc, data + offset, segment, 0, + segment->size); + offset += elf_shdr_get_sh_size(class, shdr); + shdr += elf_size_of_shdr(class); + } + + if (dump_conf == RPROC_COREDUMP_ENABLED) { + dev_coredumpv(&rproc->dev, data, data_size, GFP_KERNEL); + return; + } + + /* Initialize the dump state struct to be used by rproc_coredump_read */ + dump_state.rproc = rproc; + dump_state.header = data; + init_completion(&dump_state.dump_done); + + dev_coredumpm(&rproc->dev, NULL, &dump_state, data_size, GFP_KERNEL, + rproc_coredump_read, rproc_coredump_free); + + /* Wait until the dump is read and free is called. Data is freed + * by devcoredump framework automatically after 5 minutes. + */ + wait_for_completion(&dump_state.dump_done); +} +EXPORT_SYMBOL(rproc_coredump_using_sections); diff --git a/drivers/remoteproc/remoteproc_elf_helpers.h b/drivers/remoteproc/remoteproc_elf_helpers.h index 4b6be7b6bf4d..26404e68e17a 100644 --- a/drivers/remoteproc/remoteproc_elf_helpers.h +++ b/drivers/remoteproc/remoteproc_elf_helpers.h @@ -65,6 +65,7 @@ ELF_GEN_FIELD_GET_SET(hdr, e_type, u16) ELF_GEN_FIELD_GET_SET(hdr, e_version, u32) ELF_GEN_FIELD_GET_SET(hdr, e_ehsize, u32) ELF_GEN_FIELD_GET_SET(hdr, e_phentsize, u16) +ELF_GEN_FIELD_GET_SET(hdr, e_shentsize, u16) ELF_GEN_FIELD_GET_SET(phdr, p_paddr, u64) ELF_GEN_FIELD_GET_SET(phdr, p_vaddr, u64) @@ -75,6 +76,9 @@ ELF_GEN_FIELD_GET_SET(phdr, p_offset, u64) ELF_GEN_FIELD_GET_SET(phdr, p_flags, u32) ELF_GEN_FIELD_GET_SET(phdr, p_align, u64) +ELF_GEN_FIELD_GET_SET(shdr, sh_type, u32) +ELF_GEN_FIELD_GET_SET(shdr, sh_flags, u32) +ELF_GEN_FIELD_GET_SET(shdr, sh_entsize, u16) ELF_GEN_FIELD_GET_SET(shdr, sh_size, u64) ELF_GEN_FIELD_GET_SET(shdr, sh_offset, u64) ELF_GEN_FIELD_GET_SET(shdr, sh_name, u32) @@ -93,4 +97,26 @@ ELF_STRUCT_SIZE(shdr) ELF_STRUCT_SIZE(phdr) ELF_STRUCT_SIZE(hdr) +static inline unsigned int elf_strtbl_add(const char *name, void *ehdr, u8 class, size_t *index) +{ + u16 shstrndx = elf_hdr_get_e_shstrndx(class, ehdr); + void *shdr; + char *strtab; + size_t idx, ret; + + shdr = ehdr + elf_size_of_hdr(class) + shstrndx * elf_size_of_shdr(class); + strtab = ehdr + elf_shdr_get_sh_offset(class, shdr); + idx = index ? 
*index : 0; + if (!strtab || !name) + return 0; + + ret = idx; + strcpy((strtab + idx), name); + idx += strlen(name) + 1; + if (index) + *index = idx; + + return ret; +} + #endif /* REMOTEPROC_ELF_LOADER_H */ diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 121c55eff3d2..f28ee75d1005 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -657,6 +657,7 @@ int rproc_boot(struct rproc *rproc); void rproc_shutdown(struct rproc *rproc); int rproc_set_firmware(struct rproc *rproc, const char *fw_name); void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type); +void rproc_coredump_using_sections(struct rproc *rproc); int rproc_coredump_add_segment(struct rproc *rproc, dma_addr_t da, size_t size); int rproc_coredump_add_custom_segment(struct rproc *rproc, dma_addr_t da, size_t size, -- cgit From e7708f5b10e205d6291bb495e645a03553b9768b Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczyński Date: Sun, 29 Nov 2020 23:07:39 +0000 Subject: PCI: Unify ECAM constants in native PCI Express drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ECAM-related constants to provide a set of standard constants defining memory address shift values to the byte-level address that can be used to access the PCI Express Configuration Space, and then move native PCI Express controller drivers to use the newly introduced definitions, retiring driver-specific ones. Refactor the pci_ecam_map_bus() function to use the newly added constants so that limits to the bus, device function and offset (now limited to 4K as per the specification) are in place to prevent a defective or malicious caller from supplying an incorrect configuration offset and thus targeting the wrong device when accessing extended configuration space. This refactor also allows the ".bus_shift" initialisers to be dropped when the user is not using a custom value, as a default value will be used as per the PCI Express Specification. Thanks to Qian Cai, Michael Walle, and Vladimir Oltean for reporting a pci_ecam_create() issue with .bus_shift and to Vladimir for proposing the fix.
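For reference, the byte-level address layout these constants encode places the bus number at bit 20 and devfn at bit 12, with the register offset capped at 4K, so each bus occupies a 1 MiB window; that is why pcie-al.c below derives its bus mask from ecam_size >> PCIE_ECAM_BUS_SHIFT. A minimal sketch of the equivalent computation (ecam_offset() is an invented name; only PCIE_ECAM_BUS_SHIFT and PCIE_ECAM_OFFSET() are taken from the patch):

#include <linux/types.h>

/* Mirrors what the new PCIE_ECAM_OFFSET(bus, devfn, where) macro computes. */
static inline u32 ecam_offset(u32 bus, u32 devfn, u32 where)
{
	return ((bus & 0xff) << 20) |	/* bus number (PCIE_ECAM_BUS_SHIFT) */
	       ((devfn & 0xff) << 12) |	/* device (5 bits) + function (3 bits) */
	       (where & 0xfff);		/* config register offset, limited to 4K */
}
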
[bhelgaas: incorporate Vladimir's fix, update commit log] Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20201129230743.3006978-2-kw@linux.com Tested-by: Michael Walle Signed-off-by: Krzysztof Wilczyński Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Jon Derrick Reviewed-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pcie-al.c | 12 ++--------- drivers/pci/controller/dwc/pcie-hisi.c | 2 -- drivers/pci/controller/pci-aardvark.c | 13 +++--------- drivers/pci/controller/pci-host-generic.c | 1 - drivers/pci/controller/pci-thunder-ecam.c | 1 - drivers/pci/controller/pcie-brcmstb.c | 16 ++------------- drivers/pci/controller/pcie-rockchip-host.c | 27 ++++++++++++------------ drivers/pci/controller/pcie-rockchip.h | 8 +------- drivers/pci/controller/pcie-tango.c | 1 - drivers/pci/controller/pcie-xilinx-nwl.c | 9 ++------ drivers/pci/controller/pcie-xilinx.c | 11 ++-------- drivers/pci/controller/vmd.c | 11 +++++----- drivers/pci/ecam.c | 32 +++++++++++++++++++++-------- include/linux/pci-ecam.h | 27 ++++++++++++++++++++++++ 14 files changed, 80 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/controller/dwc/pcie-al.c b/drivers/pci/controller/dwc/pcie-al.c index f973fbca90cf..af9e51ab1af8 100644 --- a/drivers/pci/controller/dwc/pcie-al.c +++ b/drivers/pci/controller/dwc/pcie-al.c @@ -76,7 +76,6 @@ static int al_pcie_init(struct pci_config_window *cfg) } const struct pci_ecam_ops al_pcie_ops = { - .bus_shift = 20, .init = al_pcie_init, .pci_ops = { .map_bus = al_pcie_map_bus, @@ -138,8 +137,6 @@ struct al_pcie { struct al_pcie_target_bus_cfg target_bus_cfg; }; -#define PCIE_ECAM_DEVFN(x) (((x) & 0xff) << 12) - #define to_al_pcie(x) dev_get_drvdata((x)->dev) static inline u32 al_pcie_controller_readl(struct al_pcie *pcie, u32 offset) @@ -226,11 +223,6 @@ static void __iomem *al_pcie_conf_addr_map_bus(struct pci_bus *bus, struct al_pcie_target_bus_cfg *target_bus_cfg = &pcie->target_bus_cfg; unsigned int busnr_ecam = busnr & target_bus_cfg->ecam_mask; unsigned int busnr_reg = busnr & target_bus_cfg->reg_mask; - void __iomem *pci_base_addr; - - pci_base_addr = (void __iomem *)((uintptr_t)pp->va_cfg0_base + - (busnr_ecam << 20) + - PCIE_ECAM_DEVFN(devfn)); if (busnr_reg != target_bus_cfg->reg_val) { dev_dbg(pcie->pci->dev, "Changing target bus busnum val from 0x%x to 0x%x\n", @@ -241,7 +233,7 @@ static void __iomem *al_pcie_conf_addr_map_bus(struct pci_bus *bus, target_bus_cfg->reg_mask); } - return pci_base_addr + where; + return pp->va_cfg0_base + PCIE_ECAM_OFFSET(busnr_ecam, devfn, where); } static struct pci_ops al_child_pci_ops = { @@ -264,7 +256,7 @@ static void al_pcie_config_prepare(struct al_pcie *pcie) target_bus_cfg = &pcie->target_bus_cfg; - ecam_bus_mask = (pcie->ecam_size >> 20) - 1; + ecam_bus_mask = (pcie->ecam_size >> PCIE_ECAM_BUS_SHIFT) - 1; if (ecam_bus_mask > 255) { dev_warn(pcie->dev, "ECAM window size is larger than 256MB. 
Cutting off at 256\n"); ecam_bus_mask = 255; diff --git a/drivers/pci/controller/dwc/pcie-hisi.c b/drivers/pci/controller/dwc/pcie-hisi.c index 5ca86796d43a..8fc5960faf28 100644 --- a/drivers/pci/controller/dwc/pcie-hisi.c +++ b/drivers/pci/controller/dwc/pcie-hisi.c @@ -100,7 +100,6 @@ static int hisi_pcie_init(struct pci_config_window *cfg) } const struct pci_ecam_ops hisi_pcie_ops = { - .bus_shift = 20, .init = hisi_pcie_init, .pci_ops = { .map_bus = hisi_pcie_map_bus, @@ -135,7 +134,6 @@ static int hisi_pcie_platform_init(struct pci_config_window *cfg) } static const struct pci_ecam_ops hisi_pcie_platform_ops = { - .bus_shift = 20, .init = hisi_pcie_platform_init, .pci_ops = { .map_bus = hisi_pcie_map_bus, diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 0be485a25327..1043e54c73bd 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -164,14 +165,6 @@ #define PCIE_CONFIG_WR_TYPE0 0xa #define PCIE_CONFIG_WR_TYPE1 0xb -#define PCIE_CONF_BUS(bus) (((bus) & 0xff) << 20) -#define PCIE_CONF_DEV(dev) (((dev) & 0x1f) << 15) -#define PCIE_CONF_FUNC(fun) (((fun) & 0x7) << 12) -#define PCIE_CONF_REG(reg) ((reg) & 0xffc) -#define PCIE_CONF_ADDR(bus, devfn, where) \ - (PCIE_CONF_BUS(bus) | PCIE_CONF_DEV(PCI_SLOT(devfn)) | \ - PCIE_CONF_FUNC(PCI_FUNC(devfn)) | PCIE_CONF_REG(where)) - #define PIO_RETRY_CNT 500 #define PIO_RETRY_DELAY 2 /* 2 us*/ @@ -687,7 +680,7 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, advk_writel(pcie, reg, PIO_CTRL); /* Program the address registers */ - reg = PCIE_CONF_ADDR(bus->number, devfn, where); + reg = ALIGN_DOWN(PCIE_ECAM_OFFSET(bus->number, devfn, where), 4); advk_writel(pcie, reg, PIO_ADDR_LS); advk_writel(pcie, 0, PIO_ADDR_MS); @@ -748,7 +741,7 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, advk_writel(pcie, reg, PIO_CTRL); /* Program the address registers */ - reg = PCIE_CONF_ADDR(bus->number, devfn, where); + reg = ALIGN_DOWN(PCIE_ECAM_OFFSET(bus->number, devfn, where), 4); advk_writel(pcie, reg, PIO_ADDR_LS); advk_writel(pcie, 0, PIO_ADDR_MS); diff --git a/drivers/pci/controller/pci-host-generic.c b/drivers/pci/controller/pci-host-generic.c index b51977abfdf1..63865aeb636b 100644 --- a/drivers/pci/controller/pci-host-generic.c +++ b/drivers/pci/controller/pci-host-generic.c @@ -49,7 +49,6 @@ static void __iomem *pci_dw_ecam_map_bus(struct pci_bus *bus, } static const struct pci_ecam_ops pci_dw_ecam_bus_ops = { - .bus_shift = 20, .pci_ops = { .map_bus = pci_dw_ecam_map_bus, .read = pci_generic_config_read, diff --git a/drivers/pci/controller/pci-thunder-ecam.c b/drivers/pci/controller/pci-thunder-ecam.c index 7e8835fee5f7..f964fd26f7e0 100644 --- a/drivers/pci/controller/pci-thunder-ecam.c +++ b/drivers/pci/controller/pci-thunder-ecam.c @@ -346,7 +346,6 @@ static int thunder_ecam_config_write(struct pci_bus *bus, unsigned int devfn, } const struct pci_ecam_ops pci_thunder_ecam_ops = { - .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, .read = thunder_ecam_config_read, diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index bea86899bd5d..7fc80fd6f13f 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -127,11 +128,7 @@ #define MSI_INT_MASK_CLR 0x14 #define PCIE_EXT_CFG_DATA 0x8000 - 
#define PCIE_EXT_CFG_INDEX 0x9000 -#define PCIE_EXT_BUSNUM_SHIFT 20 -#define PCIE_EXT_SLOT_SHIFT 15 -#define PCIE_EXT_FUNC_SHIFT 12 #define PCIE_RGR1_SW_INIT_1_PERST_MASK 0x1 #define PCIE_RGR1_SW_INIT_1_PERST_SHIFT 0x0 @@ -695,15 +692,6 @@ static bool brcm_pcie_link_up(struct brcm_pcie *pcie) return dla && plu; } -/* Configuration space read/write support */ -static inline int brcm_pcie_cfg_index(int busnr, int devfn, int reg) -{ - return ((PCI_SLOT(devfn) & 0x1f) << PCIE_EXT_SLOT_SHIFT) - | ((PCI_FUNC(devfn) & 0x07) << PCIE_EXT_FUNC_SHIFT) - | (busnr << PCIE_EXT_BUSNUM_SHIFT) - | (reg & ~3); -} - static void __iomem *brcm_pcie_map_conf(struct pci_bus *bus, unsigned int devfn, int where) { @@ -716,7 +704,7 @@ static void __iomem *brcm_pcie_map_conf(struct pci_bus *bus, unsigned int devfn, return PCI_SLOT(devfn) ? NULL : base + where; /* For devices, write to the config space index register */ - idx = brcm_pcie_cfg_index(bus->number, devfn, 0); + idx = PCIE_ECAM_OFFSET(bus->number, devfn, 0); writel(idx, pcie->base + PCIE_EXT_CFG_INDEX); return base + PCIE_EXT_CFG_DATA + where; } diff --git a/drivers/pci/controller/pcie-rockchip-host.c b/drivers/pci/controller/pcie-rockchip-host.c index 9705059523a6..f1d08a1b1591 100644 --- a/drivers/pci/controller/pcie-rockchip-host.c +++ b/drivers/pci/controller/pcie-rockchip-host.c @@ -157,12 +157,11 @@ static int rockchip_pcie_rd_other_conf(struct rockchip_pcie *rockchip, struct pci_bus *bus, u32 devfn, int where, int size, u32 *val) { - u32 busdev; + void __iomem *addr; - busdev = PCIE_ECAM_ADDR(bus->number, PCI_SLOT(devfn), - PCI_FUNC(devfn), where); + addr = rockchip->reg_base + PCIE_ECAM_OFFSET(bus->number, devfn, where); - if (!IS_ALIGNED(busdev, size)) { + if (!IS_ALIGNED((uintptr_t)addr, size)) { *val = 0; return PCIBIOS_BAD_REGISTER_NUMBER; } @@ -175,11 +174,11 @@ static int rockchip_pcie_rd_other_conf(struct rockchip_pcie *rockchip, AXI_WRAPPER_TYPE1_CFG); if (size == 4) { - *val = readl(rockchip->reg_base + busdev); + *val = readl(addr); } else if (size == 2) { - *val = readw(rockchip->reg_base + busdev); + *val = readw(addr); } else if (size == 1) { - *val = readb(rockchip->reg_base + busdev); + *val = readb(addr); } else { *val = 0; return PCIBIOS_BAD_REGISTER_NUMBER; @@ -191,11 +190,11 @@ static int rockchip_pcie_wr_other_conf(struct rockchip_pcie *rockchip, struct pci_bus *bus, u32 devfn, int where, int size, u32 val) { - u32 busdev; + void __iomem *addr; - busdev = PCIE_ECAM_ADDR(bus->number, PCI_SLOT(devfn), - PCI_FUNC(devfn), where); - if (!IS_ALIGNED(busdev, size)) + addr = rockchip->reg_base + PCIE_ECAM_OFFSET(bus->number, devfn, where); + + if (!IS_ALIGNED((uintptr_t)addr, size)) return PCIBIOS_BAD_REGISTER_NUMBER; if (pci_is_root_bus(bus->parent)) @@ -206,11 +205,11 @@ static int rockchip_pcie_wr_other_conf(struct rockchip_pcie *rockchip, AXI_WRAPPER_TYPE1_CFG); if (size == 4) - writel(val, rockchip->reg_base + busdev); + writel(val, addr); else if (size == 2) - writew(val, rockchip->reg_base + busdev); + writew(val, addr); else if (size == 1) - writeb(val, rockchip->reg_base + busdev); + writeb(val, addr); else return PCIBIOS_BAD_REGISTER_NUMBER; diff --git a/drivers/pci/controller/pcie-rockchip.h b/drivers/pci/controller/pcie-rockchip.h index c7d0178fc8c2..1650a5087450 100644 --- a/drivers/pci/controller/pcie-rockchip.h +++ b/drivers/pci/controller/pcie-rockchip.h @@ -13,6 +13,7 @@ #include #include +#include /* * The upper 16 bits of PCIE_CLIENT_CONFIG are a write mask for the lower 16 @@ -178,13 +179,6 @@ #define 
MIN_AXI_ADDR_BITS_PASSED 8 #define PCIE_RC_SEND_PME_OFF 0x11960 #define ROCKCHIP_VENDOR_ID 0x1d87 -#define PCIE_ECAM_BUS(x) (((x) & 0xff) << 20) -#define PCIE_ECAM_DEV(x) (((x) & 0x1f) << 15) -#define PCIE_ECAM_FUNC(x) (((x) & 0x7) << 12) -#define PCIE_ECAM_REG(x) (((x) & 0xfff) << 0) -#define PCIE_ECAM_ADDR(bus, dev, func, reg) \ - (PCIE_ECAM_BUS(bus) | PCIE_ECAM_DEV(dev) | \ - PCIE_ECAM_FUNC(func) | PCIE_ECAM_REG(reg)) #define PCIE_LINK_IS_L2(x) \ (((x) & PCIE_CLIENT_DEBUG_LTSSM_MASK) == PCIE_CLIENT_DEBUG_LTSSM_L2) #define PCIE_LINK_UP(x) \ diff --git a/drivers/pci/controller/pcie-tango.c b/drivers/pci/controller/pcie-tango.c index d093a8ce4bb1..62a061f1d62e 100644 --- a/drivers/pci/controller/pcie-tango.c +++ b/drivers/pci/controller/pcie-tango.c @@ -208,7 +208,6 @@ static int smp8759_config_write(struct pci_bus *bus, unsigned int devfn, } static const struct pci_ecam_ops smp8759_ecam_ops = { - .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, .read = smp8759_config_read, diff --git a/drivers/pci/controller/pcie-xilinx-nwl.c b/drivers/pci/controller/pcie-xilinx-nwl.c index f3cf7d61924f..7f29c2fdcd51 100644 --- a/drivers/pci/controller/pcie-xilinx-nwl.c +++ b/drivers/pci/controller/pcie-xilinx-nwl.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -124,8 +125,6 @@ #define E_ECAM_CR_ENABLE BIT(0) #define E_ECAM_SIZE_LOC GENMASK(20, 16) #define E_ECAM_SIZE_SHIFT 16 -#define ECAM_BUS_LOC_SHIFT 20 -#define ECAM_DEV_LOC_SHIFT 12 #define NWL_ECAM_VALUE_DEFAULT 12 #define CFG_DMA_REG_BAR GENMASK(2, 0) @@ -240,15 +239,11 @@ static void __iomem *nwl_pcie_map_bus(struct pci_bus *bus, unsigned int devfn, int where) { struct nwl_pcie *pcie = bus->sysdata; - int relbus; if (!nwl_pcie_valid_device(bus, devfn)) return NULL; - relbus = (bus->number << ECAM_BUS_LOC_SHIFT) | - (devfn << ECAM_DEV_LOC_SHIFT); - - return pcie->ecam_base + relbus + where; + return pcie->ecam_base + PCIE_ECAM_OFFSET(bus->number, devfn, where); } /* PCIe operations */ diff --git a/drivers/pci/controller/pcie-xilinx.c b/drivers/pci/controller/pcie-xilinx.c index 8523be61bba5..fa5baeb82653 100644 --- a/drivers/pci/controller/pcie-xilinx.c +++ b/drivers/pci/controller/pcie-xilinx.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "../pci.h" @@ -86,10 +87,6 @@ /* Phy Status/Control Register definitions */ #define XILINX_PCIE_REG_PSCR_LNKUP BIT(11) -/* ECAM definitions */ -#define ECAM_BUS_NUM_SHIFT 20 -#define ECAM_DEV_NUM_SHIFT 12 - /* Number of MSI IRQs */ #define XILINX_NUM_MSI_IRQS 128 @@ -183,15 +180,11 @@ static void __iomem *xilinx_pcie_map_bus(struct pci_bus *bus, unsigned int devfn, int where) { struct xilinx_pcie_port *port = bus->sysdata; - int relbus; if (!xilinx_pcie_valid_device(bus, devfn)) return NULL; - relbus = (bus->number << ECAM_BUS_NUM_SHIFT) | - (devfn << ECAM_DEV_NUM_SHIFT); - - return port->reg_base + relbus + where; + return port->reg_base + PCIE_ECAM_OFFSET(bus->number, devfn, where); } /* PCIe operations */ diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index f375c21ceeb1..1361a79bd1e7 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -328,15 +329,13 @@ static void vmd_remove_irq_domain(struct vmd_dev *vmd) static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) { - char __iomem *addr = vmd->cfgbar + - ((bus->number - vmd->busn_start) << 20) + - (devfn 
<< 12) + reg; + unsigned int busnr_ecam = bus->number - vmd->busn_start; + u32 offset = PCIE_ECAM_OFFSET(busnr_ecam, devfn, reg); - if ((addr - vmd->cfgbar) + len >= - resource_size(&vmd->dev->resource[VMD_CFGBAR])) + if (offset + len >= resource_size(&vmd->dev->resource[VMD_CFGBAR])) return NULL; - return addr; + return vmd->cfgbar + offset; } /* diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index b54d32a31669..d2a1920bb055 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -28,6 +28,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev, struct resource *cfgres, struct resource *busr, const struct pci_ecam_ops *ops) { + unsigned int bus_shift = ops->bus_shift; struct pci_config_window *cfg; unsigned int bus_range, bus_range_max, bsz; struct resource *conflict; @@ -40,20 +41,24 @@ struct pci_config_window *pci_ecam_create(struct device *dev, if (!cfg) return ERR_PTR(-ENOMEM); + /* ECAM-compliant platforms need not supply ops->bus_shift */ + if (!bus_shift) + bus_shift = PCIE_ECAM_BUS_SHIFT; + cfg->parent = dev; cfg->ops = ops; cfg->busr.start = busr->start; cfg->busr.end = busr->end; cfg->busr.flags = IORESOURCE_BUS; bus_range = resource_size(&cfg->busr); - bus_range_max = resource_size(cfgres) >> ops->bus_shift; + bus_range_max = resource_size(cfgres) >> bus_shift; if (bus_range > bus_range_max) { bus_range = bus_range_max; cfg->busr.end = busr->start + bus_range - 1; dev_warn(dev, "ECAM area %pR can only accommodate %pR (reduced from %pR desired)\n", cfgres, &cfg->busr, busr); } - bsz = 1 << ops->bus_shift; + bsz = 1 << bus_shift; cfg->res.start = cfgres->start; cfg->res.end = cfgres->end; @@ -131,25 +136,36 @@ void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn, int where) { struct pci_config_window *cfg = bus->sysdata; + unsigned int bus_shift = cfg->ops->bus_shift; unsigned int devfn_shift = cfg->ops->bus_shift - 8; unsigned int busn = bus->number; void __iomem *base; + u32 bus_offset, devfn_offset; if (busn < cfg->busr.start || busn > cfg->busr.end) return NULL; busn -= cfg->busr.start; - if (per_bus_mapping) + if (per_bus_mapping) { base = cfg->winp[busn]; - else - base = cfg->win + (busn << cfg->ops->bus_shift); - return base + (devfn << devfn_shift) + where; + busn = 0; + } else + base = cfg->win; + + if (cfg->ops->bus_shift) { + bus_offset = (busn & PCIE_ECAM_BUS_MASK) << bus_shift; + devfn_offset = (devfn & PCIE_ECAM_DEVFN_MASK) << devfn_shift; + where &= PCIE_ECAM_REG_MASK; + + return base + (bus_offset | devfn_offset | where); + } + + return base + PCIE_ECAM_OFFSET(busn, devfn, where); } EXPORT_SYMBOL_GPL(pci_ecam_map_bus); /* ECAM ops */ const struct pci_ecam_ops pci_generic_ecam_ops = { - .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, .read = pci_generic_config_read, @@ -161,7 +177,6 @@ EXPORT_SYMBOL_GPL(pci_generic_ecam_ops); #if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) /* ECAM ops for 32-bit access only (non-compliant) */ const struct pci_ecam_ops pci_32b_ops = { - .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, .read = pci_generic_config_read32, @@ -171,7 +186,6 @@ const struct pci_ecam_ops pci_32b_ops = { /* ECAM ops for 32-bit read only (non-compliant) */ const struct pci_ecam_ops pci_32b_read_ops = { - .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, .read = pci_generic_config_read32, diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 033ce74f02e8..65d3d83015c3 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -9,6 +9,33 @@ #include 
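All of the driver conversions above compute the same byte offset (bus number in bits 27:20, devfn in bits 19:12, register offset in bits 11:0), which the header below centralizes. A minimal stand-alone sketch of that arithmetic, in plain C compiled outside the kernel; ECAM_OFFSET and DEVFN here are local stand-ins for PCIE_ECAM_OFFSET and PCI_DEVFN, not the kernel macros themselves:

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins mirroring the layout of the PCIE_ECAM_* macros */
#define ECAM_OFFSET(bus, devfn, where) \
	((((uint32_t)(bus) & 0xff) << 20) | \
	 (((uint32_t)(devfn) & 0xff) << 12) | \
	 ((uint32_t)(where) & 0xfff))

/* Pack a 5-bit slot and a 3-bit function the way PCI_DEVFN does */
#define DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07))

int main(void)
{
	/* Bus 3, device 2, function 1, config register 0x44 */
	printf("0x%06x\n", ECAM_OFFSET(3, DEVFN(2, 1), 0x44));
	return 0;
}

This prints 0x311044. Each field is masked before shifting, so the three fields can never overlap, which is why the open-coded shift-and-or blocks removed above were safe to replace with one shared macro.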
#include +/* + * Memory address shift values for the byte-level address that + * can be used when accessing the PCI Express Configuration Space. + */ + +/* + * Enhanced Configuration Access Mechanism (ECAM) + * + * See PCI Express Base Specification, Revision 5.0, Version 1.0, + * Section 7.2.2, Table 7-1, p. 677. + */ +#define PCIE_ECAM_BUS_SHIFT 20 /* Bus number */ +#define PCIE_ECAM_DEVFN_SHIFT 12 /* Device and Function number */ + +#define PCIE_ECAM_BUS_MASK 0xff +#define PCIE_ECAM_DEVFN_MASK 0xff +#define PCIE_ECAM_REG_MASK 0xfff /* Limit offset to a maximum of 4K */ + +#define PCIE_ECAM_BUS(x) (((x) & PCIE_ECAM_BUS_MASK) << PCIE_ECAM_BUS_SHIFT) +#define PCIE_ECAM_DEVFN(x) (((x) & PCIE_ECAM_DEVFN_MASK) << PCIE_ECAM_DEVFN_SHIFT) +#define PCIE_ECAM_REG(x) ((x) & PCIE_ECAM_REG_MASK) + +#define PCIE_ECAM_OFFSET(bus, devfn, where) \ + (PCIE_ECAM_BUS(bus) | \ + PCIE_ECAM_DEVFN(devfn) | \ + PCIE_ECAM_REG(where)) + /* * struct to hold pci ops and bus shift of the config window * for a PCI controller. -- cgit From 9994bb3f36e3d181d9f0a078609038080cfd7a3d Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:13 +0200 Subject: mtd: nand: ecc-bch: Create the software BCH engine Let's continue introducing the generic ECC engine abstraction in the NAND subsystem by instantiating a first ECC engine: the software BCH one. While at it, make a very tidy ecc_sw_bch_init() function and move all the sanity checks and user input management into nand_ecc_sw_bch_init_ctx(). This second helper will be called from the raw NAND core. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-10-miquel.raynal@bootlin.com --- drivers/mtd/nand/ecc-sw-bch.c | 333 +++++++++++++++++++++++++++++------- drivers/mtd/nand/raw/nand_base.c | 62 +------ include/linux/mtd/nand-ecc-sw-bch.h | 16 +- include/linux/mtd/nand.h | 9 + 4 files changed, 296 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/ecc-sw-bch.c b/drivers/mtd/nand/ecc-sw-bch.c index 16a54bd2ca31..0a0ac11d5725 100644 --- a/drivers/mtd/nand/ecc-sw-bch.c +++ b/drivers/mtd/nand/ecc-sw-bch.c @@ -75,6 +75,19 @@ int nand_ecc_sw_bch_correct(struct nand_device *nand, unsigned char *buf, } EXPORT_SYMBOL(nand_ecc_sw_bch_correct); +/** + * nand_ecc_sw_bch_cleanup - Cleanup software BCH ECC resources + * @nand: NAND device + */ +static void nand_ecc_sw_bch_cleanup(struct nand_device *nand) +{ + struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv; + + bch_free(engine_conf->bch); + kfree(engine_conf->errloc); + kfree(engine_conf->eccmask); +} + /** * nand_ecc_sw_bch_init - Initialize software BCH ECC engine * @nand: NAND device @@ -92,107 +105,301 @@ EXPORT_SYMBOL(nand_ecc_sw_bch_correct); * step_size = 512 (thus, m = 13 is the smallest integer such that 2^m - 1 > 512 * 8) * bytes = 7 (7 bytes are required to store m * t = 13 * 4 = 52 bits) */ -int nand_ecc_sw_bch_init(struct nand_device *nand) +static int nand_ecc_sw_bch_init(struct nand_device *nand) { - struct mtd_info *mtd = nanddev_to_mtd(nand); - unsigned int m, t, eccsteps, i; struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv; - unsigned char *erased_page; unsigned int eccsize = nand->ecc.ctx.conf.step_size; unsigned int eccbytes = engine_conf->code_size; - unsigned int eccstrength = nand->ecc.ctx.conf.strength; - - if (!eccbytes && eccstrength) { - eccbytes = DIV_ROUND_UP(eccstrength * fls(8 * eccsize), 8); - engine_conf->code_size = eccbytes; - } - - if (!eccsize || !eccbytes) { - pr_warn("ecc parameters 
not supplied\n"); - return -EINVAL; - } + unsigned int m, t, i; + unsigned char *erased_page; + int ret; - m = fls(1+8*eccsize); - t = (eccbytes*8)/m; + m = fls(1 + (8 * eccsize)); + t = (eccbytes * 8) / m; engine_conf->bch = bch_init(m, t, 0, false); if (!engine_conf->bch) return -EINVAL; - /* verify that eccbytes has the expected value */ - if (engine_conf->bch->ecc_bytes != eccbytes) { - pr_warn("invalid eccbytes %u, should be %u\n", - eccbytes, engine_conf->bch->ecc_bytes); - goto fail; + engine_conf->eccmask = kzalloc(eccbytes, GFP_KERNEL); + engine_conf->errloc = kmalloc_array(t, sizeof(*engine_conf->errloc), + GFP_KERNEL); + if (!engine_conf->eccmask || !engine_conf->errloc) { + ret = -ENOMEM; + goto cleanup; + } + + /* Compute and store the inverted ECC of an erased step */ + erased_page = kmalloc(eccsize, GFP_KERNEL); + if (!erased_page) { + ret = -ENOMEM; + goto cleanup; } - eccsteps = mtd->writesize/eccsize; + memset(erased_page, 0xff, eccsize); + bch_encode(engine_conf->bch, erased_page, eccsize, + engine_conf->eccmask); + kfree(erased_page); + + for (i = 0; i < eccbytes; i++) + engine_conf->eccmask[i] ^= 0xff; - /* Check that we have an oob layout description. */ - if (!mtd->ooblayout) { - pr_warn("missing oob scheme"); - goto fail; + /* Verify that the number of code bytes has the expected value */ + if (engine_conf->bch->ecc_bytes != eccbytes) { + pr_err("Invalid number of ECC bytes: %u, expected: %u\n", + eccbytes, engine_conf->bch->ecc_bytes); + ret = -EINVAL; + goto cleanup; } - /* sanity checks */ - if (8*(eccsize+eccbytes) >= (1 << m)) { - pr_warn("eccsize %u is too large\n", eccsize); - goto fail; + /* Sanity checks */ + if (8 * (eccsize + eccbytes) >= (1 << m)) { + pr_err("ECC step size is too large (%u)\n", eccsize); + ret = -EINVAL; + goto cleanup; } - if (mtd_ooblayout_count_eccbytes(mtd) != (eccsteps*eccbytes)) { - pr_warn("invalid ecc layout\n"); - goto fail; + return 0; + +cleanup: + nand_ecc_sw_bch_cleanup(nand); + + return ret; +} + +int nand_ecc_sw_bch_init_ctx(struct nand_device *nand) +{ + struct nand_ecc_props *conf = &nand->ecc.ctx.conf; + struct mtd_info *mtd = nanddev_to_mtd(nand); + struct nand_ecc_sw_bch_conf *engine_conf; + unsigned int code_size = 0, nsteps; + int ret; + + /* Only large page NAND chips may use BCH */ + if (mtd->oobsize < 64) { + pr_err("BCH cannot be used with small page NAND chips\n"); + return -EINVAL; } - engine_conf->eccmask = kzalloc(eccbytes, GFP_KERNEL); - engine_conf->errloc = kmalloc_array(t, sizeof(*engine_conf->errloc), - GFP_KERNEL); - if (!engine_conf->eccmask || !engine_conf->errloc) - goto fail; + if (!mtd->ooblayout) + mtd_set_ooblayout(mtd, nand_get_large_page_ooblayout()); + + conf->engine_type = NAND_ECC_ENGINE_TYPE_SOFT; + conf->algo = NAND_ECC_ALGO_BCH; + conf->step_size = nand->ecc.user_conf.step_size; + conf->strength = nand->ecc.user_conf.strength; /* - * compute and store the inverted ecc of an erased ecc block + * Board driver should supply ECC size and ECC strength + * values to select how many bits are correctable. + * Otherwise, default to 512 bytes for large page devices and 256 for + * small page devices. 
*/ - erased_page = kmalloc(eccsize, GFP_KERNEL); - if (!erased_page) - goto fail; + if (!conf->step_size) { + if (mtd->oobsize >= 64) + conf->step_size = 512; + else + conf->step_size = 256; - memset(erased_page, 0xff, eccsize); - bch_encode(engine_conf->bch, erased_page, eccsize, - engine_conf->eccmask); - kfree(erased_page); + conf->strength = 4; + } - for (i = 0; i < eccbytes; i++) - engine_conf->eccmask[i] ^= 0xff; + nsteps = mtd->writesize / conf->step_size; + + /* Maximize */ + if (nand->ecc.user_conf.flags & NAND_ECC_MAXIMIZE_STRENGTH) { + conf->step_size = 1024; + nsteps = mtd->writesize / conf->step_size; + /* Reserve 2 bytes for the BBM */ + code_size = (mtd->oobsize - 2) / nsteps; + conf->strength = code_size * 8 / fls(8 * conf->step_size); + } + + if (!code_size) + code_size = DIV_ROUND_UP(conf->strength * + fls(8 * conf->step_size), 8); + + if (!conf->strength) + conf->strength = (code_size * 8) / fls(8 * conf->step_size); + + if (!code_size && !conf->strength) { + pr_err("Missing ECC parameters\n"); + return -EINVAL; + } + + engine_conf = kzalloc(sizeof(*engine_conf), GFP_KERNEL); + if (!engine_conf) + return -ENOMEM; + + ret = nand_ecc_init_req_tweaking(&engine_conf->req_ctx, nand); + if (ret) + goto free_engine_conf; + + engine_conf->code_size = code_size; + engine_conf->nsteps = nsteps; + engine_conf->calc_buf = kzalloc(mtd->oobsize, GFP_KERNEL); + engine_conf->code_buf = kzalloc(mtd->oobsize, GFP_KERNEL); + if (!engine_conf->calc_buf || !engine_conf->code_buf) { + ret = -ENOMEM; + goto free_bufs; + } - if (!eccstrength) - nand->ecc.ctx.conf.strength = (eccbytes * 8) / fls(8 * eccsize); + nand->ecc.ctx.priv = engine_conf; + nand->ecc.ctx.total = nsteps * code_size; + + ret = nand_ecc_sw_bch_init(nand); + if (ret) + goto free_bufs; + + /* Verify the layout validity */ + if (mtd_ooblayout_count_eccbytes(mtd) != + engine_conf->nsteps * engine_conf->code_size) { + pr_err("Invalid ECC layout\n"); + ret = -EINVAL; + goto cleanup_bch_ctx; + } return 0; -fail: +cleanup_bch_ctx: nand_ecc_sw_bch_cleanup(nand); +free_bufs: + nand_ecc_cleanup_req_tweaking(&engine_conf->req_ctx); + kfree(engine_conf->calc_buf); + kfree(engine_conf->code_buf); +free_engine_conf: + kfree(engine_conf); - return -EINVAL; + return ret; } -EXPORT_SYMBOL(nand_ecc_sw_bch_init); +EXPORT_SYMBOL(nand_ecc_sw_bch_init_ctx); -/** - * nand_ecc_sw_bch_cleanup - Cleanup software BCH ECC resources - * @nand: NAND device - */ -void nand_ecc_sw_bch_cleanup(struct nand_device *nand) +void nand_ecc_sw_bch_cleanup_ctx(struct nand_device *nand) { struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv; if (engine_conf) { - bch_free(engine_conf->bch); - kfree(engine_conf->errloc); - kfree(engine_conf->eccmask); + nand_ecc_sw_bch_cleanup(nand); + nand_ecc_cleanup_req_tweaking(&engine_conf->req_ctx); + kfree(engine_conf->calc_buf); + kfree(engine_conf->code_buf); + kfree(engine_conf); + } +} +EXPORT_SYMBOL(nand_ecc_sw_bch_cleanup_ctx); + +static int nand_ecc_sw_bch_prepare_io_req(struct nand_device *nand, + struct nand_page_io_req *req) +{ + struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv; + struct mtd_info *mtd = nanddev_to_mtd(nand); + int eccsize = nand->ecc.ctx.conf.step_size; + int eccbytes = engine_conf->code_size; + int eccsteps = engine_conf->nsteps; + int total = nand->ecc.ctx.total; + u8 *ecccalc = engine_conf->calc_buf; + const u8 *data; + int i; + + /* Nothing to do for a raw operation */ + if (req->mode == MTD_OPS_RAW) + return 0; + + /* This engine does not provide BBM/free OOB bytes protection 
*/ + if (!req->datalen) + return 0; + + nand_ecc_tweak_req(&engine_conf->req_ctx, req); + + /* No more preparation for page read */ + if (req->type == NAND_PAGE_READ) + return 0; + + /* Preparation for page write: derive the ECC bytes and place them */ + for (i = 0, data = req->databuf.out; + eccsteps; + eccsteps--, i += eccbytes, data += eccsize) + nand_ecc_sw_bch_calculate(nand, data, &ecccalc[i]); + + return mtd_ooblayout_set_eccbytes(mtd, ecccalc, (void *)req->oobbuf.out, + 0, total); +} + +static int nand_ecc_sw_bch_finish_io_req(struct nand_device *nand, + struct nand_page_io_req *req) +{ + struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv; + struct mtd_info *mtd = nanddev_to_mtd(nand); + int eccsize = nand->ecc.ctx.conf.step_size; + int total = nand->ecc.ctx.total; + int eccbytes = engine_conf->code_size; + int eccsteps = engine_conf->nsteps; + u8 *ecccalc = engine_conf->calc_buf; + u8 *ecccode = engine_conf->code_buf; + unsigned int max_bitflips = 0; + u8 *data = req->databuf.in; + int i, ret; + + /* Nothing to do for a raw operation */ + if (req->mode == MTD_OPS_RAW) + return 0; + + /* This engine does not provide BBM/free OOB bytes protection */ + if (!req->datalen) + return 0; + + /* No more preparation for page write */ + if (req->type == NAND_PAGE_WRITE) { + nand_ecc_restore_req(&engine_conf->req_ctx, req); + return 0; } + + /* Finish a page read: retrieve the (raw) ECC bytes*/ + ret = mtd_ooblayout_get_eccbytes(mtd, ecccode, req->oobbuf.in, 0, + total); + if (ret) + return ret; + + /* Calculate the ECC bytes */ + for (i = 0; eccsteps; eccsteps--, i += eccbytes, data += eccsize) + nand_ecc_sw_bch_calculate(nand, data, &ecccalc[i]); + + /* Finish a page read: compare and correct */ + for (eccsteps = engine_conf->nsteps, i = 0, data = req->databuf.in; + eccsteps; + eccsteps--, i += eccbytes, data += eccsize) { + int stat = nand_ecc_sw_bch_correct(nand, data, + &ecccode[i], + &ecccalc[i]); + if (stat < 0) { + mtd->ecc_stats.failed++; + } else { + mtd->ecc_stats.corrected += stat; + max_bitflips = max_t(unsigned int, max_bitflips, stat); + } + } + + nand_ecc_restore_req(&engine_conf->req_ctx, req); + + return max_bitflips; +} + +static struct nand_ecc_engine_ops nand_ecc_sw_bch_engine_ops = { + .init_ctx = nand_ecc_sw_bch_init_ctx, + .cleanup_ctx = nand_ecc_sw_bch_cleanup_ctx, + .prepare_io_req = nand_ecc_sw_bch_prepare_io_req, + .finish_io_req = nand_ecc_sw_bch_finish_io_req, +}; + +static struct nand_ecc_engine nand_ecc_sw_bch_engine = { + .ops = &nand_ecc_sw_bch_engine_ops, +}; + +struct nand_ecc_engine *nand_ecc_sw_bch_get_engine(void) +{ + return &nand_ecc_sw_bch_engine; } -EXPORT_SYMBOL(nand_ecc_sw_bch_cleanup); +EXPORT_SYMBOL(nand_ecc_sw_bch_get_engine); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ivan Djelic "); diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 03106bf629dd..ebaf3bbfc1b2 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5150,17 +5150,11 @@ int rawnand_sw_bch_init(struct nand_chip *chip) base->ecc.user_conf.step_size = chip->ecc.size; base->ecc.user_conf.strength = chip->ecc.strength; - engine_conf = kzalloc(sizeof(*engine_conf), GFP_KERNEL); - if (!engine_conf) - return -ENOMEM; - - engine_conf->code_size = chip->ecc.bytes; - - base->ecc.ctx.priv = engine_conf; - - ret = nand_ecc_sw_bch_init(base); + ret = nand_ecc_sw_bch_init_ctx(base); if (ret) - kfree(base->ecc.ctx.priv); + return ret; + + engine_conf = base->ecc.ctx.priv; chip->ecc.size = base->ecc.ctx.conf.step_size; 
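The BCH geometry used above follows the m/t arithmetic spelled out in the kernel-doc. A small user-space sketch reproduces the documented example for a 512-byte step and strength 4; fls() and DIV_ROUND_UP() are reimplemented locally only because kernel headers are not available outside the tree:

#include <stdio.h>

/* User-space stand-in for the kernel's fls(): position of the highest set bit */
static unsigned int fls32(unsigned int x)
{
	unsigned int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int step_size = 512, strength = 4;
	unsigned int m = fls32(1 + 8 * step_size);
	unsigned int code_size = DIV_ROUND_UP(strength * fls32(8 * step_size), 8);
	unsigned int t = (code_size * 8) / m;

	printf("m=%u code_size=%u t=%u\n", m, code_size, t);
	return 0;
}

This prints m=13 code_size=7 t=4, matching the example in the comment above nand_ecc_sw_bch_init(): 13 is the smallest m with 2^m - 1 > 512 * 8, and 7 bytes hold the m * t = 52 code bits.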
chip->ecc.strength = base->ecc.ctx.conf.strength; @@ -5168,7 +5162,7 @@ int rawnand_sw_bch_init(struct nand_chip *chip) chip->ecc.steps = engine_conf->nsteps; chip->ecc.bytes = engine_conf->code_size; - return ret; + return 0; } EXPORT_SYMBOL(rawnand_sw_bch_init); @@ -5194,9 +5188,7 @@ void rawnand_sw_bch_cleanup(struct nand_chip *chip) { struct nand_device *base = &chip->base; - nand_ecc_sw_bch_cleanup(base); - - kfree(base->ecc.ctx.priv); + nand_ecc_sw_bch_cleanup_ctx(base); } EXPORT_SYMBOL(rawnand_sw_bch_cleanup); @@ -5308,51 +5300,15 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip) ecc->read_oob = nand_read_oob_std; ecc->write_oob = nand_write_oob_std; - /* - * Board driver should supply ecc.size and ecc.strength - * values to select how many bits are correctable. - * Otherwise, default to 4 bits for large page devices. - */ - if (!ecc->size && (mtd->oobsize >= 64)) { - ecc->size = 512; - ecc->strength = 4; - } - - /* - * if no ecc placement scheme was provided pickup the default - * large page one. - */ - if (!mtd->ooblayout) { - /* handle large page devices only */ - if (mtd->oobsize < 64) { - WARN(1, "OOB layout is required when using software BCH on small pages\n"); - return -EINVAL; - } - - mtd_set_ooblayout(mtd, nand_get_large_page_ooblayout()); - - } - /* * We can only maximize ECC config when the default layout is * used, otherwise we don't know how many bytes can really be * used. */ - if (mtd->ooblayout == nand_get_large_page_ooblayout() && - nanddev->ecc.user_conf.flags & NAND_ECC_MAXIMIZE_STRENGTH) { - int steps, bytes; - - /* Always prefer 1k blocks over 512bytes ones */ - ecc->size = 1024; - steps = mtd->writesize / ecc->size; + if (nanddev->ecc.user_conf.flags & NAND_ECC_MAXIMIZE_STRENGTH && + mtd->ooblayout != nand_get_large_page_ooblayout()) + nanddev->ecc.user_conf.flags &= ~NAND_ECC_MAXIMIZE_STRENGTH; - /* Reserve 2 bytes for the BBM */ - bytes = (mtd->oobsize - 2) / steps; - ecc->strength = bytes * 8 / fls(8 * ecc->size); - } - - /* See the software BCH ECC initialization for details */ - ecc->bytes = 0; ret = rawnand_sw_bch_init(chip); if (ret) { WARN(1, "BCH ECC initialization failed!\n"); diff --git a/include/linux/mtd/nand-ecc-sw-bch.h b/include/linux/mtd/nand-ecc-sw-bch.h index ce005528e55f..22c92073b3dd 100644 --- a/include/linux/mtd/nand-ecc-sw-bch.h +++ b/include/linux/mtd/nand-ecc-sw-bch.h @@ -13,8 +13,8 @@ /** * struct nand_ecc_sw_bch_conf - private software BCH ECC engine structure - * @reqooblen: Save the actual user OOB length requested before overwriting it - * @spare_oobbuf: Spare OOB buffer if none is provided + * @req_ctx: Save request context and tweak the original request to fit the + * engine needs * @code_size: Number of bytes needed to store a code (one code per step) * @nsteps: Number of steps * @calc_buf: Buffer to use when calculating ECC bytes @@ -24,8 +24,7 @@ * @eccmask: XOR ecc mask, allows erased pages to be decoded as valid */ struct nand_ecc_sw_bch_conf { - unsigned int reqooblen; - void *spare_oobbuf; + struct nand_ecc_req_tweak_ctx req_ctx; unsigned int code_size; unsigned int nsteps; u8 *calc_buf; @@ -41,8 +40,9 @@ int nand_ecc_sw_bch_calculate(struct nand_device *nand, const unsigned char *buf, unsigned char *code); int nand_ecc_sw_bch_correct(struct nand_device *nand, unsigned char *buf, unsigned char *read_ecc, unsigned char *calc_ecc); -int nand_ecc_sw_bch_init(struct nand_device *nand); -void nand_ecc_sw_bch_cleanup(struct nand_device *nand); +int nand_ecc_sw_bch_init_ctx(struct nand_device *nand); +void 
nand_ecc_sw_bch_cleanup_ctx(struct nand_device *nand); +struct nand_ecc_engine *nand_ecc_sw_bch_get_engine(void); #else /* !CONFIG_MTD_NAND_ECC_SW_BCH */ @@ -61,12 +61,12 @@ static inline int nand_ecc_sw_bch_correct(struct nand_device *nand, return -ENOTSUPP; } -static inline int nand_ecc_sw_bch_init(struct nand_device *nand) +static inline int nand_ecc_sw_bch_init_ctx(struct nand_device *nand) { return -ENOTSUPP; } -static inline void nand_ecc_sw_bch_cleanup(struct nand_device *nand) {} +static inline void nand_ecc_sw_bch_cleanup_ctx(struct nand_device *nand) {} #endif /* CONFIG_MTD_NAND_ECC_SW_BCH */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 36e4fe08d0ea..df8548187713 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -278,6 +278,15 @@ int nand_ecc_finish_io_req(struct nand_device *nand, struct nand_page_io_req *req); bool nand_ecc_is_strong_enough(struct nand_device *nand); +#if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH) +struct nand_ecc_engine *nand_ecc_sw_bch_get_engine(void); +#else +static inline struct nand_ecc_engine *nand_ecc_sw_bch_get_engine(void) +{ + return NULL; +} +#endif /* CONFIG_MTD_NAND_ECC_SW_BCH */ + /** * struct nand_ecc_req_tweak_ctx - Help for automatically tweaking requests * @orig_req: Pointer to the original IO request -- cgit From cbd87780bed580b585d2992f29077ac44950cb66 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:14 +0200 Subject: mtd: rawnand: Get rid of chip->ecc.priv nand_ecc_ctrl embeds a private pointer which only has a meaning in the sunxi driver. This structure will soon be deprecated, but as this field is actually not needed, let's just drop it. Cc: Maxime Ripard Cc: Chen-Yu Tsai Signed-off-by: Miquel Raynal Acked-by: Maxime Ripard Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-11-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/sunxi_nand.c | 31 +++++++++++++++++-------------- include/linux/mtd/rawnand.h | 2 -- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/sunxi_nand.c b/drivers/mtd/nand/raw/sunxi_nand.c index 2a7ca3072f35..540529540ade 100644 --- a/drivers/mtd/nand/raw/sunxi_nand.c +++ b/drivers/mtd/nand/raw/sunxi_nand.c @@ -182,6 +182,7 @@ struct sunxi_nand_hw_ecc { * * @node: used to store NAND chips into a list * @nand: base NAND chip structure + * @ecc: ECC controller structure * @clk_rate: clk_rate required for this NAND chip * @timing_cfg: TIMING_CFG register value for this NAND chip * @timing_ctl: TIMING_CTL register value for this NAND chip @@ -191,6 +192,7 @@ struct sunxi_nand_hw_ecc { struct sunxi_nand_chip { struct list_head node; struct nand_chip nand; + struct sunxi_nand_hw_ecc *ecc; unsigned long clk_rate; u32 timing_cfg; u32 timing_ctl; @@ -676,15 +678,15 @@ static void sunxi_nfc_randomizer_read_buf(struct nand_chip *nand, uint8_t *buf, static void sunxi_nfc_hw_ecc_enable(struct nand_chip *nand) { + struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand); struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller); - struct sunxi_nand_hw_ecc *data = nand->ecc.priv; u32 ecc_ctl; ecc_ctl = readl(nfc->regs + NFC_REG_ECC_CTL); ecc_ctl &= ~(NFC_ECC_MODE_MSK | NFC_ECC_PIPELINE | NFC_ECC_BLOCK_SIZE_MSK); - ecc_ctl |= NFC_ECC_EN | NFC_ECC_MODE(data->mode) | NFC_ECC_EXCEPTION | - NFC_ECC_PIPELINE; + ecc_ctl |= NFC_ECC_EN | NFC_ECC_MODE(sunxi_nand->ecc->mode) | + NFC_ECC_EXCEPTION | NFC_ECC_PIPELINE; if (nand->ecc.size == 512) ecc_ctl |= NFC_ECC_BLOCK_512; @@ -1597,9 +1599,9 @@ static const 
struct mtd_ooblayout_ops sunxi_nand_ooblayout_ops = { .free = sunxi_nand_ooblayout_free, }; -static void sunxi_nand_hw_ecc_ctrl_cleanup(struct nand_ecc_ctrl *ecc) +static void sunxi_nand_hw_ecc_ctrl_cleanup(struct sunxi_nand_chip *sunxi_nand) { - kfree(ecc->priv); + kfree(sunxi_nand->ecc); } static int sunxi_nand_hw_ecc_ctrl_init(struct nand_chip *nand, @@ -1607,10 +1609,10 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct nand_chip *nand, struct device_node *np) { static const u8 strengths[] = { 16, 24, 28, 32, 40, 48, 56, 60, 64 }; + struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand); struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller); struct mtd_info *mtd = nand_to_mtd(nand); struct nand_device *nanddev = mtd_to_nanddev(mtd); - struct sunxi_nand_hw_ecc *data; int nsectors; int ret; int i; @@ -1647,8 +1649,8 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct nand_chip *nand, if (ecc->size != 512 && ecc->size != 1024) return -EINVAL; - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) + sunxi_nand->ecc = kzalloc(sizeof(*sunxi_nand->ecc), GFP_KERNEL); + if (!sunxi_nand->ecc) return -ENOMEM; /* Prefer 1k ECC chunk over 512 ones */ @@ -1675,7 +1677,7 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct nand_chip *nand, goto err; } - data->mode = i; + sunxi_nand->ecc->mode = i; /* HW ECC always request ECC bytes for 1024 bytes blocks */ ecc->bytes = DIV_ROUND_UP(ecc->strength * fls(8 * 1024), 8); @@ -1693,7 +1695,6 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct nand_chip *nand, ecc->read_oob = sunxi_nfc_hw_ecc_read_oob; ecc->write_oob = sunxi_nfc_hw_ecc_write_oob; mtd_set_ooblayout(mtd, &sunxi_nand_ooblayout_ops); - ecc->priv = data; if (nfc->dmac) { ecc->read_page = sunxi_nfc_hw_ecc_read_page_dma; @@ -1714,16 +1715,18 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct nand_chip *nand, return 0; err: - kfree(data); + kfree(sunxi_nand->ecc); return ret; } -static void sunxi_nand_ecc_cleanup(struct nand_ecc_ctrl *ecc) +static void sunxi_nand_ecc_cleanup(struct sunxi_nand_chip *sunxi_nand) { + struct nand_ecc_ctrl *ecc = &sunxi_nand->nand.ecc; + switch (ecc->engine_type) { case NAND_ECC_ENGINE_TYPE_ON_HOST: - sunxi_nand_hw_ecc_ctrl_cleanup(ecc); + sunxi_nand_hw_ecc_ctrl_cleanup(sunxi_nand); break; case NAND_ECC_ENGINE_TYPE_NONE: default: @@ -2053,7 +2056,7 @@ static void sunxi_nand_chips_cleanup(struct sunxi_nfc *nfc) ret = mtd_device_unregister(nand_to_mtd(chip)); WARN_ON(ret); nand_cleanup(chip); - sunxi_nand_ecc_cleanup(&chip->ecc); + sunxi_nand_ecc_cleanup(sunxi_nand); list_del(&sunxi_nand->node); } } diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 23623beaad1d..15743c2c2cab 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -302,7 +302,6 @@ static const struct nand_ecc_caps __name = { \ * @prepad: padding information for syndrome based ECC generators * @postpad: padding information for syndrome based ECC generators * @options: ECC specific options (see NAND_ECC_XXX flags defined above) - * @priv: pointer to private ECC control data * @calc_buf: buffer for calculated ECC, size is oobsize. * @code_buf: buffer for ECC read from flash, size is oobsize. * @hwctl: function to control hardware ECC generator. 
Must only @@ -355,7 +354,6 @@ struct nand_ecc_ctrl { int prepad; int postpad; unsigned int options; - void *priv; u8 *calc_buf; u8 *code_buf; void (*hwctl)(struct nand_chip *chip, int mode); -- cgit From e5acf9c862974041f7b2f581d1a40ccd29769add Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:15 +0200 Subject: mtd: nand: ecc-hamming: Move Hamming code to the generic NAND layer Hamming ECC code might be later re-used by the SPI NAND layer. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-12-miquel.raynal@bootlin.com --- arch/arm/mach-s3c/common-smdk-s3c24xx.c | 2 +- arch/arm/mach-s3c/mach-anubis.c | 2 +- arch/arm/mach-s3c/mach-at2440evb.c | 2 +- arch/arm/mach-s3c/mach-bast.c | 2 +- arch/arm/mach-s3c/mach-gta02.c | 2 +- arch/arm/mach-s3c/mach-jive.c | 2 +- arch/arm/mach-s3c/mach-mini2440.c | 2 +- arch/arm/mach-s3c/mach-osiris.c | 2 +- arch/arm/mach-s3c/mach-qt2410.c | 2 +- arch/arm/mach-s3c/mach-rx3715.c | 2 +- arch/arm/mach-s3c/mach-vstms.c | 2 +- drivers/mtd/Kconfig | 1 + drivers/mtd/nand/Kconfig | 11 + drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/ecc-sw-hamming.c | 484 ++++++++++++++++++++++++++++++++ drivers/mtd/nand/raw/Kconfig | 11 - drivers/mtd/nand/raw/Makefile | 1 - drivers/mtd/nand/raw/cs553x_nand.c | 2 +- drivers/mtd/nand/raw/fsl_elbc_nand.c | 2 +- drivers/mtd/nand/raw/fsl_ifc_nand.c | 2 +- drivers/mtd/nand/raw/fsl_upm.c | 2 +- drivers/mtd/nand/raw/fsmc_nand.c | 2 +- drivers/mtd/nand/raw/lpc32xx_mlc.c | 2 +- drivers/mtd/nand/raw/lpc32xx_slc.c | 2 +- drivers/mtd/nand/raw/mxic_nand.c | 2 +- drivers/mtd/nand/raw/nand_base.c | 2 +- drivers/mtd/nand/raw/nand_ecc.c | 484 -------------------------------- drivers/mtd/nand/raw/ndfc.c | 2 +- drivers/mtd/nand/raw/pasemi_nand.c | 2 +- drivers/mtd/nand/raw/s3c2410.c | 2 +- drivers/mtd/nand/raw/sharpsl.c | 2 +- drivers/mtd/nand/raw/tmio_nand.c | 2 +- drivers/mtd/nand/raw/txx9ndfmc.c | 2 +- drivers/mtd/sm_ftl.c | 2 +- drivers/mtd/tests/mtd_nandecctest.c | 2 +- include/linux/mtd/nand-ecc-sw-hamming.h | 39 +++ include/linux/mtd/nand_ecc.h | 39 --- include/linux/mtd/sharpsl.h | 2 +- 38 files changed, 565 insertions(+), 564 deletions(-) create mode 100644 drivers/mtd/nand/ecc-sw-hamming.c delete mode 100644 drivers/mtd/nand/raw/nand_ecc.c create mode 100644 include/linux/mtd/nand-ecc-sw-hamming.h delete mode 100644 include/linux/mtd/nand_ecc.h (limited to 'include/linux') diff --git a/arch/arm/mach-s3c/common-smdk-s3c24xx.c b/arch/arm/mach-s3c/common-smdk-s3c24xx.c index f860d8bcba0e..6d124bbd384c 100644 --- a/arch/arm/mach-s3c/common-smdk-s3c24xx.c +++ b/arch/arm/mach-s3c/common-smdk-s3c24xx.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c/mach-anubis.c b/arch/arm/mach-s3c/mach-anubis.c index 90e3fd98a3ac..04147cc0adcc 100644 --- a/arch/arm/mach-s3c/mach-anubis.c +++ b/arch/arm/mach-s3c/mach-anubis.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c/mach-at2440evb.c b/arch/arm/mach-s3c/mach-at2440evb.c index 5fa49d4e2650..c6a5a51d84aa 100644 --- a/arch/arm/mach-s3c/mach-at2440evb.c +++ b/arch/arm/mach-s3c/mach-at2440evb.c @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include "devs.h" diff --git a/arch/arm/mach-s3c/mach-bast.c b/arch/arm/mach-s3c/mach-bast.c index 328f5d9ae9f9..27e8d5950228 100644 --- a/arch/arm/mach-s3c/mach-bast.c +++ b/arch/arm/mach-s3c/mach-bast.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include diff 
--git a/arch/arm/mach-s3c/mach-gta02.c b/arch/arm/mach-s3c/mach-gta02.c index 3c75c7d112ea..aec8b451c016 100644 --- a/arch/arm/mach-s3c/mach-gta02.c +++ b/arch/arm/mach-s3c/mach-gta02.c @@ -37,7 +37,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c/mach-jive.c b/arch/arm/mach-s3c/mach-jive.c index 2a29c3eca559..0785638a9069 100644 --- a/arch/arm/mach-s3c/mach-jive.c +++ b/arch/arm/mach-s3c/mach-jive.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include #include "gpio-cfg.h" diff --git a/arch/arm/mach-s3c/mach-mini2440.c b/arch/arm/mach-s3c/mach-mini2440.c index dc22ab839b95..4100905dfbd0 100644 --- a/arch/arm/mach-s3c/mach-mini2440.c +++ b/arch/arm/mach-s3c/mach-mini2440.c @@ -44,7 +44,7 @@ #include #include -#include +#include #include #include "gpio-cfg.h" diff --git a/arch/arm/mach-s3c/mach-osiris.c b/arch/arm/mach-s3c/mach-osiris.c index 81744ca67d1d..3aefb9d22340 100644 --- a/arch/arm/mach-s3c/mach-osiris.c +++ b/arch/arm/mach-s3c/mach-osiris.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include "cpu.h" diff --git a/arch/arm/mach-s3c/mach-qt2410.c b/arch/arm/mach-s3c/mach-qt2410.c index 151e8e373d40..f88b961798fd 100644 --- a/arch/arm/mach-s3c/mach-qt2410.c +++ b/arch/arm/mach-s3c/mach-qt2410.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c/mach-rx3715.c b/arch/arm/mach-s3c/mach-rx3715.c index a03662a47b38..9fd2d9dc3689 100644 --- a/arch/arm/mach-s3c/mach-rx3715.c +++ b/arch/arm/mach-s3c/mach-rx3715.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c/mach-vstms.c b/arch/arm/mach-s3c/mach-vstms.c index 05f19f5ffabb..ec024af7b0ce 100644 --- a/arch/arm/mach-s3c/mach-vstms.c +++ b/arch/arm/mach-s3c/mach-vstms.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index 6ddab796216d..8bab6f8718a9 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -152,6 +152,7 @@ config SM_FTL tristate "SmartMedia/xD new translation layer" depends on BLOCK select MTD_BLKDEVS + select MTD_NAND_CORE select MTD_NAND_ECC_SW_HAMMING help This enables EXPERIMENTAL R/W support for SmartMedia/xD diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 55c17fb0dee1..306c33ca3448 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -15,6 +15,17 @@ config MTD_NAND_ECC bool depends on MTD_NAND_CORE +config MTD_NAND_ECC_SW_HAMMING + bool + +config MTD_NAND_ECC_SW_HAMMING_SMC + bool "NAND ECC Smart Media byte order" + depends on MTD_NAND_ECC_SW_HAMMING + default n + help + Software ECC according to the Smart Media Specification. + The original Linux implementation had byte 0 and 1 swapped. 
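The byte order this option selects only becomes concrete in __nand_calculate_ecc() further down: the two row-parity bytes land either in code[0]/code[1] (Smart Media order) or in the opposite slots (the historical Linux order the help text refers to). A toy sketch of just that placement, with made-up parity values:

#include <stdbool.h>
#include <stdio.h>

/* Toy model: p_low gathers rp0..rp7 parity, p_high gathers rp8..rp15 */
static void place_ecc(unsigned char p_low, unsigned char p_high,
		      bool sm_order, unsigned char code[2])
{
	if (sm_order) {
		/* Smart Media specification order */
		code[0] = p_low;
		code[1] = p_high;
	} else {
		/* Historical Linux order: bytes 0 and 1 swapped */
		code[1] = p_low;
		code[0] = p_high;
	}
}

int main(void)
{
	unsigned char code[2];

	place_ecc(0xa5, 0x3c, true, code);
	printf("SM order:    %02x %02x\n", code[0], code[1]);
	place_ecc(0xa5, 0x3c, false, code);
	printf("Linux order: %02x %02x\n", code[0], code[1]);
	return 0;
}

Both orders protect the data equally well; the option only matters when the medium must be interchangeable with readers that expect the Smart Media layout.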
+ config MTD_NAND_ECC_SW_BCH bool "Software BCH ECC engine" select BCH diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index c7179ff23753..1c0b46960eb1 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -8,4 +8,5 @@ obj-y += raw/ obj-y += spi/ nandcore-$(CONFIG_MTD_NAND_ECC) += ecc.o +nandcore-$(CONFIG_MTD_NAND_ECC_SW_HAMMING) += ecc-sw-hamming.o nandcore-$(CONFIG_MTD_NAND_ECC_SW_BCH) += ecc-sw-bch.o diff --git a/drivers/mtd/nand/ecc-sw-hamming.c b/drivers/mtd/nand/ecc-sw-hamming.c new file mode 100644 index 000000000000..76966f5ff13f --- /dev/null +++ b/drivers/mtd/nand/ecc-sw-hamming.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains an ECC algorithm that detects and corrects 1 bit + * errors in a 256 byte block of data. + * + * Copyright © 2008 Koninklijke Philips Electronics NV. + * Author: Frans Meulenbroeks + * + * Completely replaces the previous ECC implementation which was written by: + * Steven J. Hill (sjhill@realitydiluted.com) + * Thomas Gleixner (tglx@linutronix.de) + * + * Information on how this algorithm works and how it was developed + * can be found in Documentation/driver-api/mtd/nand_ecc.rst + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * invparity is a 256 byte table that contains the odd parity + * for each byte. So if the number of bits in a byte is even, + * the array element is 1, and when the number of bits is odd + * the array eleemnt is 0. + */ +static const char invparity[256] = { + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1 +}; + +/* + * bitsperbyte contains the number of bits per byte + * this is only used for testing and repairing parity + * (a precalculated value slightly improves performance) + */ +static const char bitsperbyte[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; + +/* + * addressbits is a lookup table to filter out the bits from the xor-ed + * ECC data that identify the faulty location. 
+ * this is only used for repairing parity + * see the comments in nand_correct_data for more details + */ +static const char addressbits[256] = { + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, + 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, + 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, + 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, + 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, + 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, + 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, + 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, + 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, + 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, + 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, + 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, + 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, + 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, + 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, + 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, + 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, + 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, + 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, + 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, + 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, + 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, + 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, + 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, + 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, + 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, + 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, + 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, + 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f +}; + +/** + * __nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte + * block + * @buf: input buffer with raw data + * @eccsize: data bytes per ECC step (256 or 512) + * @code: output buffer with ECC + * @sm_order: Smart Media byte ordering + */ +void __nand_calculate_ecc(const unsigned char *buf, unsigned int eccsize, + unsigned char *code, bool sm_order) +{ + int i; + const uint32_t *bp = (uint32_t *)buf; + /* 256 or 512 bytes/ecc */ + const uint32_t eccsize_mult = eccsize >> 8; + uint32_t cur; /* current value in buffer */ + /* rp0..rp15..rp17 are the various accumulated parities (per byte) */ + uint32_t rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7; + uint32_t rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15, rp16; + uint32_t rp17; + uint32_t par; /* the cumulative parity for all data */ + uint32_t tmppar; /* the cumulative parity for this iteration; + for rp12, rp14 and rp16 at the end of the + loop */ + + par = 0; + rp4 = 0; + rp6 = 0; + rp8 = 0; + rp10 = 0; + rp12 = 0; + rp14 = 0; + rp16 = 0; + + /* + * The loop is unrolled a number of times; + * This avoids if statements to decide on which rp value to update + * Also we process the data by longwords. + * Note: passing unaligned data might give a performance penalty. + * It is assumed that the buffers are aligned. + * tmppar is the cumulative sum of this iteration. 
+ * needed for calculating rp12, rp14, rp16 and par + * also used as a performance improvement for rp6, rp8 and rp10 + */ + for (i = 0; i < eccsize_mult << 2; i++) { + cur = *bp++; + tmppar = cur; + rp4 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp6 ^= tmppar; + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp8 ^= tmppar; + + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + rp6 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp6 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp10 ^= tmppar; + + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + rp6 ^= cur; + rp8 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp6 ^= cur; + rp8 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + rp8 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp8 ^= cur; + + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + rp6 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp6 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + cur = *bp++; + tmppar ^= cur; + + par ^= tmppar; + if ((i & 0x1) == 0) + rp12 ^= tmppar; + if ((i & 0x2) == 0) + rp14 ^= tmppar; + if (eccsize_mult == 2 && (i & 0x4) == 0) + rp16 ^= tmppar; + } + + /* + * handle the fact that we use longword operations + * we'll bring rp4..rp14..rp16 back to single byte entities by + * shifting and xoring first fold the upper and lower 16 bits, + * then the upper and lower 8 bits. + */ + rp4 ^= (rp4 >> 16); + rp4 ^= (rp4 >> 8); + rp4 &= 0xff; + rp6 ^= (rp6 >> 16); + rp6 ^= (rp6 >> 8); + rp6 &= 0xff; + rp8 ^= (rp8 >> 16); + rp8 ^= (rp8 >> 8); + rp8 &= 0xff; + rp10 ^= (rp10 >> 16); + rp10 ^= (rp10 >> 8); + rp10 &= 0xff; + rp12 ^= (rp12 >> 16); + rp12 ^= (rp12 >> 8); + rp12 &= 0xff; + rp14 ^= (rp14 >> 16); + rp14 ^= (rp14 >> 8); + rp14 &= 0xff; + if (eccsize_mult == 2) { + rp16 ^= (rp16 >> 16); + rp16 ^= (rp16 >> 8); + rp16 &= 0xff; + } + + /* + * we also need to calculate the row parity for rp0..rp3 + * This is present in par, because par is now + * rp3 rp3 rp2 rp2 in little endian and + * rp2 rp2 rp3 rp3 in big endian + * as well as + * rp1 rp0 rp1 rp0 in little endian and + * rp0 rp1 rp0 rp1 in big endian + * First calculate rp2 and rp3 + */ +#ifdef __BIG_ENDIAN + rp2 = (par >> 16); + rp2 ^= (rp2 >> 8); + rp2 &= 0xff; + rp3 = par & 0xffff; + rp3 ^= (rp3 >> 8); + rp3 &= 0xff; +#else + rp3 = (par >> 16); + rp3 ^= (rp3 >> 8); + rp3 &= 0xff; + rp2 = par & 0xffff; + rp2 ^= (rp2 >> 8); + rp2 &= 0xff; +#endif + + /* reduce par to 16 bits then calculate rp1 and rp0 */ + par ^= (par >> 16); +#ifdef __BIG_ENDIAN + rp0 = (par >> 8) & 0xff; + rp1 = (par & 0xff); +#else + rp1 = (par >> 8) & 0xff; + rp0 = (par & 0xff); +#endif + + /* finally reduce par to 8 bits */ + par ^= (par >> 8); + par &= 0xff; + + /* + * and calculate rp5..rp15..rp17 + * note that par = rp4 ^ rp5 and due to the commutative property + * of the ^ operator we can say: + * rp5 = (par ^ rp4); + * The & 0xff seems superfluous, but benchmarking learned that + * leaving it out gives slightly worse results. No idea why, probably + * it has to do with the way the pipeline in pentium is organized. + */ + rp5 = (par ^ rp4) & 0xff; + rp7 = (par ^ rp6) & 0xff; + rp9 = (par ^ rp8) & 0xff; + rp11 = (par ^ rp10) & 0xff; + rp13 = (par ^ rp12) & 0xff; + rp15 = (par ^ rp14) & 0xff; + if (eccsize_mult == 2) + rp17 = (par ^ rp16) & 0xff; + + /* + * Finally calculate the ECC bits. 
+ * Again here it might seem that there are performance optimisations + * possible, but benchmarks showed that on the system this is developed + * the code below is the fastest + */ + if (sm_order) { + code[0] = (invparity[rp7] << 7) | (invparity[rp6] << 6) | + (invparity[rp5] << 5) | (invparity[rp4] << 4) | + (invparity[rp3] << 3) | (invparity[rp2] << 2) | + (invparity[rp1] << 1) | (invparity[rp0]); + code[1] = (invparity[rp15] << 7) | (invparity[rp14] << 6) | + (invparity[rp13] << 5) | (invparity[rp12] << 4) | + (invparity[rp11] << 3) | (invparity[rp10] << 2) | + (invparity[rp9] << 1) | (invparity[rp8]); + } else { + code[1] = (invparity[rp7] << 7) | (invparity[rp6] << 6) | + (invparity[rp5] << 5) | (invparity[rp4] << 4) | + (invparity[rp3] << 3) | (invparity[rp2] << 2) | + (invparity[rp1] << 1) | (invparity[rp0]); + code[0] = (invparity[rp15] << 7) | (invparity[rp14] << 6) | + (invparity[rp13] << 5) | (invparity[rp12] << 4) | + (invparity[rp11] << 3) | (invparity[rp10] << 2) | + (invparity[rp9] << 1) | (invparity[rp8]); + } + + if (eccsize_mult == 1) + code[2] = + (invparity[par & 0xf0] << 7) | + (invparity[par & 0x0f] << 6) | + (invparity[par & 0xcc] << 5) | + (invparity[par & 0x33] << 4) | + (invparity[par & 0xaa] << 3) | + (invparity[par & 0x55] << 2) | + 3; + else + code[2] = + (invparity[par & 0xf0] << 7) | + (invparity[par & 0x0f] << 6) | + (invparity[par & 0xcc] << 5) | + (invparity[par & 0x33] << 4) | + (invparity[par & 0xaa] << 3) | + (invparity[par & 0x55] << 2) | + (invparity[rp17] << 1) | + (invparity[rp16] << 0); +} +EXPORT_SYMBOL(__nand_calculate_ecc); + +/** + * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte + * block + * @chip: NAND chip object + * @buf: input buffer with raw data + * @code: output buffer with ECC + */ +int nand_calculate_ecc(struct nand_chip *chip, const unsigned char *buf, + unsigned char *code) +{ + bool sm_order = chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER; + + __nand_calculate_ecc(buf, chip->ecc.size, code, sm_order); + + return 0; +} +EXPORT_SYMBOL(nand_calculate_ecc); + +/** + * __nand_correct_data - [NAND Interface] Detect and correct bit error(s) + * @buf: raw data read from the chip + * @read_ecc: ECC from the chip + * @calc_ecc: the ECC calculated from raw data + * @eccsize: data bytes per ECC step (256 or 512) + * @sm_order: Smart Media byte order + * + * Detect and correct a 1 bit error for eccsize byte block + */ +int __nand_correct_data(unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc, + unsigned int eccsize, bool sm_order) +{ + unsigned char b0, b1, b2, bit_addr; + unsigned int byte_addr; + /* 256 or 512 bytes/ecc */ + const uint32_t eccsize_mult = eccsize >> 8; + + /* + * b0 to b2 indicate which bit is faulty (if any) + * we might need the xor result more than once, + * so keep them in a local var + */ + if (sm_order) { + b0 = read_ecc[0] ^ calc_ecc[0]; + b1 = read_ecc[1] ^ calc_ecc[1]; + } else { + b0 = read_ecc[1] ^ calc_ecc[1]; + b1 = read_ecc[0] ^ calc_ecc[0]; + } + + b2 = read_ecc[2] ^ calc_ecc[2]; + + /* check if there are any bitfaults */ + + /* repeated if statements are slightly more efficient than switch ... 
*/ + /* ordered in order of likelihood */ + + if ((b0 | b1 | b2) == 0) + return 0; /* no error */ + + if ((((b0 ^ (b0 >> 1)) & 0x55) == 0x55) && + (((b1 ^ (b1 >> 1)) & 0x55) == 0x55) && + ((eccsize_mult == 1 && ((b2 ^ (b2 >> 1)) & 0x54) == 0x54) || + (eccsize_mult == 2 && ((b2 ^ (b2 >> 1)) & 0x55) == 0x55))) { + /* single bit error */ + /* + * rp17/rp15/13/11/9/7/5/3/1 indicate which byte is the faulty + * byte, cp 5/3/1 indicate the faulty bit. + * A lookup table (called addressbits) is used to filter + * the bits from the byte they are in. + * A marginal optimisation is possible by having three + * different lookup tables. + * One as we have now (for b0), one for b2 + * (that would avoid the >> 1), and one for b1 (with all values + * << 4). However it was felt that introducing two more tables + * hardly justify the gain. + * + * The b2 shift is there to get rid of the lowest two bits. + * We could also do addressbits[b2] >> 1 but for the + * performance it does not make any difference + */ + if (eccsize_mult == 1) + byte_addr = (addressbits[b1] << 4) + addressbits[b0]; + else + byte_addr = (addressbits[b2 & 0x3] << 8) + + (addressbits[b1] << 4) + addressbits[b0]; + bit_addr = addressbits[b2 >> 2]; + /* flip the bit */ + buf[byte_addr] ^= (1 << bit_addr); + return 1; + + } + /* count nr of bits; use table lookup, faster than calculating it */ + if ((bitsperbyte[b0] + bitsperbyte[b1] + bitsperbyte[b2]) == 1) + return 1; /* error in ECC data; no action needed */ + + pr_err("%s: uncorrectable ECC error\n", __func__); + return -EBADMSG; +} +EXPORT_SYMBOL(__nand_correct_data); + +/** + * nand_correct_data - [NAND Interface] Detect and correct bit error(s) + * @chip: NAND chip object + * @buf: raw data read from the chip + * @read_ecc: ECC from the chip + * @calc_ecc: the ECC calculated from raw data + * + * Detect and correct a 1 bit error for 256/512 byte block + */ +int nand_correct_data(struct nand_chip *chip, unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc) +{ + bool sm_order = chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER; + + return __nand_correct_data(buf, read_ecc, calc_ecc, chip->ecc.size, + sm_order); +} +EXPORT_SYMBOL(nand_correct_data); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Frans Meulenbroeks "); +MODULE_DESCRIPTION("Generic NAND ECC support"); diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index b73860aa77c6..6149096e32cb 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -1,15 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -config MTD_NAND_ECC_SW_HAMMING - tristate - -config MTD_NAND_ECC_SW_HAMMING_SMC - bool "NAND ECC Smart Media byte order" - depends on MTD_NAND_ECC_SW_HAMMING - default n - help - Software ECC according to the Smart Media Specification. - The original Linux implementation had byte 0 and 1 swapped. 
- menuconfig MTD_RAW_NAND tristate "Raw/Parallel NAND Device Support" select MTD_NAND_CORE diff --git a/drivers/mtd/nand/raw/Makefile b/drivers/mtd/nand/raw/Makefile index 76904305d091..dc38c087c693 100644 --- a/drivers/mtd/nand/raw/Makefile +++ b/drivers/mtd/nand/raw/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MTD_RAW_NAND) += nand.o -obj-$(CONFIG_MTD_NAND_ECC_SW_HAMMING) += nand_ecc.o obj-$(CONFIG_MTD_SM_COMMON) += sm_common.o obj-$(CONFIG_MTD_NAND_CAFE) += cafe_nand.o diff --git a/drivers/mtd/nand/raw/cs553x_nand.c b/drivers/mtd/nand/raw/cs553x_nand.c index 282203debd0c..9a2bdb45a771 100644 --- a/drivers/mtd/nand/raw/cs553x_nand.c +++ b/drivers/mtd/nand/raw/cs553x_nand.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/raw/fsl_elbc_nand.c b/drivers/mtd/nand/raw/fsl_elbc_nand.c index b2af7f81fdf8..eb255cf83e32 100644 --- a/drivers/mtd/nand/raw/fsl_elbc_nand.c +++ b/drivers/mtd/nand/raw/fsl_elbc_nand.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/raw/fsl_ifc_nand.c b/drivers/mtd/nand/raw/fsl_ifc_nand.c index e345f9d9f8e8..d2f84917fa8b 100644 --- a/drivers/mtd/nand/raw/fsl_ifc_nand.c +++ b/drivers/mtd/nand/raw/fsl_ifc_nand.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/raw/fsl_upm.c b/drivers/mtd/nand/raw/fsl_upm.c index d5813b9abc8e..3824361928a1 100644 --- a/drivers/mtd/nand/raw/fsl_upm.c +++ b/drivers/mtd/nand/raw/fsl_upm.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/fsmc_nand.c b/drivers/mtd/nand/raw/fsmc_nand.c index 984b05e6bd38..e880db5852d8 100644 --- a/drivers/mtd/nand/raw/fsmc_nand.c +++ b/drivers/mtd/nand/raw/fsmc_nand.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/lpc32xx_mlc.c b/drivers/mtd/nand/raw/lpc32xx_mlc.c index 9e728c731795..885b03b7af52 100644 --- a/drivers/mtd/nand/raw/lpc32xx_mlc.c +++ b/drivers/mtd/nand/raw/lpc32xx_mlc.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #define DRV_NAME "lpc32xx_mlc" diff --git a/drivers/mtd/nand/raw/lpc32xx_slc.c b/drivers/mtd/nand/raw/lpc32xx_slc.c index dc7785e30d2f..0bf9c3fbcd82 100644 --- a/drivers/mtd/nand/raw/lpc32xx_slc.c +++ b/drivers/mtd/nand/raw/lpc32xx_slc.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/mxic_nand.c b/drivers/mtd/nand/raw/mxic_nand.c index d66b5b0971fa..da1070993994 100644 --- a/drivers/mtd/nand/raw/mxic_nand.c +++ b/drivers/mtd/nand/raw/mxic_nand.c @@ -12,8 +12,8 @@ #include #include #include +#include #include -#include #include #include "internals.h" diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index ebaf3bbfc1b2..bee7c64a0a4e 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/nand_ecc.c b/drivers/mtd/nand/raw/nand_ecc.c deleted file mode 100644 index b6a46b1b7781..000000000000 --- a/drivers/mtd/nand/raw/nand_ecc.c +++ /dev/null @@ -1,484 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * This file contains an ECC algorithm that detects and corrects 1 bit - * errors in a 256 byte block of data. 
- *
- * Copyright © 2008 Koninklijke Philips Electronics NV.
- * Author: Frans Meulenbroeks
- *
- * Completely replaces the previous ECC implementation which was written by:
- *   Steven J. Hill (sjhill@realitydiluted.com)
- *   Thomas Gleixner (tglx@linutronix.de)
- *
- * Information on how this algorithm works and how it was developed
- * can be found in Documentation/driver-api/mtd/nand_ecc.rst
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-/*
- * invparity is a 256 byte table that contains the odd parity
- * for each byte. So if the number of bits in a byte is even,
- * the array element is 1, and when the number of bits is odd
- * the array element is 0.
- */
-static const char invparity[256] = {
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1
-};
-
-/*
- * bitsperbyte contains the number of bits per byte
- * this is only used for testing and repairing parity
- * (a precalculated value slightly improves performance)
- */
-static const char bitsperbyte[256] = {
-	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
-};
-
-/*
- * addressbits is a lookup table to filter out the bits from the xor-ed
- * ECC data that identify the faulty location.
- * this is only used for repairing parity - * see the comments in nand_correct_data for more details - */ -static const char addressbits[256] = { - 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, - 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, - 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, - 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, - 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, - 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, - 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, - 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, - 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, - 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, - 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, - 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, - 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, - 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, - 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, - 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, - 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, - 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, - 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, - 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, - 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, - 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, - 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, - 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, - 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, - 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, - 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, - 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, - 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, - 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, - 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, - 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f -}; - -/** - * __nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte - * block - * @buf: input buffer with raw data - * @eccsize: data bytes per ECC step (256 or 512) - * @code: output buffer with ECC - * @sm_order: Smart Media byte ordering - */ -void __nand_calculate_ecc(const unsigned char *buf, unsigned int eccsize, - unsigned char *code, bool sm_order) -{ - int i; - const uint32_t *bp = (uint32_t *)buf; - /* 256 or 512 bytes/ecc */ - const uint32_t eccsize_mult = eccsize >> 8; - uint32_t cur; /* current value in buffer */ - /* rp0..rp15..rp17 are the various accumulated parities (per byte) */ - uint32_t rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7; - uint32_t rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15, rp16; - uint32_t rp17; - uint32_t par; /* the cumulative parity for all data */ - uint32_t tmppar; /* the cumulative parity for this iteration; - for rp12, rp14 and rp16 at the end of the - loop */ - - par = 0; - rp4 = 0; - rp6 = 0; - rp8 = 0; - rp10 = 0; - rp12 = 0; - rp14 = 0; - rp16 = 0; - - /* - * The loop is unrolled a number of times; - * This avoids if statements to decide on which rp value to update - * Also we process the data by longwords. - * Note: passing unaligned data might give a performance penalty. - * It is assumed that the buffers are aligned. - * tmppar is the cumulative sum of this iteration. 
- * needed for calculating rp12, rp14, rp16 and par - * also used as a performance improvement for rp6, rp8 and rp10 - */ - for (i = 0; i < eccsize_mult << 2; i++) { - cur = *bp++; - tmppar = cur; - rp4 ^= cur; - cur = *bp++; - tmppar ^= cur; - rp6 ^= tmppar; - cur = *bp++; - tmppar ^= cur; - rp4 ^= cur; - cur = *bp++; - tmppar ^= cur; - rp8 ^= tmppar; - - cur = *bp++; - tmppar ^= cur; - rp4 ^= cur; - rp6 ^= cur; - cur = *bp++; - tmppar ^= cur; - rp6 ^= cur; - cur = *bp++; - tmppar ^= cur; - rp4 ^= cur; - cur = *bp++; - tmppar ^= cur; - rp10 ^= tmppar; - - cur = *bp++; - tmppar ^= cur; - rp4 ^= cur; - rp6 ^= cur; - rp8 ^= cur; - cur = *bp++; - tmppar ^= cur; - rp6 ^= cur; - rp8 ^= cur; - cur = *bp++; - tmppar ^= cur; - rp4 ^= cur; - rp8 ^= cur; - cur = *bp++; - tmppar ^= cur; - rp8 ^= cur; - - cur = *bp++; - tmppar ^= cur; - rp4 ^= cur; - rp6 ^= cur; - cur = *bp++; - tmppar ^= cur; - rp6 ^= cur; - cur = *bp++; - tmppar ^= cur; - rp4 ^= cur; - cur = *bp++; - tmppar ^= cur; - - par ^= tmppar; - if ((i & 0x1) == 0) - rp12 ^= tmppar; - if ((i & 0x2) == 0) - rp14 ^= tmppar; - if (eccsize_mult == 2 && (i & 0x4) == 0) - rp16 ^= tmppar; - } - - /* - * handle the fact that we use longword operations - * we'll bring rp4..rp14..rp16 back to single byte entities by - * shifting and xoring first fold the upper and lower 16 bits, - * then the upper and lower 8 bits. - */ - rp4 ^= (rp4 >> 16); - rp4 ^= (rp4 >> 8); - rp4 &= 0xff; - rp6 ^= (rp6 >> 16); - rp6 ^= (rp6 >> 8); - rp6 &= 0xff; - rp8 ^= (rp8 >> 16); - rp8 ^= (rp8 >> 8); - rp8 &= 0xff; - rp10 ^= (rp10 >> 16); - rp10 ^= (rp10 >> 8); - rp10 &= 0xff; - rp12 ^= (rp12 >> 16); - rp12 ^= (rp12 >> 8); - rp12 &= 0xff; - rp14 ^= (rp14 >> 16); - rp14 ^= (rp14 >> 8); - rp14 &= 0xff; - if (eccsize_mult == 2) { - rp16 ^= (rp16 >> 16); - rp16 ^= (rp16 >> 8); - rp16 &= 0xff; - } - - /* - * we also need to calculate the row parity for rp0..rp3 - * This is present in par, because par is now - * rp3 rp3 rp2 rp2 in little endian and - * rp2 rp2 rp3 rp3 in big endian - * as well as - * rp1 rp0 rp1 rp0 in little endian and - * rp0 rp1 rp0 rp1 in big endian - * First calculate rp2 and rp3 - */ -#ifdef __BIG_ENDIAN - rp2 = (par >> 16); - rp2 ^= (rp2 >> 8); - rp2 &= 0xff; - rp3 = par & 0xffff; - rp3 ^= (rp3 >> 8); - rp3 &= 0xff; -#else - rp3 = (par >> 16); - rp3 ^= (rp3 >> 8); - rp3 &= 0xff; - rp2 = par & 0xffff; - rp2 ^= (rp2 >> 8); - rp2 &= 0xff; -#endif - - /* reduce par to 16 bits then calculate rp1 and rp0 */ - par ^= (par >> 16); -#ifdef __BIG_ENDIAN - rp0 = (par >> 8) & 0xff; - rp1 = (par & 0xff); -#else - rp1 = (par >> 8) & 0xff; - rp0 = (par & 0xff); -#endif - - /* finally reduce par to 8 bits */ - par ^= (par >> 8); - par &= 0xff; - - /* - * and calculate rp5..rp15..rp17 - * note that par = rp4 ^ rp5 and due to the commutative property - * of the ^ operator we can say: - * rp5 = (par ^ rp4); - * The & 0xff seems superfluous, but benchmarking learned that - * leaving it out gives slightly worse results. No idea why, probably - * it has to do with the way the pipeline in pentium is organized. - */ - rp5 = (par ^ rp4) & 0xff; - rp7 = (par ^ rp6) & 0xff; - rp9 = (par ^ rp8) & 0xff; - rp11 = (par ^ rp10) & 0xff; - rp13 = (par ^ rp12) & 0xff; - rp15 = (par ^ rp14) & 0xff; - if (eccsize_mult == 2) - rp17 = (par ^ rp16) & 0xff; - - /* - * Finally calculate the ECC bits. 
- * Again here it might seem that there are performance optimisations - * possible, but benchmarks showed that on the system this is developed - * the code below is the fastest - */ - if (sm_order) { - code[0] = (invparity[rp7] << 7) | (invparity[rp6] << 6) | - (invparity[rp5] << 5) | (invparity[rp4] << 4) | - (invparity[rp3] << 3) | (invparity[rp2] << 2) | - (invparity[rp1] << 1) | (invparity[rp0]); - code[1] = (invparity[rp15] << 7) | (invparity[rp14] << 6) | - (invparity[rp13] << 5) | (invparity[rp12] << 4) | - (invparity[rp11] << 3) | (invparity[rp10] << 2) | - (invparity[rp9] << 1) | (invparity[rp8]); - } else { - code[1] = (invparity[rp7] << 7) | (invparity[rp6] << 6) | - (invparity[rp5] << 5) | (invparity[rp4] << 4) | - (invparity[rp3] << 3) | (invparity[rp2] << 2) | - (invparity[rp1] << 1) | (invparity[rp0]); - code[0] = (invparity[rp15] << 7) | (invparity[rp14] << 6) | - (invparity[rp13] << 5) | (invparity[rp12] << 4) | - (invparity[rp11] << 3) | (invparity[rp10] << 2) | - (invparity[rp9] << 1) | (invparity[rp8]); - } - - if (eccsize_mult == 1) - code[2] = - (invparity[par & 0xf0] << 7) | - (invparity[par & 0x0f] << 6) | - (invparity[par & 0xcc] << 5) | - (invparity[par & 0x33] << 4) | - (invparity[par & 0xaa] << 3) | - (invparity[par & 0x55] << 2) | - 3; - else - code[2] = - (invparity[par & 0xf0] << 7) | - (invparity[par & 0x0f] << 6) | - (invparity[par & 0xcc] << 5) | - (invparity[par & 0x33] << 4) | - (invparity[par & 0xaa] << 3) | - (invparity[par & 0x55] << 2) | - (invparity[rp17] << 1) | - (invparity[rp16] << 0); -} -EXPORT_SYMBOL(__nand_calculate_ecc); - -/** - * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte - * block - * @chip: NAND chip object - * @buf: input buffer with raw data - * @code: output buffer with ECC - */ -int nand_calculate_ecc(struct nand_chip *chip, const unsigned char *buf, - unsigned char *code) -{ - bool sm_order = chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER; - - __nand_calculate_ecc(buf, chip->ecc.size, code, sm_order); - - return 0; -} -EXPORT_SYMBOL(nand_calculate_ecc); - -/** - * __nand_correct_data - [NAND Interface] Detect and correct bit error(s) - * @buf: raw data read from the chip - * @read_ecc: ECC from the chip - * @calc_ecc: the ECC calculated from raw data - * @eccsize: data bytes per ECC step (256 or 512) - * @sm_order: Smart Media byte order - * - * Detect and correct a 1 bit error for eccsize byte block - */ -int __nand_correct_data(unsigned char *buf, - unsigned char *read_ecc, unsigned char *calc_ecc, - unsigned int eccsize, bool sm_order) -{ - unsigned char b0, b1, b2, bit_addr; - unsigned int byte_addr; - /* 256 or 512 bytes/ecc */ - const uint32_t eccsize_mult = eccsize >> 8; - - /* - * b0 to b2 indicate which bit is faulty (if any) - * we might need the xor result more than once, - * so keep them in a local var - */ - if (sm_order) { - b0 = read_ecc[0] ^ calc_ecc[0]; - b1 = read_ecc[1] ^ calc_ecc[1]; - } else { - b0 = read_ecc[1] ^ calc_ecc[1]; - b1 = read_ecc[0] ^ calc_ecc[0]; - } - - b2 = read_ecc[2] ^ calc_ecc[2]; - - /* check if there are any bitfaults */ - - /* repeated if statements are slightly more efficient than switch ... 
*/ - /* ordered in order of likelihood */ - - if ((b0 | b1 | b2) == 0) - return 0; /* no error */ - - if ((((b0 ^ (b0 >> 1)) & 0x55) == 0x55) && - (((b1 ^ (b1 >> 1)) & 0x55) == 0x55) && - ((eccsize_mult == 1 && ((b2 ^ (b2 >> 1)) & 0x54) == 0x54) || - (eccsize_mult == 2 && ((b2 ^ (b2 >> 1)) & 0x55) == 0x55))) { - /* single bit error */ - /* - * rp17/rp15/13/11/9/7/5/3/1 indicate which byte is the faulty - * byte, cp 5/3/1 indicate the faulty bit. - * A lookup table (called addressbits) is used to filter - * the bits from the byte they are in. - * A marginal optimisation is possible by having three - * different lookup tables. - * One as we have now (for b0), one for b2 - * (that would avoid the >> 1), and one for b1 (with all values - * << 4). However it was felt that introducing two more tables - * hardly justify the gain. - * - * The b2 shift is there to get rid of the lowest two bits. - * We could also do addressbits[b2] >> 1 but for the - * performance it does not make any difference - */ - if (eccsize_mult == 1) - byte_addr = (addressbits[b1] << 4) + addressbits[b0]; - else - byte_addr = (addressbits[b2 & 0x3] << 8) + - (addressbits[b1] << 4) + addressbits[b0]; - bit_addr = addressbits[b2 >> 2]; - /* flip the bit */ - buf[byte_addr] ^= (1 << bit_addr); - return 1; - - } - /* count nr of bits; use table lookup, faster than calculating it */ - if ((bitsperbyte[b0] + bitsperbyte[b1] + bitsperbyte[b2]) == 1) - return 1; /* error in ECC data; no action needed */ - - pr_err("%s: uncorrectable ECC error\n", __func__); - return -EBADMSG; -} -EXPORT_SYMBOL(__nand_correct_data); - -/** - * nand_correct_data - [NAND Interface] Detect and correct bit error(s) - * @chip: NAND chip object - * @buf: raw data read from the chip - * @read_ecc: ECC from the chip - * @calc_ecc: the ECC calculated from raw data - * - * Detect and correct a 1 bit error for 256/512 byte block - */ -int nand_correct_data(struct nand_chip *chip, unsigned char *buf, - unsigned char *read_ecc, unsigned char *calc_ecc) -{ - bool sm_order = chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER; - - return __nand_correct_data(buf, read_ecc, calc_ecc, chip->ecc.size, - sm_order); -} -EXPORT_SYMBOL(nand_correct_data); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Frans Meulenbroeks "); -MODULE_DESCRIPTION("Generic NAND ECC support"); diff --git a/drivers/mtd/nand/raw/ndfc.c b/drivers/mtd/nand/raw/ndfc.c index 0fb4ba93c41e..8cda6633280b 100644 --- a/drivers/mtd/nand/raw/ndfc.c +++ b/drivers/mtd/nand/raw/ndfc.c @@ -18,7 +18,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/pasemi_nand.c b/drivers/mtd/nand/raw/pasemi_nand.c index 4dfff34800f4..2dddbdca9e94 100644 --- a/drivers/mtd/nand/raw/pasemi_nand.c +++ b/drivers/mtd/nand/raw/pasemi_nand.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/s3c2410.c b/drivers/mtd/nand/raw/s3c2410.c index fbd0fa48e063..f7eb3b5fa792 100644 --- a/drivers/mtd/nand/raw/s3c2410.c +++ b/drivers/mtd/nand/raw/s3c2410.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/raw/sharpsl.c b/drivers/mtd/nand/raw/sharpsl.c index af98bcc9d689..2fae6ade8b13 100644 --- a/drivers/mtd/nand/raw/sharpsl.c +++ b/drivers/mtd/nand/raw/sharpsl.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/tmio_nand.c b/drivers/mtd/nand/raw/tmio_nand.c index aa6c7e7bbf1b..fadca0b9e21f 
100644 --- a/drivers/mtd/nand/raw/tmio_nand.c +++ b/drivers/mtd/nand/raw/tmio_nand.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/raw/txx9ndfmc.c b/drivers/mtd/nand/raw/txx9ndfmc.c index fe8ed2441588..68b920eab6c8 100644 --- a/drivers/mtd/nand/raw/txx9ndfmc.c +++ b/drivers/mtd/nand/raw/txx9ndfmc.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c index b9f272408c4d..db430ae6a1a2 100644 --- a/drivers/mtd/sm_ftl.c +++ b/drivers/mtd/sm_ftl.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include "nand/raw/sm_common.h" #include "sm_ftl.h" diff --git a/drivers/mtd/tests/mtd_nandecctest.c b/drivers/mtd/tests/mtd_nandecctest.c index 13bca9ea0cae..e92e3fb287b6 100644 --- a/drivers/mtd/tests/mtd_nandecctest.c +++ b/drivers/mtd/tests/mtd_nandecctest.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include "mtd_test.h" diff --git a/include/linux/mtd/nand-ecc-sw-hamming.h b/include/linux/mtd/nand-ecc-sw-hamming.h new file mode 100644 index 000000000000..30a0cfa50eed --- /dev/null +++ b/include/linux/mtd/nand-ecc-sw-hamming.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2000-2010 Steven J. Hill + * David Woodhouse + * Thomas Gleixner + * + * This file is the header for the ECC algorithm. + */ + +#ifndef __MTD_NAND_ECC_SW_HAMMING_H__ +#define __MTD_NAND_ECC_SW_HAMMING_H__ + +struct nand_chip; + +/* + * Calculate 3 byte ECC code for eccsize byte block + */ +void __nand_calculate_ecc(const u_char *dat, unsigned int eccsize, + u_char *ecc_code, bool sm_order); + +/* + * Calculate 3 byte ECC code for 256/512 byte block + */ +int nand_calculate_ecc(struct nand_chip *chip, const u_char *dat, + u_char *ecc_code); + +/* + * Detect and correct a 1 bit error for eccsize byte block + */ +int __nand_correct_data(u_char *dat, u_char *read_ecc, u_char *calc_ecc, + unsigned int eccsize, bool sm_order); + +/* + * Detect and correct a 1 bit error for 256/512 byte block + */ +int nand_correct_data(struct nand_chip *chip, u_char *dat, u_char *read_ecc, + u_char *calc_ecc); + +#endif /* __MTD_NAND_ECC_SW_HAMMING_H__ */ diff --git a/include/linux/mtd/nand_ecc.h b/include/linux/mtd/nand_ecc.h deleted file mode 100644 index d423916b94f0..000000000000 --- a/include/linux/mtd/nand_ecc.h +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2000-2010 Steven J. Hill - * David Woodhouse - * Thomas Gleixner - * - * This file is the header for the ECC algorithm. 
- */
-
-#ifndef __MTD_NAND_ECC_H__
-#define __MTD_NAND_ECC_H__
-
-struct nand_chip;
-
-/*
- * Calculate 3 byte ECC code for eccsize byte block
- */
-void __nand_calculate_ecc(const u_char *dat, unsigned int eccsize,
-			  u_char *ecc_code, bool sm_order);
-
-/*
- * Calculate 3 byte ECC code for 256/512 byte block
- */
-int nand_calculate_ecc(struct nand_chip *chip, const u_char *dat,
-		       u_char *ecc_code);
-
-/*
- * Detect and correct a 1 bit error for eccsize byte block
- */
-int __nand_correct_data(u_char *dat, u_char *read_ecc, u_char *calc_ecc,
-			unsigned int eccsize, bool sm_order);
-
-/*
- * Detect and correct a 1 bit error for 256/512 byte block
- */
-int nand_correct_data(struct nand_chip *chip, u_char *dat, u_char *read_ecc,
-		      u_char *calc_ecc);
-
-#endif /* __MTD_NAND_ECC_H__ */
diff --git a/include/linux/mtd/sharpsl.h b/include/linux/mtd/sharpsl.h
index d2c3cf29e0d1..3762a90077d6 100644
--- a/include/linux/mtd/sharpsl.h
+++ b/include/linux/mtd/sharpsl.h
@@ -9,7 +9,7 @@
 #define _MTD_SHARPSL_H
 
 #include
-#include
+#include
 #include
 
 struct sharpsl_nand_platform_data {
-- cgit
From 2dbe0192efa02f2f405e193f4de84bf07c7f91fb Mon Sep 17 00:00:00 2001
From: Miquel Raynal
Date: Wed, 30 Sep 2020 01:01:16 +0200
Subject: mtd: nand: ecc-hamming: Clarify the driver descriptions

The include file pretends to be the header for "the ECC algorithm",
while it is just the header for the Hamming implementation. Make this
clear by rewording the sentence. Do the same with the module
description.

Signed-off-by: Miquel Raynal
Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-13-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/ecc-sw-hamming.c       | 2 +-
 include/linux/mtd/nand-ecc-sw-hamming.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/ecc-sw-hamming.c b/drivers/mtd/nand/ecc-sw-hamming.c
index 76966f5ff13f..a17faed374a8 100644
--- a/drivers/mtd/nand/ecc-sw-hamming.c
+++ b/drivers/mtd/nand/ecc-sw-hamming.c
@@ -481,4 +481,4 @@ EXPORT_SYMBOL(nand_correct_data);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Frans Meulenbroeks ");
-MODULE_DESCRIPTION("Generic NAND ECC support");
+MODULE_DESCRIPTION("NAND software Hamming ECC support");
diff --git a/include/linux/mtd/nand-ecc-sw-hamming.h b/include/linux/mtd/nand-ecc-sw-hamming.h
index 30a0cfa50eed..85e9a929b5f9 100644
--- a/include/linux/mtd/nand-ecc-sw-hamming.h
+++ b/include/linux/mtd/nand-ecc-sw-hamming.h
@@ -4,7 +4,7 @@
  * David Woodhouse
  * Thomas Gleixner
  *
- * This file is the header for the ECC algorithm.
+ * This file is the header for the NAND Hamming ECC implementation.
  */
 
 #ifndef __MTD_NAND_ECC_SW_HAMMING_H__
-- cgit
From 90ccf0a0192f7fa06e52de80cb528c5217e3e297 Mon Sep 17 00:00:00 2001
From: Miquel Raynal
Date: Wed, 30 Sep 2020 01:01:19 +0200
Subject: mtd: nand: ecc-hamming: Rename the exported functions

Prefix with ecc_sw_hamming_ the functions which should be internal-only
but are exported for "raw" operations.

Prefix with nand_ecc_sw_hamming_ the other functions, which will be
used when declaring a proper Hamming ECC engine object.
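Put differently, the same algorithm is now reachable at two levels; the prototypes below are a recap of what the diff introduces, gathered here for readability rather than taken verbatim from the patch:

	/* Bare algorithm, no NAND object: for "raw" users such as sm_ftl */
	int ecc_sw_hamming_calculate(const unsigned char *buf,
				     unsigned int step_size,
				     unsigned char *code, bool sm_order);
	int ecc_sw_hamming_correct(unsigned char *buf, unsigned char *read_ecc,
				   unsigned char *calc_ecc,
				   unsigned int step_size, bool sm_order);

	/* Generic layer, tied to a nand_device: the future engine hooks */
	int nand_ecc_sw_hamming_calculate(struct nand_device *nand,
					  const unsigned char *buf,
					  unsigned char *code);
	int nand_ecc_sw_hamming_correct(struct nand_device *nand,
					unsigned char *buf,
					unsigned char *read_ecc,
					unsigned char *calc_ecc);

On top of these, the raw NAND core gains rawnand_sw_hamming_calculate() and rawnand_sw_hamming_correct() wrappers (see the nand_base.c hunk below) so that legacy drivers keep a nand_chip based ->ecc.correct() prototype.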
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-16-miquel.raynal@bootlin.com --- drivers/mtd/nand/ecc-sw-hamming.c | 53 +++++++++++++++++---------------- drivers/mtd/nand/raw/cs553x_nand.c | 3 +- drivers/mtd/nand/raw/fsmc_nand.c | 2 +- drivers/mtd/nand/raw/lpc32xx_slc.c | 2 +- drivers/mtd/nand/raw/nand_base.c | 25 ++++++++++++++-- drivers/mtd/nand/raw/ndfc.c | 3 +- drivers/mtd/nand/raw/sharpsl.c | 2 +- drivers/mtd/nand/raw/tmio_nand.c | 6 ++-- drivers/mtd/nand/raw/txx9ndfmc.c | 4 +-- drivers/mtd/sm_ftl.c | 28 ++++++++--------- drivers/mtd/tests/mtd_nandecctest.c | 29 +++++++++--------- include/linux/mtd/nand-ecc-sw-hamming.h | 36 ++++++++-------------- include/linux/mtd/rawnand.h | 7 +++++ 13 files changed, 108 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/ecc-sw-hamming.c b/drivers/mtd/nand/ecc-sw-hamming.c index 771e27ed8883..7ee0387ecb29 100644 --- a/drivers/mtd/nand/ecc-sw-hamming.c +++ b/drivers/mtd/nand/ecc-sw-hamming.c @@ -75,7 +75,7 @@ static const char bitsperbyte[256] = { * addressbits is a lookup table to filter out the bits from the xor-ed * ECC data that identify the faulty location. * this is only used for repairing parity - * see the comments in nand_correct_data for more details + * see the comments in nand_ecc_sw_hamming_correct for more details */ static const char addressbits[256] = { 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, @@ -112,11 +112,11 @@ static const char addressbits[256] = { 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f }; -void __nand_calculate_ecc(const unsigned char *buf, unsigned int eccsize, - unsigned char *code, bool sm_order) +int ecc_sw_hamming_calculate(const unsigned char *buf, unsigned int step_size, + unsigned char *code, bool sm_order) { const u32 *bp = (uint32_t *)buf; - const u32 eccsize_mult = eccsize >> 8; + const u32 eccsize_mult = step_size >> 8; /* current value in buffer */ u32 cur; /* rp0..rp17 are the various accumulated parities (per byte) */ @@ -347,31 +347,32 @@ void __nand_calculate_ecc(const unsigned char *buf, unsigned int eccsize, (invparity[par & 0x55] << 2) | (invparity[rp17] << 1) | (invparity[rp16] << 0); + + return 0; } -EXPORT_SYMBOL(__nand_calculate_ecc); +EXPORT_SYMBOL(ecc_sw_hamming_calculate); /** - * nand_calculate_ecc - Calculate 3-byte ECC for 256/512-byte block - * @chip: NAND chip object + * nand_ecc_sw_hamming_calculate - Calculate 3-byte ECC for 256/512-byte block + * @nand: NAND device * @buf: Input buffer with raw data * @code: Output buffer with ECC */ -int nand_calculate_ecc(struct nand_chip *chip, const unsigned char *buf, - unsigned char *code) +int nand_ecc_sw_hamming_calculate(struct nand_device *nand, + const unsigned char *buf, unsigned char *code) { + struct nand_chip *chip = mtd_to_nand(nanddev_to_mtd(nand)); bool sm_order = chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER; - __nand_calculate_ecc(buf, chip->ecc.size, code, sm_order); - - return 0; + return ecc_sw_hamming_calculate(buf, chip->ecc.size, code, sm_order); } -EXPORT_SYMBOL(nand_calculate_ecc); +EXPORT_SYMBOL(nand_ecc_sw_hamming_calculate); -int __nand_correct_data(unsigned char *buf, - unsigned char *read_ecc, unsigned char *calc_ecc, - unsigned int eccsize, bool sm_order) +int ecc_sw_hamming_correct(unsigned char *buf, unsigned char *read_ecc, + unsigned char *calc_ecc, unsigned int step_size, + bool sm_order) { - const u32 eccsize_mult = eccsize >> 8; + const u32 eccsize_mult = step_size >> 8; unsigned char b0, b1, b2, bit_addr; unsigned int 
byte_addr; @@ -437,26 +438,28 @@ int __nand_correct_data(unsigned char *buf, pr_err("%s: uncorrectable ECC error\n", __func__); return -EBADMSG; } -EXPORT_SYMBOL(__nand_correct_data); +EXPORT_SYMBOL(ecc_sw_hamming_correct); /** - * nand_correct_data - Detect and correct bit error(s) - * @chip: NAND chip object + * nand_ecc_sw_hamming_correct - Detect and correct bit error(s) + * @nand: NAND device * @buf: Raw data read from the chip * @read_ecc: ECC bytes read from the chip * @calc_ecc: ECC calculated from the raw data * * Detect and correct up to 1 bit error per 256/512-byte block. */ -int nand_correct_data(struct nand_chip *chip, unsigned char *buf, - unsigned char *read_ecc, unsigned char *calc_ecc) +int nand_ecc_sw_hamming_correct(struct nand_device *nand, unsigned char *buf, + unsigned char *read_ecc, + unsigned char *calc_ecc) { + struct nand_chip *chip = mtd_to_nand(nanddev_to_mtd(nand)); bool sm_order = chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER; - return __nand_correct_data(buf, read_ecc, calc_ecc, chip->ecc.size, - sm_order); + return ecc_sw_hamming_correct(buf, read_ecc, calc_ecc, chip->ecc.size, + sm_order); } -EXPORT_SYMBOL(nand_correct_data); +EXPORT_SYMBOL(nand_ecc_sw_hamming_correct); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Frans Meulenbroeks "); diff --git a/drivers/mtd/nand/raw/cs553x_nand.c b/drivers/mtd/nand/raw/cs553x_nand.c index 9a2bdb45a771..6edf78c16fc8 100644 --- a/drivers/mtd/nand/raw/cs553x_nand.c +++ b/drivers/mtd/nand/raw/cs553x_nand.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -252,7 +251,7 @@ static int cs553x_attach_chip(struct nand_chip *chip) chip->ecc.bytes = 3; chip->ecc.hwctl = cs_enable_hwecc; chip->ecc.calculate = cs_calculate_ecc; - chip->ecc.correct = nand_correct_data; + chip->ecc.correct = rawnand_sw_hamming_correct; chip->ecc.strength = 1; return 0; diff --git a/drivers/mtd/nand/raw/fsmc_nand.c b/drivers/mtd/nand/raw/fsmc_nand.c index e880db5852d8..1e2ed45be2f1 100644 --- a/drivers/mtd/nand/raw/fsmc_nand.c +++ b/drivers/mtd/nand/raw/fsmc_nand.c @@ -918,7 +918,7 @@ static int fsmc_nand_attach_chip(struct nand_chip *nand) case NAND_ECC_ENGINE_TYPE_ON_HOST: dev_info(host->dev, "Using 1-bit HW ECC scheme\n"); nand->ecc.calculate = fsmc_read_hwecc_ecc1; - nand->ecc.correct = nand_correct_data; + nand->ecc.correct = rawnand_sw_hamming_correct; nand->ecc.hwctl = fsmc_enable_hwecc; nand->ecc.bytes = 3; nand->ecc.strength = 1; diff --git a/drivers/mtd/nand/raw/lpc32xx_slc.c b/drivers/mtd/nand/raw/lpc32xx_slc.c index 0bf9c3fbcd82..beecabfa5181 100644 --- a/drivers/mtd/nand/raw/lpc32xx_slc.c +++ b/drivers/mtd/nand/raw/lpc32xx_slc.c @@ -803,7 +803,7 @@ static int lpc32xx_nand_attach_chip(struct nand_chip *chip) chip->ecc.write_oob = lpc32xx_nand_write_oob_syndrome; chip->ecc.read_oob = lpc32xx_nand_read_oob_syndrome; chip->ecc.calculate = lpc32xx_nand_ecc_calculate; - chip->ecc.correct = nand_correct_data; + chip->ecc.correct = rawnand_sw_hamming_correct; chip->ecc.hwctl = lpc32xx_nand_ecc_enable; /* diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index bee7c64a0a4e..32e9870a2cf8 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5139,6 +5139,27 @@ static void nand_scan_ident_cleanup(struct nand_chip *chip) kfree(chip->parameters.onfi); } +int rawnand_sw_hamming_calculate(struct nand_chip *chip, + const unsigned char *buf, + unsigned char *code) +{ + struct nand_device *base = &chip->base; + + return nand_ecc_sw_hamming_calculate(base, buf, code); 
+} +EXPORT_SYMBOL(rawnand_sw_hamming_calculate); + +int rawnand_sw_hamming_correct(struct nand_chip *chip, + unsigned char *buf, + unsigned char *read_ecc, + unsigned char *calc_ecc) +{ + struct nand_device *base = &chip->base; + + return nand_ecc_sw_hamming_correct(base, buf, read_ecc, calc_ecc); +} +EXPORT_SYMBOL(rawnand_sw_hamming_correct); + int rawnand_sw_bch_init(struct nand_chip *chip) { struct nand_device *base = &chip->base; @@ -5263,8 +5284,8 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip) switch (ecc->algo) { case NAND_ECC_ALGO_HAMMING: - ecc->calculate = nand_calculate_ecc; - ecc->correct = nand_correct_data; + ecc->calculate = rawnand_sw_hamming_calculate; + ecc->correct = rawnand_sw_hamming_correct; ecc->read_page = nand_read_page_swecc; ecc->read_subpage = nand_read_subpage; ecc->write_page = nand_write_page_swecc; diff --git a/drivers/mtd/nand/raw/ndfc.c b/drivers/mtd/nand/raw/ndfc.c index 8cda6633280b..338d6b1a189e 100644 --- a/drivers/mtd/nand/raw/ndfc.c +++ b/drivers/mtd/nand/raw/ndfc.c @@ -18,7 +18,6 @@ */ #include #include -#include #include #include #include @@ -146,7 +145,7 @@ static int ndfc_chip_init(struct ndfc_controller *ndfc, chip->controller = &ndfc->ndfc_control; chip->legacy.read_buf = ndfc_read_buf; chip->legacy.write_buf = ndfc_write_buf; - chip->ecc.correct = nand_correct_data; + chip->ecc.correct = rawnand_sw_hamming_correct; chip->ecc.hwctl = ndfc_enable_hwecc; chip->ecc.calculate = ndfc_calculate_ecc; chip->ecc.engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST; diff --git a/drivers/mtd/nand/raw/sharpsl.c b/drivers/mtd/nand/raw/sharpsl.c index 2fae6ade8b13..3f55b523aead 100644 --- a/drivers/mtd/nand/raw/sharpsl.c +++ b/drivers/mtd/nand/raw/sharpsl.c @@ -107,7 +107,7 @@ static int sharpsl_attach_chip(struct nand_chip *chip) chip->ecc.strength = 1; chip->ecc.hwctl = sharpsl_nand_enable_hwecc; chip->ecc.calculate = sharpsl_nand_calculate_ecc; - chip->ecc.correct = nand_correct_data; + chip->ecc.correct = rawnand_sw_hamming_correct; return 0; } diff --git a/drivers/mtd/nand/raw/tmio_nand.c b/drivers/mtd/nand/raw/tmio_nand.c index fadca0b9e21f..29edacbec826 100644 --- a/drivers/mtd/nand/raw/tmio_nand.c +++ b/drivers/mtd/nand/raw/tmio_nand.c @@ -293,11 +293,11 @@ static int tmio_nand_correct_data(struct nand_chip *chip, unsigned char *buf, int r0, r1; /* assume ecc.size = 512 and ecc.bytes = 6 */ - r0 = __nand_correct_data(buf, read_ecc, calc_ecc, 256, false); + r0 = rawnand_sw_hamming_correct(chip, buf, read_ecc, calc_ecc); if (r0 < 0) return r0; - r1 = __nand_correct_data(buf + 256, read_ecc + 3, calc_ecc + 3, 256, - false); + r1 = rawnand_sw_hamming_correct(chip, buf + 256, read_ecc + 3, + calc_ecc + 3); if (r1 < 0) return r1; return r0 + r1; diff --git a/drivers/mtd/nand/raw/txx9ndfmc.c b/drivers/mtd/nand/raw/txx9ndfmc.c index 68b920eab6c8..a705523eb916 100644 --- a/drivers/mtd/nand/raw/txx9ndfmc.c +++ b/drivers/mtd/nand/raw/txx9ndfmc.c @@ -194,8 +194,8 @@ static int txx9ndfmc_correct_data(struct nand_chip *chip, unsigned char *buf, int stat; for (eccsize = chip->ecc.size; eccsize > 0; eccsize -= 256) { - stat = __nand_correct_data(buf, read_ecc, calc_ecc, 256, - false); + stat = rawnand_sw_hamming_correct(chip, buf, read_ecc, + calc_ecc); if (stat < 0) return stat; corrected += stat; diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c index db430ae6a1a2..4d1ae25507ab 100644 --- a/drivers/mtd/sm_ftl.c +++ b/drivers/mtd/sm_ftl.c @@ -216,20 +216,19 @@ static void sm_break_offset(struct sm_ftl *ftl, loff_t loffset, static int 
sm_correct_sector(uint8_t *buffer, struct sm_oob *oob) { + bool sm_order = IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC); uint8_t ecc[3]; - __nand_calculate_ecc(buffer, SM_SMALL_PAGE, ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); - if (__nand_correct_data(buffer, ecc, oob->ecc1, SM_SMALL_PAGE, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)) < 0) + ecc_sw_hamming_calculate(buffer, SM_SMALL_PAGE, ecc, sm_order); + if (ecc_sw_hamming_correct(buffer, ecc, oob->ecc1, SM_SMALL_PAGE, + sm_order) < 0) return -EIO; buffer += SM_SMALL_PAGE; - __nand_calculate_ecc(buffer, SM_SMALL_PAGE, ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); - if (__nand_correct_data(buffer, ecc, oob->ecc2, SM_SMALL_PAGE, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)) < 0) + ecc_sw_hamming_calculate(buffer, SM_SMALL_PAGE, ecc, sm_order); + if (ecc_sw_hamming_correct(buffer, ecc, oob->ecc2, SM_SMALL_PAGE, + sm_order) < 0) return -EIO; return 0; } @@ -369,6 +368,7 @@ static int sm_write_block(struct sm_ftl *ftl, uint8_t *buf, int zone, int block, int lba, unsigned long invalid_bitmap) { + bool sm_order = IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC); struct sm_oob oob; int boffset; int retry = 0; @@ -395,13 +395,13 @@ restart: } if (ftl->smallpagenand) { - __nand_calculate_ecc(buf + boffset, SM_SMALL_PAGE, - oob.ecc1, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); + ecc_sw_hamming_calculate(buf + boffset, + SM_SMALL_PAGE, oob.ecc1, + sm_order); - __nand_calculate_ecc(buf + boffset + SM_SMALL_PAGE, - SM_SMALL_PAGE, oob.ecc2, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); + ecc_sw_hamming_calculate(buf + boffset + SM_SMALL_PAGE, + SM_SMALL_PAGE, oob.ecc2, + sm_order); } if (!sm_write_sector(ftl, zone, block, boffset, buf + boffset, &oob)) diff --git a/drivers/mtd/tests/mtd_nandecctest.c b/drivers/mtd/tests/mtd_nandecctest.c index e92e3fb287b6..c4f271314f52 100644 --- a/drivers/mtd/tests/mtd_nandecctest.c +++ b/drivers/mtd/tests/mtd_nandecctest.c @@ -119,13 +119,13 @@ static void no_bit_error(void *error_data, void *error_ecc, static int no_bit_error_verify(void *error_data, void *error_ecc, void *correct_data, const size_t size) { + bool sm_order = IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC); unsigned char calc_ecc[3]; int ret; - __nand_calculate_ecc(error_data, size, calc_ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); - ret = __nand_correct_data(error_data, error_ecc, calc_ecc, size, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); + ecc_sw_hamming_calculate(error_data, size, calc_ecc, sm_order); + ret = ecc_sw_hamming_correct(error_data, error_ecc, calc_ecc, size, + sm_order); if (ret == 0 && !memcmp(correct_data, error_data, size)) return 0; @@ -149,13 +149,13 @@ static void single_bit_error_in_ecc(void *error_data, void *error_ecc, static int single_bit_error_correct(void *error_data, void *error_ecc, void *correct_data, const size_t size) { + bool sm_order = IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC); unsigned char calc_ecc[3]; int ret; - __nand_calculate_ecc(error_data, size, calc_ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); - ret = __nand_correct_data(error_data, error_ecc, calc_ecc, size, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); + ecc_sw_hamming_calculate(error_data, size, calc_ecc, sm_order); + ret = ecc_sw_hamming_correct(error_data, error_ecc, calc_ecc, size, + sm_order); if (ret == 1 && !memcmp(correct_data, error_data, size)) return 0; @@ -186,13 +186,13 @@ static void double_bit_error_in_ecc(void *error_data, void *error_ecc, static int 
double_bit_error_detect(void *error_data, void *error_ecc, void *correct_data, const size_t size) { + bool sm_order = IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC); unsigned char calc_ecc[3]; int ret; - __nand_calculate_ecc(error_data, size, calc_ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); - ret = __nand_correct_data(error_data, error_ecc, calc_ecc, size, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); + ecc_sw_hamming_calculate(error_data, size, calc_ecc, sm_order); + ret = ecc_sw_hamming_correct(error_data, error_ecc, calc_ecc, size, + sm_order); return (ret == -EBADMSG) ? 0 : -EINVAL; } @@ -248,6 +248,7 @@ static void dump_data_ecc(void *error_data, void *error_ecc, void *correct_data, static int nand_ecc_test_run(const size_t size) { + bool sm_order = IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC); int i; int err = 0; void *error_data; @@ -266,9 +267,7 @@ static int nand_ecc_test_run(const size_t size) } prandom_bytes(correct_data, size); - __nand_calculate_ecc(correct_data, size, correct_ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); - + ecc_sw_hamming_calculate(correct_data, size, correct_ecc, sm_order); for (i = 0; i < ARRAY_SIZE(nand_ecc_test); i++) { nand_ecc_test[i].prepare(error_data, error_ecc, correct_data, correct_ecc, size); diff --git a/include/linux/mtd/nand-ecc-sw-hamming.h b/include/linux/mtd/nand-ecc-sw-hamming.h index 85e9a929b5f9..84a123bf45f3 100644 --- a/include/linux/mtd/nand-ecc-sw-hamming.h +++ b/include/linux/mtd/nand-ecc-sw-hamming.h @@ -10,30 +10,18 @@ #ifndef __MTD_NAND_ECC_SW_HAMMING_H__ #define __MTD_NAND_ECC_SW_HAMMING_H__ -struct nand_chip; +#include -/* - * Calculate 3 byte ECC code for eccsize byte block - */ -void __nand_calculate_ecc(const u_char *dat, unsigned int eccsize, - u_char *ecc_code, bool sm_order); - -/* - * Calculate 3 byte ECC code for 256/512 byte block - */ -int nand_calculate_ecc(struct nand_chip *chip, const u_char *dat, - u_char *ecc_code); - -/* - * Detect and correct a 1 bit error for eccsize byte block - */ -int __nand_correct_data(u_char *dat, u_char *read_ecc, u_char *calc_ecc, - unsigned int eccsize, bool sm_order); - -/* - * Detect and correct a 1 bit error for 256/512 byte block - */ -int nand_correct_data(struct nand_chip *chip, u_char *dat, u_char *read_ecc, - u_char *calc_ecc); +int ecc_sw_hamming_calculate(const unsigned char *buf, unsigned int step_size, + unsigned char *code, bool sm_order); +int nand_ecc_sw_hamming_calculate(struct nand_device *nand, + const unsigned char *buf, + unsigned char *code); +int ecc_sw_hamming_correct(unsigned char *buf, unsigned char *read_ecc, + unsigned char *calc_ecc, unsigned int step_size, + bool sm_order); +int nand_ecc_sw_hamming_correct(struct nand_device *nand, unsigned char *buf, + unsigned char *read_ecc, + unsigned char *calc_ecc); #endif /* __MTD_NAND_ECC_SW_HAMMING_H__ */ diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 15743c2c2cab..685d24557e95 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1301,6 +1301,13 @@ static inline int nand_opcode_8bits(unsigned int command) return 0; } +int rawnand_sw_hamming_calculate(struct nand_chip *chip, + const unsigned char *buf, + unsigned char *code); +int rawnand_sw_hamming_correct(struct nand_chip *chip, + unsigned char *buf, + unsigned char *read_ecc, + unsigned char *calc_ecc); int rawnand_sw_bch_init(struct nand_chip *chip); int rawnand_sw_bch_correct(struct nand_chip *chip, unsigned char *buf, unsigned char *read_ecc, unsigned char *calc_ecc); -- cgit 
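The driver conversions above (cs553x, fsmc, lpc32xx, ndfc, sharpsl) are all mechanical; after this commit an attach-chip hook typically ends up looking like the following sketch, in which only rawnand_sw_hamming_correct() is a real symbol and the mydrv_* names are placeholders:

	static int mydrv_attach_chip(struct nand_chip *chip)
	{
		if (chip->ecc.engine_type != NAND_ECC_ENGINE_TYPE_ON_HOST)
			return 0;

		chip->ecc.size = 256;		/* data bytes per Hamming step */
		chip->ecc.bytes = 3;		/* ECC bytes per step */
		chip->ecc.strength = 1;		/* one correctable bit per step */
		chip->ecc.hwctl = mydrv_enable_hwecc;		/* placeholder */
		chip->ecc.calculate = mydrv_calculate_ecc;	/* placeholder */
		chip->ecc.correct = rawnand_sw_hamming_correct;

		return 0;
	}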
From 19b2ce184b9f404d6620adf667a9019e6abcae51 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:20 +0200 Subject: mtd: nand: ecc-hamming: Stop using raw NAND structures This code is meant to be reused by the SPI-NAND core. Now that the driver has been cleaned and reorganized, use a generic ECC engine object to store the driver's data instead of accessing members of the nand_chip structure. This means adding proper init/cleanup helpers. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-17-miquel.raynal@bootlin.com --- drivers/mtd/nand/ecc-sw-hamming.c | 17 +++++---- drivers/mtd/nand/raw/nand_base.c | 63 +++++++++++++++++++++++++++++++-- include/linux/mtd/nand-ecc-sw-hamming.h | 20 +++++++++++ include/linux/mtd/rawnand.h | 2 ++ 4 files changed, 90 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/ecc-sw-hamming.c b/drivers/mtd/nand/ecc-sw-hamming.c index 7ee0387ecb29..9cd70b4637c1 100644 --- a/drivers/mtd/nand/ecc-sw-hamming.c +++ b/drivers/mtd/nand/ecc-sw-hamming.c @@ -17,8 +17,6 @@ #include #include #include -#include -#include #include #include @@ -361,10 +359,11 @@ EXPORT_SYMBOL(ecc_sw_hamming_calculate); int nand_ecc_sw_hamming_calculate(struct nand_device *nand, const unsigned char *buf, unsigned char *code) { - struct nand_chip *chip = mtd_to_nand(nanddev_to_mtd(nand)); - bool sm_order = chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER; + struct nand_ecc_sw_hamming_conf *engine_conf = nand->ecc.ctx.priv; + unsigned int step_size = nand->ecc.ctx.conf.step_size; - return ecc_sw_hamming_calculate(buf, chip->ecc.size, code, sm_order); + return ecc_sw_hamming_calculate(buf, step_size, code, + engine_conf->sm_order); } EXPORT_SYMBOL(nand_ecc_sw_hamming_calculate); @@ -453,11 +452,11 @@ int nand_ecc_sw_hamming_correct(struct nand_device *nand, unsigned char *buf, unsigned char *read_ecc, unsigned char *calc_ecc) { - struct nand_chip *chip = mtd_to_nand(nanddev_to_mtd(nand)); - bool sm_order = chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER; + struct nand_ecc_sw_hamming_conf *engine_conf = nand->ecc.ctx.priv; + unsigned int step_size = nand->ecc.ctx.conf.step_size; - return ecc_sw_hamming_correct(buf, read_ecc, calc_ecc, chip->ecc.size, - sm_order); + return ecc_sw_hamming_correct(buf, read_ecc, calc_ecc, step_size, + engine_conf->sm_order); } EXPORT_SYMBOL(nand_ecc_sw_hamming_correct); diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 32e9870a2cf8..c36d2d4a72d5 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5139,6 +5139,46 @@ static void nand_scan_ident_cleanup(struct nand_chip *chip) kfree(chip->parameters.onfi); } +int rawnand_sw_hamming_init(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + struct nand_ecc_sw_hamming_conf *engine_conf; + struct nand_device *base = &chip->base; + + base->ecc.user_conf.engine_type = NAND_ECC_ENGINE_TYPE_SOFT; + base->ecc.user_conf.algo = NAND_ECC_ALGO_HAMMING; + base->ecc.user_conf.strength = chip->ecc.strength; + base->ecc.user_conf.step_size = chip->ecc.size; + + if (base->ecc.user_conf.strength != 1 || + (base->ecc.user_conf.step_size != 256 && + base->ecc.user_conf.step_size != 512)) { + pr_err("%s: unsupported strength or step size\n", __func__); + return -EINVAL; + } + + engine_conf = kzalloc(sizeof(*engine_conf), GFP_KERNEL); + if (!engine_conf) + return -ENOMEM; + + engine_conf->code_size = 3; + engine_conf->nsteps = mtd->writesize / 
base->ecc.user_conf.step_size; + + if (chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER) + engine_conf->sm_order = true; + + base->ecc.ctx.priv = engine_conf; + + chip->ecc.size = base->ecc.ctx.conf.step_size; + chip->ecc.strength = base->ecc.ctx.conf.strength; + chip->ecc.total = base->ecc.ctx.total; + chip->ecc.steps = engine_conf->nsteps; + chip->ecc.bytes = engine_conf->code_size; + + return 0; +} +EXPORT_SYMBOL(rawnand_sw_hamming_init); + int rawnand_sw_hamming_calculate(struct nand_chip *chip, const unsigned char *buf, unsigned char *code) @@ -5160,6 +5200,14 @@ int rawnand_sw_hamming_correct(struct nand_chip *chip, } EXPORT_SYMBOL(rawnand_sw_hamming_correct); +void rawnand_sw_hamming_cleanup(struct nand_chip *chip) +{ + struct nand_device *base = &chip->base; + + kfree(base->ecc.ctx.priv); +} +EXPORT_SYMBOL(rawnand_sw_hamming_cleanup); + int rawnand_sw_bch_init(struct nand_chip *chip) { struct nand_device *base = &chip->base; @@ -5303,6 +5351,12 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip) if (IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)) ecc->options |= NAND_ECC_SOFT_HAMMING_SM_ORDER; + ret = rawnand_sw_hamming_init(chip); + if (ret) { + WARN(1, "Hamming ECC initialization failed!\n"); + return ret; + } + return 0; case NAND_ECC_ALGO_BCH: if (!IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH)) { @@ -5996,9 +6050,12 @@ EXPORT_SYMBOL(nand_scan_with_ids); */ void nand_cleanup(struct nand_chip *chip) { - if (chip->ecc.engine_type == NAND_ECC_ENGINE_TYPE_SOFT && - chip->ecc.algo == NAND_ECC_ALGO_BCH) - rawnand_sw_bch_cleanup(chip); + if (chip->ecc.engine_type == NAND_ECC_ENGINE_TYPE_SOFT) { + if (chip->ecc.algo == NAND_ECC_ALGO_HAMMING) + rawnand_sw_hamming_cleanup(chip); + else if (chip->ecc.algo == NAND_ECC_ALGO_BCH) + rawnand_sw_bch_cleanup(chip); + } nanddev_cleanup(&chip->base); diff --git a/include/linux/mtd/nand-ecc-sw-hamming.h b/include/linux/mtd/nand-ecc-sw-hamming.h index 84a123bf45f3..9d4b4d623741 100644 --- a/include/linux/mtd/nand-ecc-sw-hamming.h +++ b/include/linux/mtd/nand-ecc-sw-hamming.h @@ -12,6 +12,26 @@ #include +/** + * struct nand_ecc_sw_hamming_conf - private software Hamming ECC engine structure + * @reqooblen: Save the actual user OOB length requested before overwriting it + * @spare_oobbuf: Spare OOB buffer if none is provided + * @code_size: Number of bytes needed to store a code (one code per step) + * @nsteps: Number of steps + * @calc_buf: Buffer to use when calculating ECC bytes + * @code_buf: Buffer to use when reading (raw) ECC bytes from the chip + * @sm_order: Smart Media special ordering + */ +struct nand_ecc_sw_hamming_conf { + unsigned int reqooblen; + void *spare_oobbuf; + unsigned int code_size; + unsigned int nsteps; + u8 *calc_buf; + u8 *code_buf; + unsigned int sm_order; +}; + int ecc_sw_hamming_calculate(const unsigned char *buf, unsigned int step_size, unsigned char *code, bool sm_order); int nand_ecc_sw_hamming_calculate(struct nand_device *nand, diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 685d24557e95..0a90a8792d18 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1301,6 +1301,7 @@ static inline int nand_opcode_8bits(unsigned int command) return 0; } +int rawnand_sw_hamming_init(struct nand_chip *chip); int rawnand_sw_hamming_calculate(struct nand_chip *chip, const unsigned char *buf, unsigned char *code); @@ -1308,6 +1309,7 @@ int rawnand_sw_hamming_correct(struct nand_chip *chip, unsigned char *buf, unsigned char *read_ecc, unsigned char *calc_ecc); +void 
rawnand_sw_hamming_cleanup(struct nand_chip *chip); int rawnand_sw_bch_init(struct nand_chip *chip); int rawnand_sw_bch_correct(struct nand_chip *chip, unsigned char *buf, unsigned char *read_ecc, unsigned char *calc_ecc); -- cgit From eb08376a5dd943cf2a7360f236fe20bbd709fa95 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:21 +0200 Subject: mtd: nand: ecc-hamming: Remove useless includes Most of the includes are simply useless, drop them. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-18-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/fsl_elbc_nand.c | 1 - drivers/mtd/nand/raw/fsl_ifc_nand.c | 1 - drivers/mtd/nand/raw/fsl_upm.c | 1 - drivers/mtd/nand/raw/fsmc_nand.c | 1 - drivers/mtd/nand/raw/lpc32xx_mlc.c | 1 - drivers/mtd/nand/raw/lpc32xx_slc.c | 1 - drivers/mtd/nand/raw/pasemi_nand.c | 1 - drivers/mtd/nand/raw/s3c2410.c | 1 - drivers/mtd/nand/raw/sharpsl.c | 1 - drivers/mtd/nand/raw/tmio_nand.c | 1 - drivers/mtd/nand/raw/txx9ndfmc.c | 1 - include/linux/mtd/sharpsl.h | 1 - 12 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/fsl_elbc_nand.c b/drivers/mtd/nand/raw/fsl_elbc_nand.c index eb255cf83e32..aab93b9e6052 100644 --- a/drivers/mtd/nand/raw/fsl_elbc_nand.c +++ b/drivers/mtd/nand/raw/fsl_elbc_nand.c @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/drivers/mtd/nand/raw/fsl_ifc_nand.c b/drivers/mtd/nand/raw/fsl_ifc_nand.c index d2f84917fa8b..02d500176838 100644 --- a/drivers/mtd/nand/raw/fsl_ifc_nand.c +++ b/drivers/mtd/nand/raw/fsl_ifc_nand.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/drivers/mtd/nand/raw/fsl_upm.c b/drivers/mtd/nand/raw/fsl_upm.c index 3824361928a1..b3cc427100a2 100644 --- a/drivers/mtd/nand/raw/fsl_upm.c +++ b/drivers/mtd/nand/raw/fsl_upm.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mtd/nand/raw/fsmc_nand.c b/drivers/mtd/nand/raw/fsmc_nand.c index 1e2ed45be2f1..0101c0fab50a 100644 --- a/drivers/mtd/nand/raw/fsmc_nand.c +++ b/drivers/mtd/nand/raw/fsmc_nand.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mtd/nand/raw/lpc32xx_mlc.c b/drivers/mtd/nand/raw/lpc32xx_mlc.c index 885b03b7af52..452ecaf7775a 100644 --- a/drivers/mtd/nand/raw/lpc32xx_mlc.c +++ b/drivers/mtd/nand/raw/lpc32xx_mlc.c @@ -31,7 +31,6 @@ #include #include #include -#include #define DRV_NAME "lpc32xx_mlc" diff --git a/drivers/mtd/nand/raw/lpc32xx_slc.c b/drivers/mtd/nand/raw/lpc32xx_slc.c index beecabfa5181..6b7269cfb7d8 100644 --- a/drivers/mtd/nand/raw/lpc32xx_slc.c +++ b/drivers/mtd/nand/raw/lpc32xx_slc.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mtd/nand/raw/pasemi_nand.c b/drivers/mtd/nand/raw/pasemi_nand.c index 2dddbdca9e94..5741887d7a24 100644 --- a/drivers/mtd/nand/raw/pasemi_nand.c +++ b/drivers/mtd/nand/raw/pasemi_nand.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mtd/nand/raw/s3c2410.c b/drivers/mtd/nand/raw/s3c2410.c index f7eb3b5fa792..cb2d1b4e278c 100644 --- a/drivers/mtd/nand/raw/s3c2410.c +++ b/drivers/mtd/nand/raw/s3c2410.c @@ -30,7 +30,6 @@ #include #include -#include #include #include diff --git a/drivers/mtd/nand/raw/sharpsl.c b/drivers/mtd/nand/raw/sharpsl.c index 3f55b523aead..5612ee628425 100644 --- a/drivers/mtd/nand/raw/sharpsl.c +++ 
b/drivers/mtd/nand/raw/sharpsl.c
@@ -12,7 +12,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/drivers/mtd/nand/raw/tmio_nand.c b/drivers/mtd/nand/raw/tmio_nand.c
index 29edacbec826..de8e919d0ebe 100644
--- a/drivers/mtd/nand/raw/tmio_nand.c
+++ b/drivers/mtd/nand/raw/tmio_nand.c
@@ -35,7 +35,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/drivers/mtd/nand/raw/txx9ndfmc.c b/drivers/mtd/nand/raw/txx9ndfmc.c
index a705523eb916..1a9449e53bf9 100644
--- a/drivers/mtd/nand/raw/txx9ndfmc.c
+++ b/drivers/mtd/nand/raw/txx9ndfmc.c
@@ -14,7 +14,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/include/linux/mtd/sharpsl.h b/include/linux/mtd/sharpsl.h
index 3762a90077d6..231bd1c3f408 100644
--- a/include/linux/mtd/sharpsl.h
+++ b/include/linux/mtd/sharpsl.h
@@ -9,7 +9,6 @@
 #define _MTD_SHARPSL_H
 
 #include
-#include
 #include
 
 struct sharpsl_nand_platform_data {
-- cgit
From 5180a62c12497aa491a7c79c062a9e3a884c9762 Mon Sep 17 00:00:00 2001
From: Miquel Raynal
Date: Wed, 30 Sep 2020 01:01:22 +0200
Subject: mtd: nand: ecc-hamming: Let the software Hamming ECC engine be unselected

There is no reason to always embed the software Hamming ECC engine
implementation. It is still selected by default with raw NAND, but the
user can now decide to leave it out.

Signed-off-by: Miquel Raynal
Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-19-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/Kconfig                | 11 +++++++++-
 drivers/mtd/nand/raw/Kconfig            |  2 +-
 include/linux/mtd/nand-ecc-sw-hamming.h | 36 +++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 306c33ca3448..15ffe1a1b863 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -16,7 +16,16 @@ config MTD_NAND_ECC
 	depends on MTD_NAND_CORE
 
 config MTD_NAND_ECC_SW_HAMMING
-	bool
+	bool "Software Hamming ECC engine"
+	default y if MTD_RAW_NAND
+	select MTD_NAND_ECC
+	help
+	  This enables support for software Hamming error correction,
+	  which can correct up to 1 bit error per chunk and detect up
+	  to 2 bit errors. While it used to be widely used with old
+	  parts, newer NAND chips usually require stronger correction,
+	  in which case BCH or RS is preferred.
 
 config MTD_NAND_ECC_SW_HAMMING_SMC
 	bool "NAND ECC Smart Media byte order"
diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig
index 6149096e32cb..cc86d6e4e042 100644
--- a/drivers/mtd/nand/raw/Kconfig
+++ b/drivers/mtd/nand/raw/Kconfig
@@ -3,7 +3,6 @@
 menuconfig MTD_RAW_NAND
 	tristate "Raw/Parallel NAND Device Support"
 	select MTD_NAND_CORE
 	select MTD_NAND_ECC
-	select MTD_NAND_ECC_SW_HAMMING
 	help
 	  This enables support for accessing all types of raw/parallel
 	  NAND flash devices.
For further information see @@ -72,6 +71,7 @@ config MTD_NAND_AU1550 config MTD_NAND_NDFC tristate "IBM/MCC 4xx NAND controller" depends on 4xx + select MTD_NAND_ECC_SW_HAMMING select MTD_NAND_ECC_SW_HAMMING_SMC help NDFC Nand Flash Controllers are integrated in IBM/AMCC's 4xx SoCs diff --git a/include/linux/mtd/nand-ecc-sw-hamming.h b/include/linux/mtd/nand-ecc-sw-hamming.h index 9d4b4d623741..5a39e96c3546 100644 --- a/include/linux/mtd/nand-ecc-sw-hamming.h +++ b/include/linux/mtd/nand-ecc-sw-hamming.h @@ -32,6 +32,8 @@ struct nand_ecc_sw_hamming_conf { unsigned int sm_order; }; +#if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING) + int ecc_sw_hamming_calculate(const unsigned char *buf, unsigned int step_size, unsigned char *code, bool sm_order); int nand_ecc_sw_hamming_calculate(struct nand_device *nand, @@ -44,4 +46,38 @@ int nand_ecc_sw_hamming_correct(struct nand_device *nand, unsigned char *buf, unsigned char *read_ecc, unsigned char *calc_ecc); +#else /* !CONFIG_MTD_NAND_ECC_SW_HAMMING */ + +static inline int ecc_sw_hamming_calculate(const unsigned char *buf, + unsigned int step_size, + unsigned char *code, bool sm_order) +{ + return -ENOTSUPP; +} + +static inline int nand_ecc_sw_hamming_calculate(struct nand_device *nand, + const unsigned char *buf, + unsigned char *code) +{ + return -ENOTSUPP; +} + +static inline int ecc_sw_hamming_correct(unsigned char *buf, + unsigned char *read_ecc, + unsigned char *calc_ecc, + unsigned int step_size, bool sm_order) +{ + return -ENOTSUPP; +} + +static inline int nand_ecc_sw_hamming_correct(struct nand_device *nand, + unsigned char *buf, + unsigned char *read_ecc, + unsigned char *calc_ecc) +{ + return -ENOTSUPP; +} + +#endif /* CONFIG_MTD_NAND_ECC_SW_HAMMING */ + #endif /* __MTD_NAND_ECC_SW_HAMMING_H__ */ -- cgit From 35fe1b98a0082ad3f576bcc420c74dab435da307 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:23 +0200 Subject: mtd: nand: ecc-hamming: Create the software Hamming engine Let's continue introducing the generic ECC engine abstraction in the NAND subsystem by instantiating a second ECC engine: software Hamming. 
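Before the diff itself, a short sketch of the consumer side: a NAND core that wants this engine grabs the shared object and runs its init hook. The mycore_* wrapper is invented; the getter and the ops structure are defined in the hunks that follow, and nand->ecc.engine is assumed to be the generic engine pointer from linux/mtd/nand.h:

	static int mycore_use_soft_hamming(struct nand_device *nand)
	{
		nand->ecc.engine = nand_ecc_sw_hamming_get_engine();

		/* init_ctx() picks an OOB layout, sizes buffers, fills ctx.conf */
		return nand->ecc.engine->ops->init_ctx(nand);
	}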
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-20-miquel.raynal@bootlin.com --- drivers/mtd/nand/ecc-sw-hamming.c | 193 ++++++++++++++++++++++++++++++++ drivers/mtd/nand/raw/nand_base.c | 26 ++--- include/linux/mtd/nand-ecc-sw-hamming.h | 16 ++- include/linux/mtd/nand.h | 9 ++ 4 files changed, 223 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/ecc-sw-hamming.c b/drivers/mtd/nand/ecc-sw-hamming.c index 9cd70b4637c1..c95aadf1eb43 100644 --- a/drivers/mtd/nand/ecc-sw-hamming.c +++ b/drivers/mtd/nand/ecc-sw-hamming.c @@ -17,7 +17,9 @@ #include #include #include +#include #include +#include #include /* @@ -460,6 +462,197 @@ int nand_ecc_sw_hamming_correct(struct nand_device *nand, unsigned char *buf, } EXPORT_SYMBOL(nand_ecc_sw_hamming_correct); +int nand_ecc_sw_hamming_init_ctx(struct nand_device *nand) +{ + struct nand_ecc_props *conf = &nand->ecc.ctx.conf; + struct nand_ecc_sw_hamming_conf *engine_conf; + struct mtd_info *mtd = nanddev_to_mtd(nand); + int ret; + + if (!mtd->ooblayout) { + switch (mtd->oobsize) { + case 8: + case 16: + mtd_set_ooblayout(mtd, nand_get_small_page_ooblayout()); + break; + case 64: + case 128: + mtd_set_ooblayout(mtd, + nand_get_large_page_hamming_ooblayout()); + break; + default: + return -ENOTSUPP; + } + } + + conf->engine_type = NAND_ECC_ENGINE_TYPE_SOFT; + conf->algo = NAND_ECC_ALGO_HAMMING; + conf->step_size = nand->ecc.user_conf.step_size; + conf->strength = 1; + + /* Use the strongest configuration by default */ + if (conf->step_size != 256 && conf->step_size != 512) + conf->step_size = 256; + + engine_conf = kzalloc(sizeof(*engine_conf), GFP_KERNEL); + if (!engine_conf) + return -ENOMEM; + + ret = nand_ecc_init_req_tweaking(&engine_conf->req_ctx, nand); + if (ret) + goto free_engine_conf; + + engine_conf->code_size = 3; + engine_conf->nsteps = mtd->writesize / conf->step_size; + engine_conf->calc_buf = kzalloc(mtd->oobsize, GFP_KERNEL); + engine_conf->code_buf = kzalloc(mtd->oobsize, GFP_KERNEL); + if (!engine_conf->calc_buf || !engine_conf->code_buf) { + ret = -ENOMEM; + goto free_bufs; + } + + nand->ecc.ctx.priv = engine_conf; + nand->ecc.ctx.total = engine_conf->nsteps * engine_conf->code_size; + + return 0; + +free_bufs: + nand_ecc_cleanup_req_tweaking(&engine_conf->req_ctx); + kfree(engine_conf->calc_buf); + kfree(engine_conf->code_buf); +free_engine_conf: + kfree(engine_conf); + + return ret; +} +EXPORT_SYMBOL(nand_ecc_sw_hamming_init_ctx); + +void nand_ecc_sw_hamming_cleanup_ctx(struct nand_device *nand) +{ + struct nand_ecc_sw_hamming_conf *engine_conf = nand->ecc.ctx.priv; + + if (engine_conf) { + nand_ecc_cleanup_req_tweaking(&engine_conf->req_ctx); + kfree(engine_conf->calc_buf); + kfree(engine_conf->code_buf); + kfree(engine_conf); + } +} +EXPORT_SYMBOL(nand_ecc_sw_hamming_cleanup_ctx); + +static int nand_ecc_sw_hamming_prepare_io_req(struct nand_device *nand, + struct nand_page_io_req *req) +{ + struct nand_ecc_sw_hamming_conf *engine_conf = nand->ecc.ctx.priv; + struct mtd_info *mtd = nanddev_to_mtd(nand); + int eccsize = nand->ecc.ctx.conf.step_size; + int eccbytes = engine_conf->code_size; + int eccsteps = engine_conf->nsteps; + int total = nand->ecc.ctx.total; + u8 *ecccalc = engine_conf->calc_buf; + const u8 *data; + int i; + + /* Nothing to do for a raw operation */ + if (req->mode == MTD_OPS_RAW) + return 0; + + /* This engine does not provide BBM/free OOB bytes protection */ + if (!req->datalen) + return 0; + + 
nand_ecc_tweak_req(&engine_conf->req_ctx, req); + + /* No more preparation for page read */ + if (req->type == NAND_PAGE_READ) + return 0; + + /* Preparation for page write: derive the ECC bytes and place them */ + for (i = 0, data = req->databuf.out; + eccsteps; + eccsteps--, i += eccbytes, data += eccsize) + nand_ecc_sw_hamming_calculate(nand, data, &ecccalc[i]); + + return mtd_ooblayout_set_eccbytes(mtd, ecccalc, (void *)req->oobbuf.out, + 0, total); +} + +static int nand_ecc_sw_hamming_finish_io_req(struct nand_device *nand, + struct nand_page_io_req *req) +{ + struct nand_ecc_sw_hamming_conf *engine_conf = nand->ecc.ctx.priv; + struct mtd_info *mtd = nanddev_to_mtd(nand); + int eccsize = nand->ecc.ctx.conf.step_size; + int total = nand->ecc.ctx.total; + int eccbytes = engine_conf->code_size; + int eccsteps = engine_conf->nsteps; + u8 *ecccalc = engine_conf->calc_buf; + u8 *ecccode = engine_conf->code_buf; + unsigned int max_bitflips = 0; + u8 *data = req->databuf.in; + int i, ret; + + /* Nothing to do for a raw operation */ + if (req->mode == MTD_OPS_RAW) + return 0; + + /* This engine does not provide BBM/free OOB bytes protection */ + if (!req->datalen) + return 0; + + /* No more preparation for page write */ + if (req->type == NAND_PAGE_WRITE) { + nand_ecc_restore_req(&engine_conf->req_ctx, req); + return 0; + } + + /* Finish a page read: retrieve the (raw) ECC bytes*/ + ret = mtd_ooblayout_get_eccbytes(mtd, ecccode, req->oobbuf.in, 0, + total); + if (ret) + return ret; + + /* Calculate the ECC bytes */ + for (i = 0; eccsteps; eccsteps--, i += eccbytes, data += eccsize) + nand_ecc_sw_hamming_calculate(nand, data, &ecccalc[i]); + + /* Finish a page read: compare and correct */ + for (eccsteps = engine_conf->nsteps, i = 0, data = req->databuf.in; + eccsteps; + eccsteps--, i += eccbytes, data += eccsize) { + int stat = nand_ecc_sw_hamming_correct(nand, data, + &ecccode[i], + &ecccalc[i]); + if (stat < 0) { + mtd->ecc_stats.failed++; + } else { + mtd->ecc_stats.corrected += stat; + max_bitflips = max_t(unsigned int, max_bitflips, stat); + } + } + + nand_ecc_restore_req(&engine_conf->req_ctx, req); + + return max_bitflips; +} + +static struct nand_ecc_engine_ops nand_ecc_sw_hamming_engine_ops = { + .init_ctx = nand_ecc_sw_hamming_init_ctx, + .cleanup_ctx = nand_ecc_sw_hamming_cleanup_ctx, + .prepare_io_req = nand_ecc_sw_hamming_prepare_io_req, + .finish_io_req = nand_ecc_sw_hamming_finish_io_req, +}; + +static struct nand_ecc_engine nand_ecc_sw_hamming_engine = { + .ops = &nand_ecc_sw_hamming_engine_ops, +}; + +struct nand_ecc_engine *nand_ecc_sw_hamming_get_engine(void) +{ + return &nand_ecc_sw_hamming_engine; +} +EXPORT_SYMBOL(nand_ecc_sw_hamming_get_engine); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Frans Meulenbroeks "); MODULE_DESCRIPTION("NAND software Hamming ECC support"); diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index c36d2d4a72d5..c33fa1b1847f 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5141,34 +5141,24 @@ static void nand_scan_ident_cleanup(struct nand_chip *chip) int rawnand_sw_hamming_init(struct nand_chip *chip) { - struct mtd_info *mtd = nand_to_mtd(chip); struct nand_ecc_sw_hamming_conf *engine_conf; struct nand_device *base = &chip->base; + int ret; base->ecc.user_conf.engine_type = NAND_ECC_ENGINE_TYPE_SOFT; base->ecc.user_conf.algo = NAND_ECC_ALGO_HAMMING; base->ecc.user_conf.strength = chip->ecc.strength; base->ecc.user_conf.step_size = chip->ecc.size; - if 
(base->ecc.user_conf.strength != 1 || - (base->ecc.user_conf.step_size != 256 && - base->ecc.user_conf.step_size != 512)) { - pr_err("%s: unsupported strength or step size\n", __func__); - return -EINVAL; - } - - engine_conf = kzalloc(sizeof(*engine_conf), GFP_KERNEL); - if (!engine_conf) - return -ENOMEM; + ret = nand_ecc_sw_hamming_init_ctx(base); + if (ret) + return ret; - engine_conf->code_size = 3; - engine_conf->nsteps = mtd->writesize / base->ecc.user_conf.step_size; + engine_conf = base->ecc.ctx.priv; if (chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER) engine_conf->sm_order = true; - base->ecc.ctx.priv = engine_conf; - chip->ecc.size = base->ecc.ctx.conf.step_size; chip->ecc.strength = base->ecc.ctx.conf.strength; chip->ecc.total = base->ecc.ctx.total; @@ -5204,7 +5194,7 @@ void rawnand_sw_hamming_cleanup(struct nand_chip *chip) { struct nand_device *base = &chip->base; - kfree(base->ecc.ctx.priv); + nand_ecc_sw_hamming_cleanup_ctx(base); } EXPORT_SYMBOL(rawnand_sw_hamming_cleanup); @@ -5733,7 +5723,9 @@ static int nand_scan_tail(struct nand_chip *chip) */ if (!mtd->ooblayout && !(ecc->engine_type == NAND_ECC_ENGINE_TYPE_SOFT && - ecc->algo == NAND_ECC_ALGO_BCH)) { + ecc->algo == NAND_ECC_ALGO_BCH) && + !(ecc->engine_type == NAND_ECC_ENGINE_TYPE_SOFT && + ecc->algo == NAND_ECC_ALGO_HAMMING)) { switch (mtd->oobsize) { case 8: case 16: diff --git a/include/linux/mtd/nand-ecc-sw-hamming.h b/include/linux/mtd/nand-ecc-sw-hamming.h index 5a39e96c3546..9f9073d86ff3 100644 --- a/include/linux/mtd/nand-ecc-sw-hamming.h +++ b/include/linux/mtd/nand-ecc-sw-hamming.h @@ -14,8 +14,8 @@ /** * struct nand_ecc_sw_hamming_conf - private software Hamming ECC engine structure - * @reqooblen: Save the actual user OOB length requested before overwriting it - * @spare_oobbuf: Spare OOB buffer if none is provided + * @req_ctx: Save request context and tweak the original request to fit the + * engine needs * @code_size: Number of bytes needed to store a code (one code per step) * @nsteps: Number of steps * @calc_buf: Buffer to use when calculating ECC bytes @@ -23,8 +23,7 @@ * @sm_order: Smart Media special ordering */ struct nand_ecc_sw_hamming_conf { - unsigned int reqooblen; - void *spare_oobbuf; + struct nand_ecc_req_tweak_ctx req_ctx; unsigned int code_size; unsigned int nsteps; u8 *calc_buf; @@ -34,6 +33,8 @@ struct nand_ecc_sw_hamming_conf { #if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING) +int nand_ecc_sw_hamming_init_ctx(struct nand_device *nand); +void nand_ecc_sw_hamming_cleanup_ctx(struct nand_device *nand); int ecc_sw_hamming_calculate(const unsigned char *buf, unsigned int step_size, unsigned char *code, bool sm_order); int nand_ecc_sw_hamming_calculate(struct nand_device *nand, @@ -48,6 +49,13 @@ int nand_ecc_sw_hamming_correct(struct nand_device *nand, unsigned char *buf, #else /* !CONFIG_MTD_NAND_ECC_SW_HAMMING */ +static inline int nand_ecc_sw_hamming_init_ctx(struct nand_device *nand) +{ + return -ENOTSUPP; +} + +static inline void nand_ecc_sw_hamming_cleanup_ctx(struct nand_device *nand) {} + static inline int ecc_sw_hamming_calculate(const unsigned char *buf, unsigned int step_size, unsigned char *code, bool sm_order) diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index df8548187713..3616fa27eaa1 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -278,6 +278,15 @@ int nand_ecc_finish_io_req(struct nand_device *nand, struct nand_page_io_req *req); bool nand_ecc_is_strong_enough(struct nand_device *nand); +#if 
IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING) +struct nand_ecc_engine *nand_ecc_sw_hamming_get_engine(void); +#else +static inline struct nand_ecc_engine *nand_ecc_sw_hamming_get_engine(void) +{ + return NULL; +} +#endif /* CONFIG_MTD_NAND_ECC_SW_HAMMING */ + #if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH) struct nand_ecc_engine *nand_ecc_sw_bch_get_engine(void); #else -- cgit From 53fbdeeb57a0168a88547e22f8d433810c531169 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 01:01:24 +0200 Subject: mtd: nand: Let software ECC engines be retrieved from the NAND core Before making use of the ECC engines, we must retrieve them. Add the boilerplate for the ones already available: software engines (Hamming and BCH). Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200929230124.31491-21-miquel.raynal@bootlin.com --- drivers/mtd/nand/ecc.c | 20 ++++++++++++++++++++ include/linux/mtd/nand.h | 1 + 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/ecc.c b/drivers/mtd/nand/ecc.c index 541137736fd3..57bc8c5ef08c 100644 --- a/drivers/mtd/nand/ecc.c +++ b/drivers/mtd/nand/ecc.c @@ -585,6 +585,26 @@ void nand_ecc_restore_req(struct nand_ecc_req_tweak_ctx *ctx, } EXPORT_SYMBOL_GPL(nand_ecc_restore_req); +struct nand_ecc_engine *nand_ecc_get_sw_engine(struct nand_device *nand) +{ + unsigned int algo = nand->ecc.user_conf.algo; + + if (algo == NAND_ECC_ALGO_UNKNOWN) + algo = nand->ecc.defaults.algo; + + switch (algo) { + case NAND_ECC_ALGO_HAMMING: + return nand_ecc_sw_hamming_get_engine(); + case NAND_ECC_ALGO_BCH: + return nand_ecc_sw_bch_get_engine(); + default: + break; + } + + return NULL; +} +EXPORT_SYMBOL(nand_ecc_get_sw_engine); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Miquel Raynal "); MODULE_DESCRIPTION("Generic ECC engine"); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 3616fa27eaa1..f78f61c9a9ee 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -277,6 +277,7 @@ int nand_ecc_prepare_io_req(struct nand_device *nand, int nand_ecc_finish_io_req(struct nand_device *nand, struct nand_page_io_req *req); bool nand_ecc_is_strong_enough(struct nand_device *nand); +struct nand_ecc_engine *nand_ecc_get_sw_engine(struct nand_device *nand); #if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING) struct nand_ecc_engine *nand_ecc_sw_hamming_get_engine(void); -- cgit From 945845b54c9cf61809d1963492bb728ce8937964 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 17:41:07 +0200 Subject: mtd: spinand: Instantiate a SPI-NAND on-die ECC engine Make use of the existing functions taken from the SPI-NAND core to instantiate an on-die ECC engine specific to the SPI-NAND core. The next step will be to tweak the core to use this object instead of calling the helpers directly. 
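Every engine follows the same shape: implement the four nand_ecc_engine_ops callbacks and expose a struct nand_ecc_engine pointing at them. A hypothetical skeleton (names are illustrative, not from the patch):

static int my_ecc_init_ctx(struct nand_device *nand)
{
	/* Allocate private data, pick an OOB layout, fill ecc.ctx.conf */
	return 0;
}

static void my_ecc_cleanup_ctx(struct nand_device *nand)
{
	/* Release whatever init_ctx() allocated */
}

static int my_ecc_prepare_io_req(struct nand_device *nand,
				 struct nand_page_io_req *req)
{
	/* Runs before the I/O: enable hardware or precompute ECC bytes */
	return 0;
}

static int my_ecc_finish_io_req(struct nand_device *nand,
				struct nand_page_io_req *req)
{
	/* Runs after the I/O: correct data and report bitflips */
	return 0;
}

static struct nand_ecc_engine_ops my_ecc_engine_ops = {
	.init_ctx = my_ecc_init_ctx,
	.cleanup_ctx = my_ecc_cleanup_ctx,
	.prepare_io_req = my_ecc_prepare_io_req,
	.finish_io_req = my_ecc_finish_io_req,
};

static struct nand_ecc_engine my_ecc_engine = {
	.ops = &my_ecc_engine_ops,
};

The on-die instance below is the simplest possible incarnation of that shape: its ->prepare_io_req() merely toggles the chip's internal engine and its ->finish_io_req() translates the chip status into bitflip counts.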
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200930154109.3922-4-miquel.raynal@bootlin.com --- drivers/mtd/nand/spi/core.c | 67 +++++++++++++++++++++++++++++++++++++++ include/linux/mtd/spinand.h | 9 ++++++ 2 files changed, 76 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c index b2830e19a7f6..069ebf85dd3f 100644 --- a/drivers/mtd/nand/spi/core.c +++ b/drivers/mtd/nand/spi/core.c @@ -246,6 +246,73 @@ static const struct mtd_ooblayout_ops spinand_noecc_ooblayout = { .free = spinand_noecc_ooblayout_free, }; +static int spinand_ondie_ecc_init_ctx(struct nand_device *nand) +{ + struct spinand_device *spinand = nand_to_spinand(nand); + struct mtd_info *mtd = nanddev_to_mtd(nand); + struct spinand_ondie_ecc_conf *engine_conf; + + nand->ecc.ctx.conf.engine_type = NAND_ECC_ENGINE_TYPE_ON_DIE; + nand->ecc.ctx.conf.step_size = nand->ecc.requirements.step_size; + nand->ecc.ctx.conf.strength = nand->ecc.requirements.strength; + + engine_conf = kzalloc(sizeof(*engine_conf), GFP_KERNEL); + if (!engine_conf) + return -ENOMEM; + + nand->ecc.ctx.priv = engine_conf; + + if (spinand->eccinfo.ooblayout) + mtd_set_ooblayout(mtd, spinand->eccinfo.ooblayout); + else + mtd_set_ooblayout(mtd, &spinand_noecc_ooblayout); + + return 0; +} + +static void spinand_ondie_ecc_cleanup_ctx(struct nand_device *nand) +{ + kfree(nand->ecc.ctx.priv); +} + +static int spinand_ondie_ecc_prepare_io_req(struct nand_device *nand, + struct nand_page_io_req *req) +{ + struct spinand_device *spinand = nand_to_spinand(nand); + bool enable = (req->mode != MTD_OPS_RAW); + + /* Only enable or disable the engine */ + return spinand_ecc_enable(spinand, enable); +} + +static int spinand_ondie_ecc_finish_io_req(struct nand_device *nand, + struct nand_page_io_req *req) +{ + struct spinand_ondie_ecc_conf *engine_conf = nand->ecc.ctx.priv; + struct spinand_device *spinand = nand_to_spinand(nand); + + if (req->mode == MTD_OPS_RAW) + return 0; + + /* Nothing to do when finishing a page write */ + if (req->type == NAND_PAGE_WRITE) + return 0; + + /* Finish a page read: check the status, report errors/bitflips */ + return spinand_check_ecc_status(spinand, engine_conf->status); +} + +static struct nand_ecc_engine_ops spinand_ondie_ecc_engine_ops = { + .init_ctx = spinand_ondie_ecc_init_ctx, + .cleanup_ctx = spinand_ondie_ecc_cleanup_ctx, + .prepare_io_req = spinand_ondie_ecc_prepare_io_req, + .finish_io_req = spinand_ondie_ecc_finish_io_req, +}; + +static __maybe_unused struct nand_ecc_engine spinand_ondie_ecc_engine = { + .ops = &spinand_ondie_ecc_engine_ops, +}; + static int spinand_write_enable_op(struct spinand_device *spinand) { struct spi_mem_op op = SPINAND_WR_EN_DIS_OP(true); diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 7b78c4ba9b3e..6bb92f26833e 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -286,6 +286,15 @@ struct spinand_ecc_info { #define SPINAND_HAS_QE_BIT BIT(0) #define SPINAND_HAS_CR_FEAT_BIT BIT(1) +/** + * struct spinand_ondie_ecc_conf - private SPI-NAND on-die ECC engine structure + * @status: status of the last wait operation that will be used in case + * ->get_status() is not populated by the spinand device.
+ */ +struct spinand_ondie_ecc_conf { + u8 status; +}; + /** * struct spinand_info - Structure used to describe SPI NAND chips * @model: model name -- cgit From da429b9615803b6f19e5734c4c4d99136e1e3bfd Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 30 Sep 2020 17:41:08 +0200 Subject: mtd: nand: Let on-die ECC engines be retrieved from the NAND core Before making use of the ECC engines, we must retrieve them. Add the necessary boilerplate. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200930154109.3922-5-miquel.raynal@bootlin.com --- drivers/mtd/nand/ecc.c | 6 ++++++ include/linux/mtd/nand.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/ecc.c b/drivers/mtd/nand/ecc.c index 57bc8c5ef08c..9afdde70824a 100644 --- a/drivers/mtd/nand/ecc.c +++ b/drivers/mtd/nand/ecc.c @@ -605,6 +605,12 @@ struct nand_ecc_engine *nand_ecc_get_sw_engine(struct nand_device *nand) } EXPORT_SYMBOL(nand_ecc_get_sw_engine); +struct nand_ecc_engine *nand_ecc_get_on_die_hw_engine(struct nand_device *nand) +{ + return nand->ecc.ondie_engine; +} +EXPORT_SYMBOL(nand_ecc_get_on_die_hw_engine); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Miquel Raynal "); MODULE_DESCRIPTION("Generic ECC engine"); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index f78f61c9a9ee..6c6f91c03c42 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -278,6 +278,7 @@ int nand_ecc_finish_io_req(struct nand_device *nand, struct nand_page_io_req *req); bool nand_ecc_is_strong_enough(struct nand_device *nand); struct nand_ecc_engine *nand_ecc_get_sw_engine(struct nand_device *nand); +struct nand_ecc_engine *nand_ecc_get_on_die_hw_engine(struct nand_device *nand); #if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING) struct nand_ecc_engine *nand_ecc_sw_hamming_get_engine(void); -- cgit From 6b0c3b84156125e029956e46d2b44e72f513a9fa Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 1 Oct 2020 12:20:09 +0200 Subject: mtd: nand: Add helpers to manage ECC engines and configurations Add the logic in the NAND core to find the right ECC engine depending on the NAND chip requirements and the user desires. Right now, the choice may be made between (more will come): * software Hamming * software BCH * on-die (SPI-NAND devices only) Once the ECC engine has been found, the ECC engine must be configured. 
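From a NAND driver's point of view the whole selection and configuration sequence collapses into one init/cleanup pair. A hedged sketch of the intended call sites (everything but the two nanddev_* helpers is hypothetical):

static int my_controller_attach(struct nand_device *nand)
{
	int ret;

	/* Reads the DT user config, then picks and configures an engine */
	ret = nanddev_ecc_engine_init(nand);
	if (ret)
		return ret;

	return 0;
}

static void my_controller_detach(struct nand_device *nand)
{
	/* Releases the engine context acquired in the attach path */
	nanddev_ecc_engine_cleanup(nand);
}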
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20201001102014.20100-2-miquel.raynal@bootlin.com --- drivers/mtd/nand/core.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 4 ++ 2 files changed, 128 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/core.c b/drivers/mtd/nand/core.c index b6de955ac8bf..5e13a03d2b32 100644 --- a/drivers/mtd/nand/core.c +++ b/drivers/mtd/nand/core.c @@ -207,6 +207,130 @@ int nanddev_mtd_max_bad_blocks(struct mtd_info *mtd, loff_t offs, size_t len) } EXPORT_SYMBOL_GPL(nanddev_mtd_max_bad_blocks); +/** + * nanddev_get_ecc_engine() - Find and get a suitable ECC engine + * @nand: NAND device + */ +static int nanddev_get_ecc_engine(struct nand_device *nand) +{ + int engine_type; + + /* Read the user desires in terms of ECC engine/configuration */ + of_get_nand_ecc_user_config(nand); + + engine_type = nand->ecc.user_conf.engine_type; + if (engine_type == NAND_ECC_ENGINE_TYPE_INVALID) + engine_type = nand->ecc.defaults.engine_type; + + switch (engine_type) { + case NAND_ECC_ENGINE_TYPE_NONE: + return 0; + case NAND_ECC_ENGINE_TYPE_SOFT: + nand->ecc.engine = nand_ecc_get_sw_engine(nand); + break; + case NAND_ECC_ENGINE_TYPE_ON_DIE: + nand->ecc.engine = nand_ecc_get_on_die_hw_engine(nand); + break; + case NAND_ECC_ENGINE_TYPE_ON_HOST: + pr_err("On-host hardware ECC engines not supported yet\n"); + break; + default: + pr_err("Missing ECC engine type\n"); + } + + if (!nand->ecc.engine) + return -EINVAL; + + return 0; +} + +/** + * nanddev_put_ecc_engine() - Dettach and put the in-use ECC engine + * @nand: NAND device + */ +static int nanddev_put_ecc_engine(struct nand_device *nand) +{ + switch (nand->ecc.ctx.conf.engine_type) { + case NAND_ECC_ENGINE_TYPE_ON_HOST: + pr_err("On-host hardware ECC engines not supported yet\n"); + break; + case NAND_ECC_ENGINE_TYPE_NONE: + case NAND_ECC_ENGINE_TYPE_SOFT: + case NAND_ECC_ENGINE_TYPE_ON_DIE: + default: + break; + } + + return 0; +} + +/** + * nanddev_find_ecc_configuration() - Find a suitable ECC configuration + * @nand: NAND device + */ +static int nanddev_find_ecc_configuration(struct nand_device *nand) +{ + int ret; + + if (!nand->ecc.engine) + return -ENOTSUPP; + + ret = nand_ecc_init_ctx(nand); + if (ret) + return ret; + + if (!nand_ecc_is_strong_enough(nand)) + pr_warn("WARNING: %s: the ECC used on your system is too weak compared to the one required by the NAND chip\n", + nand->mtd.name); + + return 0; +} + +/** + * nanddev_ecc_engine_init() - Initialize an ECC engine for the chip + * @nand: NAND device + */ +int nanddev_ecc_engine_init(struct nand_device *nand) +{ + int ret; + + /* Look for the ECC engine to use */ + ret = nanddev_get_ecc_engine(nand); + if (ret) { + pr_err("No ECC engine found\n"); + return ret; + } + + /* No ECC engine requested */ + if (!nand->ecc.engine) + return 0; + + /* Configure the engine: balance user input and chip requirements */ + ret = nanddev_find_ecc_configuration(nand); + if (ret) { + pr_err("No suitable ECC configuration\n"); + nanddev_put_ecc_engine(nand); + + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nanddev_ecc_engine_init); + +/** + * nanddev_ecc_engine_cleanup() - Cleanup ECC engine initializations + * @nand: NAND device + */ +void nanddev_ecc_engine_cleanup(struct nand_device *nand) +{ + if (nand->ecc.engine) + nand_ecc_cleanup_ctx(nand); + + nanddev_put_ecc_engine(nand); +} +EXPORT_SYMBOL_GPL(nanddev_ecc_engine_cleanup); + /** * nanddev_init() - Initialize a NAND device * @nand: NAND device 
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 6c6f91c03c42..414f8a4d2853 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -936,6 +936,10 @@ bool nanddev_isreserved(struct nand_device *nand, const struct nand_pos *pos); int nanddev_erase(struct nand_device *nand, const struct nand_pos *pos); int nanddev_markbad(struct nand_device *nand, const struct nand_pos *pos); +/* ECC related functions */ +int nanddev_ecc_engine_init(struct nand_device *nand); +void nanddev_ecc_engine_cleanup(struct nand_device *nand); + /* BBT related functions */ enum nand_bbt_block_status { NAND_BBT_BLOCK_STATUS_UNKNOWN, -- cgit From 7998d89875177a5fac9f963e230dbb828c218cb9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:33:04 +0200 Subject: mtd: rawnand: fix a kernel-doc markup Some identifiers have different names between their prototypes and the kernel-doc markup. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/9ed47a57d12c40e73a9b01612ee119d39baa6236.1603469755.git.mchehab+huawei@kernel.org --- drivers/mtd/nand/raw/nand_bbt.c | 2 +- include/linux/mtd/rawnand.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/nand_bbt.c b/drivers/mtd/nand/raw/nand_bbt.c index 344a24fd2ca8..dced32a126d9 100644 --- a/drivers/mtd/nand/raw/nand_bbt.c +++ b/drivers/mtd/nand/raw/nand_bbt.c @@ -1087,7 +1087,7 @@ static int nand_update_bbt(struct nand_chip *this, loff_t offs) } /** - * mark_bbt_regions - [GENERIC] mark the bad block table regions + * mark_bbt_region - [GENERIC] mark the bad block table regions * @this: the NAND device * @td: bad block table descriptor * diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 0a90a8792d18..6b3240e44310 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1284,7 +1284,8 @@ static inline bool nand_is_slc(struct nand_chip *chip) } /** - * Check if the opcode's address should be sent only on the lower 8 bits + * nand_opcode_8bits - Check if the opcode's address should be sent only on the + * lower 8 bits * @command: opcode to check */ static inline int nand_opcode_8bits(unsigned int command) -- cgit From 0f6b791955a6365b5ebe8b6a5b01de69a47ee92e Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 10 Nov 2020 09:19:08 -0300 Subject: mtd: rawnand: mxc: Remove platform data support i.MX is a devicetree-only platform now and the existing platform data support in this driver was only useful for old non-devicetree platforms. Get rid of the platform data support since it is no longer used. 
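With platform data gone, the per-SoC device-type data is obtained from the OF match table alone. A condensed sketch of the resulting pattern (hypothetical helper; the probe path below relies on the device having been matched through mxcnd_dt_ids, so the NULL check is purely defensive):

static const struct mxc_nand_devtype_data *
my_get_devtype_data(struct device *dev)
{
	const struct of_device_id *of_id = of_match_device(mxcnd_dt_ids, dev);

	/* Cannot trigger for a DT-matched device, but stay defensive */
	return of_id ? of_id->data : NULL;
}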
Signed-off-by: Fabio Estevam Reviewed-by: Sascha Hauer Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20201110121908.19400-1-festevam@gmail.com --- drivers/mtd/nand/raw/Kconfig | 2 +- drivers/mtd/nand/raw/mxc_nand.c | 75 ++---------------------------- include/linux/platform_data/mtd-mxc_nand.h | 19 -------- 3 files changed, 5 insertions(+), 91 deletions(-) delete mode 100644 include/linux/platform_data/mtd-mxc_nand.h (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index cc86d6e4e042..b93e6cd23259 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -292,7 +292,7 @@ config MTD_NAND_VF610_NFC config MTD_NAND_MXC tristate "Freescale MXC NAND controller" depends on ARCH_MXC || COMPILE_TEST - depends on HAS_IOMEM + depends on HAS_IOMEM && OF help This enables the driver for the NAND flash controller on the MXC processors. diff --git a/drivers/mtd/nand/raw/mxc_nand.c b/drivers/mtd/nand/raw/mxc_nand.c index 684c51e5e60d..f896364968d8 100644 --- a/drivers/mtd/nand/raw/mxc_nand.c +++ b/drivers/mtd/nand/raw/mxc_nand.c @@ -21,7 +21,6 @@ #include #include #include -#include #define DRIVER_NAME "mxc_nand" @@ -184,7 +183,6 @@ struct mxc_nand_host { unsigned int buf_start; const struct mxc_nand_devtype_data *devtype_data; - struct mxc_nand_platform_data pdata; }; static const char * const part_probes[] = { @@ -1611,29 +1609,6 @@ static inline int is_imx53_nfc(struct mxc_nand_host *host) return host->devtype_data == &imx53_nand_devtype_data; } -static const struct platform_device_id mxcnd_devtype[] = { - { - .name = "imx21-nand", - .driver_data = (kernel_ulong_t) &imx21_nand_devtype_data, - }, { - .name = "imx27-nand", - .driver_data = (kernel_ulong_t) &imx27_nand_devtype_data, - }, { - .name = "imx25-nand", - .driver_data = (kernel_ulong_t) &imx25_nand_devtype_data, - }, { - .name = "imx51-nand", - .driver_data = (kernel_ulong_t) &imx51_nand_devtype_data, - }, { - .name = "imx53-nand", - .driver_data = (kernel_ulong_t) &imx53_nand_devtype_data, - }, { - /* sentinel */ - } -}; -MODULE_DEVICE_TABLE(platform, mxcnd_devtype); - -#ifdef CONFIG_OF static const struct of_device_id mxcnd_dt_ids[] = { { .compatible = "fsl,imx21-nand", @@ -1655,26 +1630,6 @@ static const struct of_device_id mxcnd_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, mxcnd_dt_ids); -static int mxcnd_probe_dt(struct mxc_nand_host *host) -{ - struct device_node *np = host->dev->of_node; - const struct of_device_id *of_id = - of_match_device(mxcnd_dt_ids, host->dev); - - if (!np) - return 1; - - host->devtype_data = of_id->data; - - return 0; -} -#else -static int mxcnd_probe_dt(struct mxc_nand_host *host) -{ - return 1; -} -#endif - static int mxcnd_attach_chip(struct nand_chip *chip) { struct mtd_info *mtd = nand_to_mtd(chip); @@ -1759,6 +1714,7 @@ static const struct nand_controller_ops mxcnd_controller_ops = { static int mxcnd_probe(struct platform_device *pdev) { + const struct of_device_id *of_id; struct nand_chip *this; struct mtd_info *mtd; struct mxc_nand_host *host; @@ -1800,20 +1756,8 @@ static int mxcnd_probe(struct platform_device *pdev) if (IS_ERR(host->clk)) return PTR_ERR(host->clk); - err = mxcnd_probe_dt(host); - if (err > 0) { - struct mxc_nand_platform_data *pdata = - dev_get_platdata(&pdev->dev); - if (pdata) { - host->pdata = *pdata; - host->devtype_data = (struct mxc_nand_devtype_data *) - pdev->id_entry->driver_data; - } else { - err = -ENODEV; - } - } - if (err < 0) - return err; + of_id = of_match_device(mxcnd_dt_ids, 
host->dev); + host->devtype_data = of_id->data; if (!host->devtype_data->setup_interface) this->options |= NAND_KEEP_TIMINGS; @@ -1843,14 +1787,6 @@ static int mxcnd_probe(struct platform_device *pdev) this->legacy.select_chip = host->devtype_data->select_chip; - /* NAND bus width determines access functions used by upper layer */ - if (host->pdata.width == 2) - this->options |= NAND_BUSWIDTH_16; - - /* update flash based bbt */ - if (host->pdata.flash_bbt) - this->bbt_options |= NAND_BBT_USE_FLASH; - init_completion(&host->op_completion); host->irq = platform_get_irq(pdev, 0); @@ -1891,9 +1827,7 @@ static int mxcnd_probe(struct platform_device *pdev) goto escan; /* Register the partitions */ - err = mtd_device_parse_register(mtd, part_probes, NULL, - host->pdata.parts, - host->pdata.nr_parts); + err = mtd_device_parse_register(mtd, part_probes, NULL, NULL, 0); if (err) goto cleanup_nand; @@ -1930,7 +1864,6 @@ static struct platform_driver mxcnd_driver = { .name = DRIVER_NAME, .of_match_table = of_match_ptr(mxcnd_dt_ids), }, - .id_table = mxcnd_devtype, .probe = mxcnd_probe, .remove = mxcnd_remove, }; diff --git a/include/linux/platform_data/mtd-mxc_nand.h b/include/linux/platform_data/mtd-mxc_nand.h deleted file mode 100644 index d1230030c6db..000000000000 --- a/include/linux/platform_data/mtd-mxc_nand.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved. - * Copyright 2008 Sascha Hauer, kernel@pengutronix.de - */ - -#ifndef __ASM_ARCH_NAND_H -#define __ASM_ARCH_NAND_H - -#include - -struct mxc_nand_platform_data { - unsigned int width; /* data bus width in bytes */ - unsigned int hw_ecc:1; /* 0 if suppress hardware ECC */ - unsigned int flash_bbt:1; /* set to 1 to use a flash based bbt */ - struct mtd_partition *parts; /* partition table */ - int nr_parts; /* size of parts */ -}; -#endif /* __ASM_ARCH_NAND_H */ -- cgit From bdfae1c9a913930eae5ea506733aa7c285e12a06 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 9 Dec 2020 09:44:44 +0800 Subject: vfio/type1: Add vfio_group_iommu_domain() Add the API for getting the domain from a vfio group. This could be used by the physical device drivers which rely on the vfio/mdev framework for mediated device user level access. The typical use case like below: unsigned int pasid; struct vfio_group *vfio_group; struct iommu_domain *iommu_domain; struct device *dev = mdev_dev(mdev); struct device *iommu_device = mdev_get_iommu_device(dev); if (!iommu_device || !iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) return -EINVAL; vfio_group = vfio_group_get_external_user_from_dev(dev); if (IS_ERR_OR_NULL(vfio_group)) return -EFAULT; iommu_domain = vfio_group_iommu_domain(vfio_group); if (IS_ERR_OR_NULL(iommu_domain)) { vfio_group_put_external_user(vfio_group); return -EFAULT; } pasid = iommu_aux_get_pasid(iommu_domain, iommu_device); if (pasid < 0) { vfio_group_put_external_user(vfio_group); return -EFAULT; } /* Program device context with pasid value. */ ... 
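Note that once the device context has been programmed and the aux domain/PASID pairing is torn down again, the group reference taken above must be dropped as well, mirroring the error paths of the sketch:

	/* When done with the aux domain/PASID */
	vfio_group_put_external_user(vfio_group);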
Signed-off-by: Lu Baolu Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 18 ++++++++++++++++++ drivers/vfio/vfio_iommu_type1.c | 24 ++++++++++++++++++++++++ include/linux/vfio.h | 4 ++++ 3 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 2151bc7f87ab..4ad8a35667a7 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -2331,6 +2331,24 @@ int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type, } EXPORT_SYMBOL(vfio_unregister_notifier); +struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + + if (!group) + return ERR_PTR(-EINVAL); + + container = group->container; + driver = container->iommu_driver; + if (likely(driver && driver->ops->group_iommu_domain)) + return driver->ops->group_iommu_domain(container->iommu_data, + group->iommu_group); + + return ERR_PTR(-ENOTTY); +} +EXPORT_SYMBOL_GPL(vfio_group_iommu_domain); + /** * Module/class support */ diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 67e827638995..0b4dedaa9128 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2980,6 +2980,29 @@ static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova, return ret; } +static struct iommu_domain * +vfio_iommu_type1_group_iommu_domain(void *iommu_data, + struct iommu_group *iommu_group) +{ + struct iommu_domain *domain = ERR_PTR(-ENODEV); + struct vfio_iommu *iommu = iommu_data; + struct vfio_domain *d; + + if (!iommu || !iommu_group) + return ERR_PTR(-EINVAL); + + mutex_lock(&iommu->lock); + list_for_each_entry(d, &iommu->domain_list, next) { + if (find_iommu_group(d, iommu_group)) { + domain = d->domain; + break; + } + } + mutex_unlock(&iommu->lock); + + return domain; +} + static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .name = "vfio-iommu-type1", .owner = THIS_MODULE, @@ -2993,6 +3016,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .register_notifier = vfio_iommu_type1_register_notifier, .unregister_notifier = vfio_iommu_type1_unregister_notifier, .dma_rw = vfio_iommu_type1_dma_rw, + .group_iommu_domain = vfio_iommu_type1_group_iommu_domain, }; static int __init vfio_iommu_type1_init(void) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 38d3c6a8dc7e..f45940b38a02 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -90,6 +90,8 @@ struct vfio_iommu_driver_ops { struct notifier_block *nb); int (*dma_rw)(void *iommu_data, dma_addr_t user_iova, void *data, size_t count, bool write); + struct iommu_domain *(*group_iommu_domain)(void *iommu_data, + struct iommu_group *group); }; extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); @@ -126,6 +128,8 @@ extern int vfio_group_unpin_pages(struct vfio_group *group, extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, void *data, size_t len, bool write); +extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group); + /* each type has independent events */ enum vfio_notify_type { VFIO_IOMMU_NOTIFY = 0, -- cgit From 88149082bb8ef31b289673669e080ec6a00c2e59 Mon Sep 17 00:00:00 2001 From: Hao Li Date: Tue, 8 Dec 2020 10:08:43 +0800 Subject: fs: Handle I_DONTCACHE in iput_final() instead of generic_drop_inode() If generic_drop_inode() returns true, it means iput_final() can evict this inode regardless of whether it is dirty or not. 
If we check I_DONTCACHE in generic_drop_inode(), any inode with this bit set will be evicted unconditionally. This is not the desired behavior because I_DONTCACHE only means the inode shouldn't be cached on the LRU list. As for whether we need to evict this inode, this is what generic_drop_inode() should do. This patch corrects the usage of I_DONTCACHE. This patch was proposed in [1]. [1]: https://lore.kernel.org/linux-fsdevel/20200831003407.GE12096@dread.disaster.area/ Fixes: dae2f8ed7992 ("fs: Lift XFS_IDONTCACHE to the VFS layer") Signed-off-by: Hao Li Reviewed-by: Dave Chinner Reviewed-by: Ira Weiny Signed-off-by: Al Viro --- fs/inode.c | 4 +++- include/linux/fs.h | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index 9d78c37b00b8..5eea9912a0b9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1627,7 +1627,9 @@ static void iput_final(struct inode *inode) else drop = generic_drop_inode(inode); - if (!drop && (sb->s_flags & SB_ACTIVE)) { + if (!drop && + !(inode->i_state & I_DONTCACHE) && + (sb->s_flags & SB_ACTIVE)) { inode_add_lru(inode); spin_unlock(&inode->i_lock); return; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8667d0cdc71e..8bde32cf9711 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2878,8 +2878,7 @@ extern int inode_needs_sync(struct inode *inode); extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { - return !inode->i_nlink || inode_unhashed(inode) || - (inode->i_state & I_DONTCACHE); + return !inode->i_nlink || inode_unhashed(inode); } extern void d_mark_dontcache(struct inode *inode); -- cgit From c9e6189fb03123a7dfb93589280347b46f30b161 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:18 +0100 Subject: ntp: Make the RTC synchronization more reliable Miroslav reported that the periodic RTC synchronization in the NTP code fails more often than not to hit the specified update window. The reason is that the code uses delayed_work to schedule the update which needs to be in thread context as the underlying RTC might be connected via a slow bus, e.g. I2C. In the update function it verifies whether the current time is correct vs. the requirements of the underlying RTC. But delayed_work is using the timer wheel for scheduling which is inaccurate by design. Depending on the distance to the expiry the wheel gets less granular to allow batching and to avoid the cascading of the original timer wheel. See 500462a9de65 ("timers: Switch to a non-cascading wheel") and the code for further details. The code already deals with this by splitting the 660 seconds period into a long 659 seconds timer and then retrying with a smaller delta. But looking at the actual granularities of the timer wheel (which depend on the HZ configuration) the 659 seconds timer ends up in an outer wheel level and is affected by a worst case granularity of: HZ Granularity 1000 32s 250 16s 100 40s So the initial timer can be already off by max 12.5% which is not a big issue as the period of the sync is defined as ~11 minutes. The fine grained second attempt schedules to the desired update point with a timer expiring less than a second from now. Depending on the actual delta and the HZ setting even the second attempt can end up in outer wheel levels which have a large enough granularity to make the correctness check fail. 
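The 12.5% figure quoted above follows from the wheel geometry (assuming the post-conversion wheel with 64 buckets per level and an 8x coarser granularity on each successive level): a timer whose expiry distance just exceeds the 64 * 8^(n-1) ticks covered by level n-1 is queued on level n with a bucket granularity of 8^n ticks, so the worst case slack is

  8^n / (64 * 8^(n-1)) = 8 / 64 = 12.5%

of the requested timeout, independent of the level.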
As this is a fundamental property of the timer wheel there is no way to make this more accurate short of iterating in one jiffies steps towards the update point. Switch it to an hrtimer instead which schedules the actual update work. The hrtimer will expire precisely (max 1 jiffie delay when high resolution timers are not available). The actual scheduling delay of the work is the same as before. The update is triggered from do_adjtimex() which is a bit racy but not much more racy than it was before: if (ntp_synced()) queue_delayed_work(system_power_efficient_wq, &sync_work, 0); which is racy when the work is currently executed and has not managed to reschedule itself. This becomes now: if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer)) queue_work(system_power_efficient_wq, &sync_work, 0); which is racy when the hrtimer has expired and the work is currently executed and has not yet managed to rearm the hrtimer. Not a big problem as it just schedules work for nothing. The new implementation has a safe guard in place to catch the case where the hrtimer is queued on entry to the work function and avoids an extra update attempt of the RTC that way. Reported-by: Miroslav Lichvar Signed-off-by: Thomas Gleixner Tested-by: Miroslav Lichvar Reviewed-by: Jason Gunthorpe Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.062910520@linutronix.de --- include/linux/timex.h | 1 - kernel/time/ntp.c | 90 ++++++++++++++++++++++++---------------------- kernel/time/ntp_internal.h | 7 ++++ 3 files changed, 55 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index ce0859763670..9c2e54faf9b7 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -157,7 +157,6 @@ extern int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * extern void hardpps(const struct timespec64 *, const struct timespec64 *); int read_current_timer(unsigned long *timer_val); -void ntp_notify_cmos_timer(void); /* The clock frequency of the i8253/i8254 PIT */ #define PIT_TICK_RATE 1193182ul diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 069ca78fb0bf..ff1a7b8ec4ef 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -494,65 +494,55 @@ out: return leap; } +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_hw_clock(struct work_struct *work); -static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock); - -static void sched_sync_hw_clock(struct timespec64 now, - unsigned long target_nsec, bool fail) +static DECLARE_WORK(sync_work, sync_hw_clock); +static struct hrtimer sync_hrtimer; +#define SYNC_PERIOD_NS (11UL * 60 * NSEC_PER_SEC) +static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer) { - struct timespec64 next; + queue_work(system_power_efficient_wq, &sync_work); - ktime_get_real_ts64(&next); - if (!fail) - next.tv_sec = 659; - else { - /* - * Try again as soon as possible. Delaying long periods - * decreases the accuracy of the work queue timer. Due to this - * the algorithm is very likely to require a short-sleep retry - * after the above long sleep to synchronize ts_nsec. 
- */ - next.tv_sec = 0; - } + return HRTIMER_NORESTART; +} - /* Compute the needed delay that will get to tv_nsec == target_nsec */ - next.tv_nsec = target_nsec - next.tv_nsec; - if (next.tv_nsec <= 0) - next.tv_nsec += NSEC_PER_SEC; - if (next.tv_nsec >= NSEC_PER_SEC) { - next.tv_sec++; - next.tv_nsec -= NSEC_PER_SEC; - } +static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) +{ + ktime_t exp = ktime_set(ktime_get_real_seconds(), 0); + + if (retry) + exp = ktime_add_ns(exp, 2 * NSEC_PER_SEC - offset_nsec); + else + exp = ktime_add_ns(exp, SYNC_PERIOD_NS - offset_nsec); - queue_delayed_work(system_power_efficient_wq, &sync_work, - timespec64_to_jiffies(&next)); + hrtimer_start(&sync_hrtimer, exp, HRTIMER_MODE_ABS); } static void sync_rtc_clock(void) { - unsigned long target_nsec; - struct timespec64 adjust, now; + unsigned long offset_nsec; + struct timespec64 adjust; int rc; if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) return; - ktime_get_real_ts64(&now); + ktime_get_real_ts64(&adjust); - adjust = now; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); /* - * The current RTC in use will provide the target_nsec it wants to be - * called at, and does rtc_tv_nsec_ok internally. + * The current RTC in use will provide the nanoseconds offset prior + * to a full second it wants to be called at, and invokes + * rtc_tv_nsec_ok() internally. */ - rc = rtc_set_ntp_time(adjust, &target_nsec); + rc = rtc_set_ntp_time(adjust, &offset_nsec); if (rc == -ENODEV) return; - sched_sync_hw_clock(now, target_nsec, rc); + sched_sync_hw_clock(offset_nsec, rc != 0); } #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -599,7 +589,7 @@ static bool sync_cmos_clock(void) } } - sched_sync_hw_clock(now, target_nsec, rc); + sched_sync_hw_clock(target_nsec, rc != 0); return true; } @@ -613,7 +603,12 @@ static bool sync_cmos_clock(void) */ static void sync_hw_clock(struct work_struct *work) { - if (!ntp_synced()) + /* + * Don't update if STA_UNSYNC is set and if ntp_notify_cmos_timer() + * managed to schedule the work between the timer firing and the + * work being able to rearm the timer. Wait for the timer to expire. + */ + if (!ntp_synced() || hrtimer_is_queued(&sync_hrtimer)) return; if (sync_cmos_clock()) @@ -624,13 +619,23 @@ static void sync_hw_clock(struct work_struct *work) void ntp_notify_cmos_timer(void) { - if (!ntp_synced()) - return; + /* + * When the work is currently executed but has not yet the timer + * rearmed this queues the work immediately again. No big issue, + * just a pointless work scheduled. 
+ */ + if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer)) + queue_work(system_power_efficient_wq, &sync_work); +} - if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) || - IS_ENABLED(CONFIG_RTC_SYSTOHC)) - queue_delayed_work(system_power_efficient_wq, &sync_work, 0); +static void __init ntp_init_cmos_sync(void) +{ + hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + sync_hrtimer.function = sync_timer_callback; } +#else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ +static inline void __init ntp_init_cmos_sync(void) { } +#endif /* !CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ /* * Propagate a new txc->status value into the NTP state: @@ -1044,4 +1049,5 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); + ntp_init_cmos_sync(); } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 908ecaa65fc3..23d1b74c3065 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -12,4 +12,11 @@ extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); + +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) +extern void ntp_notify_cmos_timer(void); +#else +static inline void ntp_notify_cmos_timer(void) { } +#endif + #endif /* _LINUX_NTP_INTERNAL_H */ -- cgit From 33e62e832384c8cb523044e0e9d99d7133f98e93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:19 +0100 Subject: ntp, rtc: Move rtc_set_ntp_time() to ntp code rtc_set_ntp_time() is not really RTC functionality as the code is just a user of RTC. Move it into the NTP code which allows further cleanups. Requested-by: Alexandre Belloni Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.166871172@linutronix.de --- drivers/rtc/Makefile | 1 - drivers/rtc/systohc.c | 61 ----------------------------------- include/linux/rtc.h | 34 -------------------- kernel/time/ntp.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 85 insertions(+), 99 deletions(-) delete mode 100644 drivers/rtc/systohc.c (limited to 'include/linux') diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index bfb57464118d..bb8f319b09fb 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -6,7 +6,6 @@ ccflags-$(CONFIG_RTC_DEBUG) := -DDEBUG obj-$(CONFIG_RTC_LIB) += lib.o -obj-$(CONFIG_RTC_SYSTOHC) += systohc.o obj-$(CONFIG_RTC_CLASS) += rtc-core.o obj-$(CONFIG_RTC_MC146818_LIB) += rtc-mc146818-lib.o rtc-core-y := class.o interface.o diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c deleted file mode 100644 index 8b70f0520e13..000000000000 --- a/drivers/rtc/systohc.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include - -/** - * rtc_set_ntp_time - Save NTP synchronized time to the RTC - * @now: Current time of day - * @target_nsec: pointer for desired now->tv_nsec value - * - * Replacement for the NTP platform function update_persistent_clock64 - * that stores time for later retrieval by rtc_hctosys. - * - * Returns 0 on successful RTC update, -ENODEV if a RTC update is not - * possible at all, and various other -errno for specific temporary failure - * cases. - * - * -EPROTO is returned if now.tv_nsec is not close enough to *target_nsec. 
- * - * If temporary failure is indicated the caller should try again 'soon' - */ -int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) -{ - struct rtc_device *rtc; - struct rtc_time tm; - struct timespec64 to_set; - int err = -ENODEV; - bool ok; - - rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); - if (!rtc) - goto out_err; - - if (!rtc->ops || !rtc->ops->set_time) - goto out_close; - - /* Compute the value of tv_nsec we require the caller to supply in - * now.tv_nsec. This is the value such that (now + - * set_offset_nsec).tv_nsec == 0. - */ - set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); - *target_nsec = to_set.tv_nsec; - - /* The ntp code must call this with the correct value in tv_nsec, if - * it does not we update target_nsec and return EPROTO to make the ntp - * code try again later. - */ - ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); - if (!ok) { - err = -EPROTO; - goto out_close; - } - - rtc_time64_to_tm(to_set.tv_sec, &tm); - - err = rtc_set_time(rtc, &tm); - -out_close: - rtc_class_close(rtc); -out_err: - return err; -} diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 22d1575e4991..ff62680b48ca 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -165,7 +165,6 @@ int __rtc_register_device(struct module *owner, struct rtc_device *rtc); extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); -extern int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); @@ -205,39 +204,6 @@ static inline bool is_leap_year(unsigned int year) return (!(year % 4) && (year % 100)) || !(year % 400); } -/* Determine if we can call to driver to set the time. Drivers can only be - * called to set a second aligned time value, and the field set_offset_nsec - * specifies how far away from the second aligned time to call the driver. - * - * This also computes 'to_set' which is the time we are trying to set, and has - * a zero in tv_nsecs, such that: - * to_set - set_delay_nsec == now +/- FUZZ - * - */ -static inline bool rtc_tv_nsec_ok(s64 set_offset_nsec, - struct timespec64 *to_set, - const struct timespec64 *now) -{ - /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */ - const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; - struct timespec64 delay = {.tv_sec = 0, - .tv_nsec = set_offset_nsec}; - - *to_set = timespec64_add(*now, delay); - - if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) { - to_set->tv_nsec = 0; - return true; - } - - if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) { - to_set->tv_sec++; - to_set->tv_nsec = 0; - return true; - } - return false; -} - #define rtc_register_device(device) \ __rtc_register_device(THIS_MODULE, device) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ff1a7b8ec4ef..84a554622cee 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -519,15 +519,94 @@ static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) hrtimer_start(&sync_hrtimer, exp, HRTIMER_MODE_ABS); } +/* + * Determine if we can call to driver to set the time. Drivers can only be + * called to set a second aligned time value, and the field set_offset_nsec + * specifies how far away from the second aligned time to call the driver. 
+ * + * This also computes 'to_set' which is the time we are trying to set, and has + * a zero in tv_nsecs, such that: + * to_set - set_delay_nsec == now +/- FUZZ + * + */ +static inline bool rtc_tv_nsec_ok(long set_offset_nsec, + struct timespec64 *to_set, + const struct timespec64 *now) +{ + /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */ + const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; + struct timespec64 delay = {.tv_sec = 0, + .tv_nsec = set_offset_nsec}; + + *to_set = timespec64_add(*now, delay); + + if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) { + to_set->tv_nsec = 0; + return true; + } + + if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) { + to_set->tv_sec++; + to_set->tv_nsec = 0; + return true; + } + return false; +} + +#ifdef CONFIG_RTC_SYSTOHC +/* + * rtc_set_ntp_time - Save NTP synchronized time to the RTC + */ +static int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) +{ + struct rtc_device *rtc; + struct rtc_time tm; + struct timespec64 to_set; + int err = -ENODEV; + bool ok; + + rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); + if (!rtc) + goto out_err; + + if (!rtc->ops || !rtc->ops->set_time) + goto out_close; + + /* + * Compute the value of tv_nsec we require the caller to supply in + * now.tv_nsec. This is the value such that (now + + * set_offset_nsec).tv_nsec == 0. + */ + set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); + *target_nsec = to_set.tv_nsec; + + /* + * The ntp code must call this with the correct value in tv_nsec, if + * it does not we update target_nsec and return EPROTO to make the ntp + * code try again later. + */ + ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); + if (!ok) { + err = -EPROTO; + goto out_close; + } + + rtc_time64_to_tm(to_set.tv_sec, &tm); + + err = rtc_set_time(rtc, &tm); + +out_close: + rtc_class_close(rtc); +out_err: + return err; +} + static void sync_rtc_clock(void) { unsigned long offset_nsec; struct timespec64 adjust; int rc; - if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) - return; - ktime_get_real_ts64(&adjust); if (persistent_clock_is_local) @@ -544,6 +623,9 @@ static void sync_rtc_clock(void) sched_sync_hw_clock(offset_nsec, rc != 0); } +#else +static inline void sync_rtc_clock(void) { } +#endif #ifdef CONFIG_GENERIC_CMOS_UPDATE int __weak update_persistent_clock64(struct timespec64 now64) -- cgit From 69eca258c85000564577642ba28335eb4e1df8f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:20 +0100 Subject: ntp: Make the RTC sync offset less obscure The current RTC set_offset_nsec value is not really intuitive to understand. tsched twrite(t2.tv_sec - 1) t2 (seconds increment) The offset is calculated from twrite based on the assumption that t2 - twrite == 1s. That means for the MC146818 RTC the offset needs to be negative so that the write happens 500ms before t2. It's easier to understand when the whole calculation is based on t2. That avoids negative offsets and the meaning is obvious: t2 - twrite: The time defined by the chip when seconds increment after the write. twrite - tsched: The time for the transport to the point where the chip is updated. ==> set_offset_nsec = t2 - tsched ttransport = twrite - tsched tRTCinc = t2 - twrite ==> set_offset_nsec = ttransport + tRTCinc tRTCinc is a chip property and can be obtained from the data sheet. ttransport depends on how the RTC is connected. It is close to 0 for directly accessible RTCs. For RTCs behind a slow bus, e.g. i2c, it's the time required to send the update over the bus. 
This can be estimated or even calibrated, but that's a different problem. Adjust the implementation and update comments accordingly. Signed-off-by: Thomas Gleixner Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.263204937@linutronix.de --- drivers/rtc/class.c | 9 +++++++-- drivers/rtc/rtc-cmos.c | 2 +- include/linux/rtc.h | 35 +++++++++++++++++++++++++++++------ kernel/time/ntp.c | 47 ++++++++++++++++++++++++----------------------- 4 files changed, 61 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index d7957376eb96..5855aa2eef62 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -200,8 +200,13 @@ static struct rtc_device *rtc_allocate_device(void) device_initialize(&rtc->dev); - /* Drivers can revise this default after allocating the device. */ - rtc->set_offset_nsec = 5 * NSEC_PER_MSEC; + /* + * Drivers can revise this default after allocating the device. + * The default is what most RTCs do: Increment seconds exactly one + * second after the write happened. This adds a default transport + * time of 5ms which is at least halfways close to reality. + */ + rtc->set_offset_nsec = NSEC_PER_SEC + 5 * NSEC_PER_MSEC; rtc->irq_freq = 1; rtc->max_user_freq = 64; diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 7728faca01b4..c5bcd2adc9fe 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -869,7 +869,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) goto cleanup2; /* Set the sync offset for the periodic 11min update correct */ - cmos_rtc.rtc->set_offset_nsec = -(NSEC_PER_SEC / 2); + cmos_rtc.rtc->set_offset_nsec = NSEC_PER_SEC / 2; /* export at least the first block of NVRAM */ nvmem_cfg.size = address_space - NVRAM_OFFSET; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index ff62680b48ca..b829382de6c3 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -110,13 +110,36 @@ struct rtc_device { /* Some hardware can't support UIE mode */ int uie_unsupported; - /* Number of nsec it takes to set the RTC clock. This influences when - * the set ops are called. An offset: - * - of 0.5 s will call RTC set for wall clock time 10.0 s at 9.5 s - * - of 1.5 s will call RTC set for wall clock time 10.0 s at 8.5 s - * - of -0.5 s will call RTC set for wall clock time 10.0 s at 10.5 s + /* + * This offset specifies the update timing of the RTC. + * + * tsched t1 write(t2.tv_sec - 1sec)) t2 RTC increments seconds + * + * The offset defines how tsched is computed so that the write to + * the RTC (t2.tv_sec - 1sec) is correct versus the time required + * for the transport of the write and the time which the RTC needs + * to increment seconds the first time after the write (t2). + * + * For direct accessible RTCs tsched ~= t1 because the write time + * is negligible. For RTCs behind slow busses the transport time is + * significant and has to be taken into account. + * + * The time between the write (t1) and the first increment after + * the write (t2) is RTC specific. For a MC146818 RTC it's 500ms, + * for many others it's exactly 1 second. Consult the datasheet. + * + * The value of this offset is also used to calculate the to be + * written value (t2.tv_sec - 1sec) at tsched. + * + * The default value for this is NSEC_PER_SEC + 10 msec default + * transport time. 
The offset can be adjusted by drivers so the + * calculation for the to be written value at tsched becomes + * correct: + * + * newval = tsched + set_offset_nsec - NSEC_PER_SEC + * and (tsched + set_offset_nsec) % NSEC_PER_SEC == 0 */ - long set_offset_nsec; + unsigned long set_offset_nsec; bool registered; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 84a554622cee..a34ac069335f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -520,22 +520,33 @@ static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) } /* - * Determine if we can call to driver to set the time. Drivers can only be - * called to set a second aligned time value, and the field set_offset_nsec - * specifies how far away from the second aligned time to call the driver. + * Check whether @now is correct versus the required time to update the RTC + * and calculate the value which needs to be written to the RTC so that the + * next seconds increment of the RTC after the write is aligned with the next + * seconds increment of clock REALTIME. * - * This also computes 'to_set' which is the time we are trying to set, and has - * a zero in tv_nsecs, such that: - * to_set - set_delay_nsec == now +/- FUZZ + * tsched t1 write(t2.tv_sec - 1sec)) t2 RTC increments seconds * + * t2.tv_nsec == 0 + * tsched = t2 - set_offset_nsec + * newval = t2 - NSEC_PER_SEC + * + * ==> neval = tsched + set_offset_nsec - NSEC_PER_SEC + * + * As the execution of this code is not guaranteed to happen exactly at + * tsched this allows it to happen within a fuzzy region: + * + * abs(now - tsched) < FUZZ + * + * If @now is not inside the allowed window the function returns false. */ -static inline bool rtc_tv_nsec_ok(long set_offset_nsec, +static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec, struct timespec64 *to_set, const struct timespec64 *now) { /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */ const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; - struct timespec64 delay = {.tv_sec = 0, + struct timespec64 delay = {.tv_sec = -1, .tv_nsec = set_offset_nsec}; *to_set = timespec64_add(*now, delay); @@ -557,11 +568,11 @@ static inline bool rtc_tv_nsec_ok(long set_offset_nsec, /* * rtc_set_ntp_time - Save NTP synchronized time to the RTC */ -static int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) +static int rtc_set_ntp_time(struct timespec64 now, unsigned long *offset_nsec) { + struct timespec64 to_set; struct rtc_device *rtc; struct rtc_time tm; - struct timespec64 to_set; int err = -ENODEV; bool ok; @@ -572,19 +583,9 @@ static int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) if (!rtc->ops || !rtc->ops->set_time) goto out_close; - /* - * Compute the value of tv_nsec we require the caller to supply in - * now.tv_nsec. This is the value such that (now + - * set_offset_nsec).tv_nsec == 0. - */ - set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); - *target_nsec = to_set.tv_nsec; + /* Store the update offset for this RTC */ + *offset_nsec = rtc->set_offset_nsec; - /* - * The ntp code must call this with the correct value in tv_nsec, if - * it does not we update target_nsec and return EPROTO to make the ntp - * code try again later. - */ ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); if (!ok) { err = -EPROTO; @@ -657,7 +658,7 @@ static bool sync_cmos_clock(void) * implement this legacy API. 
*/ ktime_get_real_ts64(&now); - if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) { + if (rtc_tv_nsec_ok(target_nsec, &adjust, &now)) { if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); rc = update_persistent_clock64(adjust); -- cgit From fe79d5de77204dd946cfad76a9bec23354b1a500 Mon Sep 17 00:00:00 2001 From: Kyle Tso Date: Thu, 10 Dec 2020 17:05:20 +0100 Subject: USB: typec: tcpm: Add a 30ms room for tPSSourceOn in PR_SWAP TCPM state machine needs 20-25ms to enter the ErrorRecovery state after tPSSourceOn timer timeouts. Change the timer from max 480ms to 450ms to ensure that the timer complies with the Spec. In order to keep the flexibility for other usecases using tPSSourceOn, add another timer only for PR_SWAP. Cc: Guenter Roeck Cc: Heikki Krogerus Cc: Badhri Jagan Sridharan Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Kyle Tso Signed-off-by: Will McVicker Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20201210160521.3417426-5-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 2 +- include/linux/usb/pd.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 3f9edc8a9a57..36b6ce85093f 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -3738,7 +3738,7 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_state(port, ERROR_RECOVERY, 0); break; } - tcpm_set_state(port, ERROR_RECOVERY, PD_T_PS_SOURCE_ON); + tcpm_set_state(port, ERROR_RECOVERY, PD_T_PS_SOURCE_ON_PRS); break; case PR_SWAP_SRC_SNK_SINK_ON: /* Set the vbus disconnect threshold for implicit contract */ diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 63a66dd5d832..bb9a782e1411 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -466,6 +466,7 @@ static inline unsigned int rdo_max_power(u32 rdo) #define PD_T_DRP_SRC 30 #define PD_T_PS_SOURCE_OFF 920 #define PD_T_PS_SOURCE_ON 480 +#define PD_T_PS_SOURCE_ON_PRS 450 /* 390 - 480ms */ #define PD_T_PS_HARD_RESET 30 #define PD_T_SRC_RECOVER 760 #define PD_T_SRC_RECOVER_MAX 1000 -- cgit From fecc4559780d52d174ea05e3bf543669165389c3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 2 Dec 2020 14:07:09 +0200 Subject: fsnotify: fix events reported to watching parent and child fsnotify_parent() used to send two separate events to backends when a parent inode is watching children and the child inode is also watching. In an attempt to avoid duplicate events in fanotify, we unified the two backend callbacks to a single callback and handled the reporting of the two separate events for the relevant backends (inotify and dnotify). However the handling is buggy and can result in inotify and dnotify listeners receiving events of the type they never asked for or spurious events. The problem is the unified event callback with two inode marks (parent and child) is called when any of the parent and child inodes are watched and interested in the event, but the parent inode's mark that is interested in the event on the child is not necessarily the one we are currently reporting to (it could belong to a different group). So before reporting the parent or child event flavor to backend we need to check that the mark is really interested in that event flavor. The semantics of INODE and CHILD marks were hard to follow and made the logic more complicated than it should have been. 
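To make the required interest check concrete before the patch itself, a toy userspace model may help (a sketch, not kernel code: the FS_* values mirror the kernel's definitions, while ALL_EVENT_TYPES is a simplified stand-in for ALL_FSNOTIFY_EVENTS, and mark_wants() is an invented helper). An event flavor is delivered to a mark only if that mark's event-type bits actually cover it, and the parent flavor additionally requires FS_EVENT_ON_CHILD in the parent mark, as the patch below implements:

#include <stdio.h>
#include <stdbool.h>

#define FS_MODIFY          0x00000002u
#define FS_ATTRIB          0x00000004u
#define FS_EVENT_ON_CHILD  0x08000000u
#define ALL_EVENT_TYPES    0x00ffffffu  /* toy stand-in for ALL_FSNOTIFY_EVENTS */

/* Check interest of this mark: flag bits alone must not count as interest */
static bool mark_wants(unsigned int mark_mask, unsigned int event_mask)
{
        return (event_mask & mark_mask & ALL_EVENT_TYPES) != 0;
}

int main(void)
{
        unsigned int event = FS_ATTRIB | FS_EVENT_ON_CHILD;
        unsigned int parent_mark = FS_MODIFY | FS_EVENT_ON_CHILD; /* watches children, not ATTRIB */
        unsigned int child_mark = FS_ATTRIB;

        /* Parent flavor: needs both child-watching and event-type interest */
        bool to_parent = (parent_mark & FS_EVENT_ON_CHILD) &&
                         mark_wants(parent_mark, event);
        /* Child flavor: delivered without name and without FS_EVENT_ON_CHILD */
        bool to_child = mark_wants(child_mark, event & ~FS_EVENT_ON_CHILD);

        printf("parent: %d, child: %d\n", to_parent, to_child);
        return 0;
}

Run, this prints "parent: 0, child: 1": the parent watches children but not FS_ATTRIB, so only the child flavor is reported, which is exactly the case the unified callback previously got wrong.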
Replace it with INODE and PARENT marks semantics to hopefully make the logic more clear. Thanks to Hugh Dickins for spotting a bug in the earlier version of this patch. Fixes: 497b0c5a7c06 ("fsnotify: send event to parent and child with single callback") CC: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201202120713.702387-4-amir73il@gmail.com Reported-by: Hugh Dickins Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 7 ++-- fs/notify/fsnotify.c | 84 ++++++++++++++++++++++++---------------- include/linux/fsnotify_backend.h | 6 +-- 3 files changed, 57 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 9167884a61ec..1192c9953620 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -268,12 +268,11 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, continue; /* - * If the event is for a child and this mark is on a parent not + * If the event is on a child and this mark is on a parent not * watching children, don't send it! */ - if (event_mask & FS_EVENT_ON_CHILD && - type == FSNOTIFY_OBJ_TYPE_INODE && - !(mark->mask & FS_EVENT_ON_CHILD)) + if (type == FSNOTIFY_OBJ_TYPE_PARENT && + !(mark->mask & FS_EVENT_ON_CHILD)) continue; marks_mask |= mark->mask; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index c5c68bcbaadf..30d422b8c0fc 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -152,6 +152,13 @@ static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt, if (mask & FS_ISDIR) return false; + /* + * All events that are possible on child can also may be reported with + * parent/name info to inode/sb/mount. Otherwise, a watching parent + * could result in events reported with unexpected name info to sb/mount. + */ + BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); + /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask); marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask); @@ -249,6 +256,10 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group, path && d_unlinked(path->dentry)) return 0; + /* Check interest of this mark in case event was sent with two marks */ + if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) + return 0; + return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); } @@ -258,38 +269,46 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); - struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info); + struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; - /* - * An event can be sent on child mark iterator instead of inode mark - * iterator because of other groups that have interest of this inode - * and have marks on both parent and child. We can simplify this case. - */ - if (!inode_mark) { - inode_mark = child_mark; - child_mark = NULL; + if (parent_mark) { + /* + * parent_mark indicates that the parent inode is watching + * children and interested in this event, which is an event + * possible on child. But is *this mark* watching children and + * interested in this event? 
+ */ + if (parent_mark->mask & FS_EVENT_ON_CHILD) { + ret = fsnotify_handle_inode_event(group, parent_mark, mask, + data, data_type, dir, name, 0); + if (ret) + return ret; + } + if (!inode_mark) + return 0; + } + + if (mask & FS_EVENT_ON_CHILD) { + /* + * Some events can be sent on both parent dir and child marks + * (e.g. FS_ATTRIB). If both parent dir and child are + * watching, report the event once to parent dir with name (if + * interested) and once to child without name (if interested). + * The child watcher is expecting an event without a file name + * and without the FS_EVENT_ON_CHILD flag. + */ + mask &= ~FS_EVENT_ON_CHILD; dir = NULL; name = NULL; } - ret = fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, - dir, name, cookie); - if (ret || !child_mark) - return ret; - - /* - * Some events can be sent on both parent dir and child marks - * (e.g. FS_ATTRIB). If both parent dir and child are watching, - * report the event once to parent dir with name and once to child - * without name. - */ - return fsnotify_handle_inode_event(group, child_mark, mask, data, data_type, - NULL, NULL, 0); + return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, + dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, @@ -447,7 +466,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, struct fsnotify_iter_info iter_info = {}; struct super_block *sb; struct mount *mnt = NULL; - struct inode *child = NULL; + struct inode *parent = NULL; int ret = 0; __u32 test_mask, marks_mask; @@ -459,11 +478,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, inode = dir; } else if (mask & FS_EVENT_ON_CHILD) { /* - * Event on child - report on TYPE_INODE to dir if it is - * watching children and on TYPE_CHILD to child. + * Event on child - report on TYPE_PARENT to dir if it is + * watching children and on TYPE_INODE to child. 
*/ - child = inode; - inode = dir; + parent = dir; } sb = inode->i_sb; @@ -477,7 +495,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if (!sb->s_fsnotify_marks && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && - (!child || !child->i_fsnotify_marks)) + (!parent || !parent->i_fsnotify_marks)) return 0; marks_mask = sb->s_fsnotify_mask; @@ -485,8 +503,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, marks_mask |= mnt->mnt_fsnotify_mask; if (inode) marks_mask |= inode->i_fsnotify_mask; - if (child) - marks_mask |= child->i_fsnotify_mask; + if (parent) + marks_mask |= parent->i_fsnotify_mask; /* @@ -509,9 +527,9 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } - if (child) { - iter_info.marks[FSNOTIFY_OBJ_TYPE_CHILD] = - fsnotify_first_mark(&child->i_fsnotify_marks); + if (parent) { + iter_info.marks[FSNOTIFY_OBJ_TYPE_PARENT] = + fsnotify_first_mark(&parent->i_fsnotify_marks); } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 4ee3044eedd0..a2e42d3cd87c 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -278,7 +278,7 @@ static inline const struct path *fsnotify_data_path(const void *data, enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, - FSNOTIFY_OBJ_TYPE_CHILD, + FSNOTIFY_OBJ_TYPE_PARENT, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_COUNT, @@ -286,7 +286,7 @@ enum fsnotify_obj_type { }; #define FSNOTIFY_OBJ_TYPE_INODE_FL (1U << FSNOTIFY_OBJ_TYPE_INODE) -#define FSNOTIFY_OBJ_TYPE_CHILD_FL (1U << FSNOTIFY_OBJ_TYPE_CHILD) +#define FSNOTIFY_OBJ_TYPE_PARENT_FL (1U << FSNOTIFY_OBJ_TYPE_PARENT) #define FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL (1U << FSNOTIFY_OBJ_TYPE_VFSMOUNT) #define FSNOTIFY_OBJ_TYPE_SB_FL (1U << FSNOTIFY_OBJ_TYPE_SB) #define FSNOTIFY_OBJ_ALL_TYPES_MASK ((1U << FSNOTIFY_OBJ_TYPE_COUNT) - 1) @@ -331,7 +331,7 @@ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ } FSNOTIFY_ITER_FUNCS(inode, INODE) -FSNOTIFY_ITER_FUNCS(child, CHILD) +FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB) -- cgit From 14486c82612a177cb910980c70ba900827ca0894 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 4 Nov 2020 15:46:41 +0200 Subject: rfkill: add a reason to the HW rfkill state The WLAN device may exist yet not be usable. This can happen when the WLAN device is controllable by both the host and some platform internal component. We need some arbritration that is vendor specific, but when the device is not available for the host, we need to reflect this state towards the user space. Add a reason field to the rfkill object (and event) so that userspace can know why the device is in rfkill: because some other platform component currently owns the device, or because the actual hw rfkill signal is asserted. Capable userspace can now determine the reason for the rfkill and possibly do some negotiation on a side band channel using a proprietary protocol to gain ownership on the device in case the device is owned by some other component. When the host gains ownership on the device, the kernel can remove the RFKILL_HARD_BLOCK_NOT_OWNER reason and the hw rfkill state will be off. Then, the userspace can bring the device up and start normal operation. 
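The reason bookkeeping described above can be summarized with a small standalone model (a sketch, not the kernel implementation; the constants match the enum the patch below adds): the hardware block is only lifted once every recorded reason has been cleared.

#include <stdio.h>

#define RFKILL_HARD_BLOCK_SIGNAL    (1UL << 0)
#define RFKILL_HARD_BLOCK_NOT_OWNER (1UL << 1)

struct toy_rfkill {
        unsigned long state;              /* 1 == HW blocked */
        unsigned long hard_block_reasons;
};

static void set_hw_state_reason(struct toy_rfkill *rf, int blocked,
                                unsigned long reason)
{
        if (blocked) {
                rf->state = 1;
                rf->hard_block_reasons |= reason;
        } else {
                rf->hard_block_reasons &= ~reason;
                if (!rf->hard_block_reasons)
                        rf->state = 0;
        }
}

int main(void)
{
        struct toy_rfkill rf = { 0, 0 };

        set_hw_state_reason(&rf, 1, RFKILL_HARD_BLOCK_SIGNAL);
        set_hw_state_reason(&rf, 1, RFKILL_HARD_BLOCK_NOT_OWNER);
        set_hw_state_reason(&rf, 0, RFKILL_HARD_BLOCK_SIGNAL);
        printf("blocked: %lu (NOT_OWNER still pending)\n", rf.state); /* 1 */
        set_hw_state_reason(&rf, 0, RFKILL_HARD_BLOCK_NOT_OWNER);
        printf("blocked: %lu\n", rf.state);                           /* 0 */
        return 0;
}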
The rfkill_event structure is enlarged to include the additional byte; it is now 9 bytes long. Old user space will ask to read only 8 bytes, so the kernel knows not to feed it more data. When the user space writes 8 bytes, new kernels will just read what is present in the file descriptor. This new byte is read-only from the userspace standpoint anyway. If a new user space uses an old kernel, it'll ask to read 9 bytes but will get only 8, and it'll know that it didn't get the new state. When it writes 9 bytes, the kernel will again ignore this new byte, which is read-only from the userspace standpoint. Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20201104134641.28816-1-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 24 +++++++++++++++++++++++- include/uapi/linux/rfkill.h | 16 +++++++++++++++- net/rfkill/core.c | 41 ++++++++++++++++++++++++++++++++++------- 3 files changed, 72 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 8ad2487a86d5..231e06b74b50 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -137,6 +137,17 @@ void rfkill_unregister(struct rfkill *rfkill); */ void rfkill_destroy(struct rfkill *rfkill); +/** + * rfkill_set_hw_state_reason - Set the internal rfkill hardware block state + * with a reason + * @rfkill: pointer to the rfkill class to modify. + * @blocked: the current hardware block state to set + * @reason: one of &enum rfkill_hard_block_reasons + * + * Prefer to use rfkill_set_hw_state if you don't need any special reason. + */ +bool rfkill_set_hw_state_reason(struct rfkill *rfkill, + bool blocked, unsigned long reason); /** * rfkill_set_hw_state - Set the internal rfkill hardware block state * @rfkill: pointer to the rfkill class to modify. @@ -156,7 +167,11 @@ void rfkill_destroy(struct rfkill *rfkill); * should be blocked) so that drivers need not keep track of the soft * block state -- which they might not be able to. */ -bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked); +static inline bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) +{ + return rfkill_set_hw_state_reason(rfkill, blocked, + RFKILL_HARD_BLOCK_SIGNAL); +} /** * rfkill_set_sw_state - Set the internal rfkill software block state @@ -256,6 +271,13 @@ static inline void rfkill_destroy(struct rfkill *rfkill) { } +static inline bool rfkill_set_hw_state_reason(struct rfkill *rfkill, + bool blocked, + unsigned long reason) +{ + return blocked; +} + static inline bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) { return blocked; diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h index 2e00dcebebd0..03e8af87b364 100644 --- a/include/uapi/linux/rfkill.h +++ b/include/uapi/linux/rfkill.h @@ -69,6 +69,16 @@ enum rfkill_operation { RFKILL_OP_CHANGE_ALL, }; +/** + * enum rfkill_hard_block_reasons - hard block reasons + * @RFKILL_HARD_BLOCK_SIGNAL: the hardware rfkill signal is active + * @RFKILL_HARD_BLOCK_NOT_OWNER: the NIC is not owned by the host + */ +enum rfkill_hard_block_reasons { + RFKILL_HARD_BLOCK_SIGNAL = 1 << 0, + RFKILL_HARD_BLOCK_NOT_OWNER = 1 << 1, +}; + /** * struct rfkill_event - events for userspace on /dev/rfkill * @idx: index of dev rfkill * @type: type of the rfkill struct * @op: operation code * @hard: hard state (0/1) * @soft: soft state (0/1) + * @hard_block_reasons: valid if hard is set. One or several reasons from + * &enum rfkill_hard_block_reasons.
* * Structure used for userspace communication on /dev/rfkill, * used for events from the kernel and control to the kernel. @@ -84,7 +96,9 @@ struct rfkill_event { __u32 idx; __u8 type; __u8 op; - __u8 soft, hard; + __u8 soft; + __u8 hard; + __u8 hard_block_reasons; } __attribute__((packed)); /* diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 97101c55763d..68d6ef9e59fc 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -40,6 +40,7 @@ struct rfkill { enum rfkill_type type; unsigned long state; + unsigned long hard_block_reasons; u32 idx; @@ -265,6 +266,7 @@ static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill, ev->hard = !!(rfkill->state & RFKILL_BLOCK_HW); ev->soft = !!(rfkill->state & (RFKILL_BLOCK_SW | RFKILL_BLOCK_SW_PREV)); + ev->hard_block_reasons = rfkill->hard_block_reasons; spin_unlock_irqrestore(&rfkill->lock, flags); } @@ -522,19 +524,29 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type) } #endif -bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) +bool rfkill_set_hw_state_reason(struct rfkill *rfkill, + bool blocked, unsigned long reason) { unsigned long flags; bool ret, prev; BUG_ON(!rfkill); + if (WARN(reason & + ~(RFKILL_HARD_BLOCK_SIGNAL | RFKILL_HARD_BLOCK_NOT_OWNER), + "hw_state reason not supported: 0x%lx", reason)) + return blocked; + spin_lock_irqsave(&rfkill->lock, flags); - prev = !!(rfkill->state & RFKILL_BLOCK_HW); - if (blocked) + prev = !!(rfkill->hard_block_reasons & reason); + if (blocked) { rfkill->state |= RFKILL_BLOCK_HW; - else - rfkill->state &= ~RFKILL_BLOCK_HW; + rfkill->hard_block_reasons |= reason; + } else { + rfkill->hard_block_reasons &= ~reason; + if (!rfkill->hard_block_reasons) + rfkill->state &= ~RFKILL_BLOCK_HW; + } ret = !!(rfkill->state & RFKILL_BLOCK_ANY); spin_unlock_irqrestore(&rfkill->lock, flags); @@ -546,7 +558,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) return ret; } -EXPORT_SYMBOL(rfkill_set_hw_state); +EXPORT_SYMBOL(rfkill_set_hw_state_reason); static void __rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) { @@ -744,6 +756,16 @@ static ssize_t soft_store(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RW(soft); +static ssize_t hard_block_reasons_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "0x%lx\n", rfkill->hard_block_reasons); +} +static DEVICE_ATTR_RO(hard_block_reasons); + static u8 user_state_from_blocked(unsigned long state) { if (state & RFKILL_BLOCK_HW) @@ -796,6 +818,7 @@ static struct attribute *rfkill_dev_attrs[] = { &dev_attr_state.attr, &dev_attr_soft.attr, &dev_attr_hard.attr, + &dev_attr_hard_block_reasons.attr, NULL, }; ATTRIBUTE_GROUPS(rfkill_dev); @@ -811,6 +834,7 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env) { struct rfkill *rfkill = to_rfkill(dev); unsigned long flags; + unsigned long reasons; u32 state; int error; @@ -823,10 +847,13 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env) return error; spin_lock_irqsave(&rfkill->lock, flags); state = rfkill->state; + reasons = rfkill->hard_block_reasons; spin_unlock_irqrestore(&rfkill->lock, flags); error = add_uevent_var(env, "RFKILL_STATE=%d", user_state_from_blocked(state)); - return error; + if (error) + return error; + return add_uevent_var(env, "RFKILL_HW_BLOCK_REASON=0x%lx", reasons); } void rfkill_pause_polling(struct rfkill *rfkill) -- cgit From d6587602c59974a2eda35e8ed70a4f5970380be8 
Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Sun, 29 Nov 2020 17:30:46 +0200 Subject: cfg80211: Parse SAE H2E only membership selector This extends the support for drivers that rebuild IEs in the FW (same as with HT/VHT/HE). Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20201129172929.4012647275f3.I1a93ae71c57ef0b6f58f99d47fce919d19d65ff0@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 3 ++- net/wireless/nl80211.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 5e8cc9c3d45a..cbd294239430 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1261,6 +1261,7 @@ struct ieee80211_mgmt { #define BSS_MEMBERSHIP_SELECTOR_HT_PHY 127 #define BSS_MEMBERSHIP_SELECTOR_VHT_PHY 126 #define BSS_MEMBERSHIP_SELECTOR_HE_PHY 122 +#define BSS_MEMBERSHIP_SELECTOR_SAE_H2E 123 /* mgmt header + 1 byte category code */ #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 78c763dfc99a..f3dfb26b50b9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1187,6 +1187,7 @@ enum cfg80211_ap_settings_flags { * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time * @he_required: stations must support HE + * @sae_h2e_required: stations must support direct H2E technique in SAE * @flags: flags, as defined in enum cfg80211_ap_settings_flags * @he_obss_pd: OBSS Packet Detection settings * @he_bss_color: BSS Color settings @@ -1218,7 +1219,7 @@ struct cfg80211_ap_settings { const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; const struct ieee80211_he_operation *he_oper; - bool ht_required, vht_required, he_required; + bool ht_required, vht_required, he_required, sae_h2e_required; bool twt_responder; u32 flags; struct ieee80211_he_obss_pd he_obss_pd; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 57f615a6d2f3..8e7320a868d3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5017,6 +5017,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, params->vht_required = true; if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) params->he_required = true; + if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_SAE_H2E) + params->sae_h2e_required = true; } } -- cgit From 9850742470804b2cc6a6543bd8f5822eeb5fdbc0 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Sun, 29 Nov 2020 17:30:52 +0200 Subject: ieee80211: update reduced neighbor report TBTT info length A new field (20MHz PSD - 1 byte) was added to the RNR TBTT info field. Adjust the expected TBTT info length accordingly. 
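The arithmetic behind the two changed defines can be spelled out in a short sketch (illustrative only; field widths are taken from the description above, with the new 1-byte 20MHz PSD field appended to each TBTT info variant):

#include <stdio.h>

int main(void)
{
        int tbtt_offset = 1, bssid = 6, short_ssid = 4, bss_params = 1, psd_20mhz = 1;

        /* BSSID + BSS params variant: was 8, now 9 with the PSD byte */
        printf("%d\n", tbtt_offset + bssid + bss_params + psd_20mhz);
        /* BSSID + short SSID + BSS params variant: was 12, now 13 */
        printf("%d\n", tbtt_offset + bssid + short_ssid + bss_params + psd_20mhz);
        return 0;
}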
Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20201129172929.b503adccce6a.Ie684e1d3039c111bf2d521bf762aaec3f7a24d2e@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index cbd294239430..72ff75fb1971 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3836,15 +3836,15 @@ static inline bool for_each_element_completed(const struct element *element, #define WLAN_RSNX_CAPA_SAE_H2E BIT(5) /* - * reduced neighbor report, based on Draft P802.11ax_D5.0, - * section 9.4.2.170 + * reduced neighbor report, based on Draft P802.11ax_D6.1, + * section 9.4.2.170 and accepted contributions. */ #define IEEE80211_AP_INFO_TBTT_HDR_TYPE 0x03 #define IEEE80211_AP_INFO_TBTT_HDR_FILTERED 0x04 #define IEEE80211_AP_INFO_TBTT_HDR_COLOC 0x08 #define IEEE80211_AP_INFO_TBTT_HDR_COUNT 0xF0 -#define IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM 8 -#define IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM 12 +#define IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM 9 +#define IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM 13 #define IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED 0x01 #define IEEE80211_RNR_TBTT_PARAMS_SAME_SSID 0x02 -- cgit From 84e0d87c9944eb36ae6037af5cb6905f67c074c5 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 10 Dec 2020 14:30:12 +0000 Subject: thermal: devfreq_cooling: add new registration functions with Energy Model The Energy Model (EM) framework supports devices such as Devfreq. Create new registration function which automatically register EM for the thermal devfreq_cooling devices. This patch prepares the code for coming changes which are going to replace old power model with the new EM. Reviewed-by: Ionela Voinescu Signed-off-by: Lukasz Luba Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20201210143014.24685-4-lukasz.luba@arm.com --- drivers/thermal/devfreq_cooling.c | 54 ++++++++++++++++++++++++++++++++++++++- include/linux/devfreq_cooling.h | 10 ++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index afcebadbad24..6cea027d89a3 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -576,22 +577,73 @@ struct thermal_cooling_device *devfreq_cooling_register(struct devfreq *df) } EXPORT_SYMBOL_GPL(devfreq_cooling_register); +/** + * devfreq_cooling_em_register_power() - Register devfreq cooling device with + * power information and automatically register Energy Model (EM) + * @df: Pointer to devfreq device. + * @dfc_power: Pointer to devfreq_cooling_power. + * + * Register a devfreq cooling device and automatically register EM. The + * available OPPs must be registered for the device. + * + * If @dfc_power is provided, the cooling device is registered with the + * power extensions. It is using the simple Energy Model which requires + * "dynamic-power-coefficient" a devicetree property. To not break drivers + * which miss that DT property, the function won't bail out when the EM + * registration failed. The cooling device will be registered if everything + * else is OK. 
+ */ +struct thermal_cooling_device * +devfreq_cooling_em_register(struct devfreq *df, + struct devfreq_cooling_power *dfc_power) +{ + struct thermal_cooling_device *cdev; + struct device *dev; + int ret; + + if (IS_ERR_OR_NULL(df)) + return ERR_PTR(-EINVAL); + + dev = df->dev.parent; + + ret = dev_pm_opp_of_register_em(dev, NULL); + if (ret) + dev_dbg(dev, "Unable to register EM for devfreq cooling device (%d)\n", + ret); + + cdev = of_devfreq_cooling_register_power(dev->of_node, df, dfc_power); + + if (IS_ERR_OR_NULL(cdev)) + em_dev_unregister_perf_domain(dev); + + return cdev; +} +EXPORT_SYMBOL_GPL(devfreq_cooling_em_register); + /** * devfreq_cooling_unregister() - Unregister devfreq cooling device. * @cdev: Pointer to devfreq cooling device to unregister. + * + * Unregisters devfreq cooling device and related Energy Model if it was + * present. */ void devfreq_cooling_unregister(struct thermal_cooling_device *cdev) { struct devfreq_cooling_device *dfc; + struct device *dev; - if (!cdev) + if (IS_ERR_OR_NULL(cdev)) return; dfc = cdev->devdata; + dev = dfc->devfreq->dev.parent; thermal_cooling_device_unregister(dfc->cdev); ida_simple_remove(&devfreq_ida, dfc->id); dev_pm_qos_remove_request(&dfc->req_max_freq); + + em_dev_unregister_perf_domain(dev); + kfree(dfc->power_table); kfree(dfc->freq_table); diff --git a/include/linux/devfreq_cooling.h b/include/linux/devfreq_cooling.h index 9df2dfca68dd..7a9fbcc7b265 100644 --- a/include/linux/devfreq_cooling.h +++ b/include/linux/devfreq_cooling.h @@ -65,6 +65,9 @@ struct thermal_cooling_device * of_devfreq_cooling_register(struct device_node *np, struct devfreq *df); struct thermal_cooling_device *devfreq_cooling_register(struct devfreq *df); void devfreq_cooling_unregister(struct thermal_cooling_device *dfc); +struct thermal_cooling_device * +devfreq_cooling_em_register(struct devfreq *df, + struct devfreq_cooling_power *dfc_power); #else /* !CONFIG_DEVFREQ_THERMAL */ @@ -87,6 +90,13 @@ devfreq_cooling_register(struct devfreq *df) return ERR_PTR(-EINVAL); } +static inline struct thermal_cooling_device * +devfreq_cooling_em_register(struct devfreq *df, + struct devfreq_cooling_power *dfc_power) +{ + return ERR_PTR(-EINVAL); +} + static inline void devfreq_cooling_unregister(struct thermal_cooling_device *dfc) { -- cgit From 615510fe13bd2434610193f1acab53027d5146d6 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 10 Dec 2020 14:30:13 +0000 Subject: thermal: devfreq_cooling: remove old power model and use EM Remove old power model and use new Energy Model to calculate the power budget. It drops static + dynamic power calculations and power table in order to use Energy Model performance domain data. This model should be easy to use and could find more users. It is also less complicated to setup the needed structures. Reviewed-by: Ionela Voinescu Signed-off-by: Lukasz Luba Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20201210143014.24685-5-lukasz.luba@arm.com --- drivers/thermal/devfreq_cooling.c | 307 ++++++++++++-------------------------- include/linux/devfreq_cooling.h | 17 --- 2 files changed, 97 insertions(+), 227 deletions(-) (limited to 'include/linux') diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index 6cea027d89a3..df1049a34777 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -34,20 +34,17 @@ static DEFINE_IDA(devfreq_ida); * @cdev: Pointer to associated thermal cooling device. * @devfreq: Pointer to associated devfreq device. 
* @cooling_state: Current cooling state. - * @power_table: Pointer to table with maximum power draw for each - * cooling state. State is the index into the table, and - * the power is in mW. * @freq_table: Pointer to a table with the frequencies sorted in descending * order. You can index the table by cooling device state - * @freq_table_size: Size of the @freq_table and @power_table - * @power_ops: Pointer to devfreq_cooling_power, used to generate the - * @power_table. + * @max_state: It is the last index, that is, one less than the number of the + * OPPs + * @power_ops: Pointer to devfreq_cooling_power, a more precised model. * @res_util: Resource utilization scaling factor for the power. * It is multiplied by 100 to minimize the error. It is used * for estimation of the power budget instead of using - * 'utilization' (which is 'busy_time / 'total_time'). - * The 'res_util' range is from 100 to (power_table[state] * 100) - * for the corresponding 'state'. + * 'utilization' (which is 'busy_time' / 'total_time'). + * The 'res_util' range is from 100 to power * 100 for the + * corresponding 'state'. * @capped_state: index to cooling state with in dynamic power budget * @req_max_freq: PM QoS request for limiting the maximum frequency * of the devfreq device. @@ -57,9 +54,8 @@ struct devfreq_cooling_device { struct thermal_cooling_device *cdev; struct devfreq *devfreq; unsigned long cooling_state; - u32 *power_table; u32 *freq_table; - size_t freq_table_size; + size_t max_state; struct devfreq_cooling_power *power_ops; u32 res_util; int capped_state; @@ -71,7 +67,7 @@ static int devfreq_cooling_get_max_state(struct thermal_cooling_device *cdev, { struct devfreq_cooling_device *dfc = cdev->devdata; - *state = dfc->freq_table_size - 1; + *state = dfc->max_state; return 0; } @@ -93,16 +89,22 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev, struct devfreq *df = dfc->devfreq; struct device *dev = df->dev.parent; unsigned long freq; + int perf_idx; if (state == dfc->cooling_state) return 0; dev_dbg(dev, "Setting cooling state %lu\n", state); - if (state >= dfc->freq_table_size) + if (state > dfc->max_state) return -EINVAL; - freq = dfc->freq_table[state]; + if (dev->em_pd) { + perf_idx = dfc->max_state - state; + freq = dev->em_pd->table[perf_idx].frequency * 1000; + } else { + freq = dfc->freq_table[state]; + } dev_pm_qos_update_request(&dfc->req_max_freq, DIV_ROUND_UP(freq, HZ_PER_KHZ)); @@ -113,24 +115,23 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev, } /** - * freq_get_state() - get the cooling state corresponding to a frequency - * @dfc: Pointer to devfreq cooling device - * @freq: frequency in Hz + * get_perf_idx() - get the performance index corresponding to a frequency + * @em_pd: Pointer to device's Energy Model + * @freq: frequency in kHz * - * Return: the cooling state associated with the @freq, or - * THERMAL_CSTATE_INVALID if it wasn't found. + * Return: the performance index associated with the @freq, or + * -EINVAL if it wasn't found. 
*/ -static unsigned long -freq_get_state(struct devfreq_cooling_device *dfc, unsigned long freq) +static int get_perf_idx(struct em_perf_domain *em_pd, unsigned long freq) { int i; - for (i = 0; i < dfc->freq_table_size; i++) { - if (dfc->freq_table[i] == freq) + for (i = 0; i < em_pd->nr_perf_states; i++) { + if (em_pd->table[i].frequency == freq) return i; } - return THERMAL_CSTATE_INVALID; + return -EINVAL; } static unsigned long get_voltage(struct devfreq *df, unsigned long freq) @@ -161,73 +162,6 @@ static unsigned long get_voltage(struct devfreq *df, unsigned long freq) return voltage; } -/** - * get_static_power() - calculate the static power - * @dfc: Pointer to devfreq cooling device - * @freq: Frequency in Hz - * - * Calculate the static power in milliwatts using the supplied - * get_static_power(). The current voltage is calculated using the - * OPP library. If no get_static_power() was supplied, assume the - * static power is negligible. - */ -static unsigned long -get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq) -{ - struct devfreq *df = dfc->devfreq; - unsigned long voltage; - - if (!dfc->power_ops->get_static_power) - return 0; - - voltage = get_voltage(df, freq); - - if (voltage == 0) - return 0; - - return dfc->power_ops->get_static_power(df, voltage); -} - -/** - * get_dynamic_power - calculate the dynamic power - * @dfc: Pointer to devfreq cooling device - * @freq: Frequency in Hz - * @voltage: Voltage in millivolts - * - * Calculate the dynamic power in milliwatts consumed by the device at - * frequency @freq and voltage @voltage. If the get_dynamic_power() - * was supplied as part of the devfreq_cooling_power struct, then that - * function is used. Otherwise, a simple power model (Pdyn = Coeff * - * Voltage^2 * Frequency) is used. 
- */ -static unsigned long -get_dynamic_power(struct devfreq_cooling_device *dfc, unsigned long freq, - unsigned long voltage) -{ - u64 power; - u32 freq_mhz; - struct devfreq_cooling_power *dfc_power = dfc->power_ops; - - if (dfc_power->get_dynamic_power) - return dfc_power->get_dynamic_power(dfc->devfreq, freq, - voltage); - - freq_mhz = freq / 1000000; - power = (u64)dfc_power->dyn_power_coeff * freq_mhz * voltage * voltage; - do_div(power, 1000000000); - - return power; -} - - -static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc, - unsigned long freq, - unsigned long voltage) -{ - return get_static_power(dfc, freq) + get_dynamic_power(dfc, freq, - voltage); -} - static void _normalize_load(struct devfreq_dev_status *status) { if (status->total_time > 0xfffff) { @@ -247,13 +181,12 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd { struct devfreq_cooling_device *dfc = cdev->devdata; struct devfreq *df = dfc->devfreq; + struct device *dev = df->dev.parent; struct devfreq_dev_status status; unsigned long state; unsigned long freq; unsigned long voltage; - u32 dyn_power = 0; - u32 static_power = 0; - int res; + int res, perf_idx; mutex_lock(&df->lock); status = df->last_status; @@ -261,13 +194,7 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd freq = status.current_frequency; - state = freq_get_state(dfc, freq); - if (state == THERMAL_CSTATE_INVALID) { - res = -EAGAIN; - goto fail; - } - - if (dfc->power_ops->get_real_power) { + if (dfc->power_ops && dfc->power_ops->get_real_power) { voltage = get_voltage(df, freq); if (voltage == 0) { res = -EINVAL; @@ -277,7 +204,7 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd res = dfc->power_ops->get_real_power(df, power, freq, voltage); if (!res) { state = dfc->capped_state; - dfc->res_util = dfc->power_table[state]; + dfc->res_util = dev->em_pd->table[state].power; dfc->res_util *= SCALE_ERROR_MITIGATION; if (*power > 1) @@ -286,17 +213,19 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd goto fail; } } else { - dyn_power = dfc->power_table[state]; + /* Energy Model frequencies are in kHz */ + perf_idx = get_perf_idx(dev->em_pd, freq / 1000); + if (perf_idx < 0) { + res = -EAGAIN; + goto fail; + } _normalize_load(&status); - /* Scale dynamic power for utilization */ - dyn_power *= status.busy_time; - dyn_power >>= 10; - /* Get static power */ - static_power = get_static_power(dfc, freq); - - *power = dyn_power + static_power; + /* Scale power for utilization */ + *power = dev->em_pd->table[perf_idx].power; + *power *= status.busy_time; + *power >>= 10; } trace_thermal_power_devfreq_get_power(cdev, &status, freq, *power); @@ -309,20 +238,19 @@ fail: } static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev, - unsigned long state, - u32 *power) + unsigned long state, u32 *power) { struct devfreq_cooling_device *dfc = cdev->devdata; - unsigned long freq; - u32 static_power; + struct devfreq *df = dfc->devfreq; + struct device *dev = df->dev.parent; + int perf_idx; - if (state >= dfc->freq_table_size) + if (state > dfc->max_state) return -EINVAL; - freq = dfc->freq_table[state]; - static_power = get_static_power(dfc, freq); + perf_idx = dfc->max_state - state; + *power = dev->em_pd->table[perf_idx].power; - *power = dfc->power_table[state] + static_power; return 0; } @@ -331,10 +259,9 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev, { struct 
devfreq_cooling_device *dfc = cdev->devdata; struct devfreq *df = dfc->devfreq; + struct device *dev = df->dev.parent; struct devfreq_dev_status status; unsigned long freq; - s32 dyn_power; - u32 static_power; s32 est_power; int i; @@ -344,32 +271,28 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev, freq = status.current_frequency; - if (dfc->power_ops->get_real_power) { + if (dfc->power_ops && dfc->power_ops->get_real_power) { /* Scale for resource utilization */ est_power = power * dfc->res_util; est_power /= SCALE_ERROR_MITIGATION; } else { - static_power = get_static_power(dfc, freq); - - dyn_power = power - static_power; - dyn_power = dyn_power > 0 ? dyn_power : 0; - /* Scale dynamic power for utilization */ _normalize_load(&status); - dyn_power <<= 10; - est_power = dyn_power / status.busy_time; + est_power = power << 10; + est_power /= status.busy_time; } /* * Find the first cooling state that is within the power - * budget for dynamic power. + * budget. The EM power table is sorted ascending. */ - for (i = 0; i < dfc->freq_table_size - 1; i++) - if (est_power >= dfc->power_table[i]) + for (i = dfc->max_state; i > 0; i--) + if (est_power >= dev->em_pd->table[i].power) break; - *state = i; - dfc->capped_state = i; + *state = dfc->max_state - i; + dfc->capped_state = *state; + trace_thermal_power_devfreq_limit(cdev, freq, *state, power); return 0; } @@ -381,91 +304,43 @@ static struct thermal_cooling_device_ops devfreq_cooling_ops = { }; /** - * devfreq_cooling_gen_tables() - Generate power and freq tables. - * @dfc: Pointer to devfreq cooling device. - * - * Generate power and frequency tables: the power table hold the - * device's maximum power usage at each cooling state (OPP). The - * static and dynamic power using the appropriate voltage and - * frequency for the state, is acquired from the struct - * devfreq_cooling_power, and summed to make the maximum power draw. - * - * The frequency table holds the frequencies in descending order. - * That way its indexed by cooling device state. + * devfreq_cooling_gen_tables() - Generate frequency table. + * @dfc: Pointer to devfreq cooling device. + * @num_opps: Number of OPPs * - * The tables are malloced, and pointers put in dfc. They must be - * freed when unregistering the devfreq cooling device. + * Generate frequency table which holds the frequencies in descending + * order. That way its indexed by cooling device state. This is for + * compatibility with drivers which do not register Energy Model. * * Return: 0 on success, negative error code on failure. 
*/ -static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc) +static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc, + int num_opps) { struct devfreq *df = dfc->devfreq; struct device *dev = df->dev.parent; - int ret, num_opps; unsigned long freq; - u32 *power_table = NULL; - u32 *freq_table; int i; - num_opps = dev_pm_opp_get_opp_count(dev); - - if (dfc->power_ops) { - power_table = kcalloc(num_opps, sizeof(*power_table), - GFP_KERNEL); - if (!power_table) - return -ENOMEM; - } - - freq_table = kcalloc(num_opps, sizeof(*freq_table), + dfc->freq_table = kcalloc(num_opps, sizeof(*dfc->freq_table), GFP_KERNEL); - if (!freq_table) { - ret = -ENOMEM; - goto free_power_table; - } + if (!dfc->freq_table) + return -ENOMEM; for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) { - unsigned long power, voltage; struct dev_pm_opp *opp; opp = dev_pm_opp_find_freq_floor(dev, &freq); if (IS_ERR(opp)) { - ret = PTR_ERR(opp); - goto free_tables; + kfree(dfc->freq_table); + return PTR_ERR(opp); } - voltage = dev_pm_opp_get_voltage(opp) / 1000; /* mV */ dev_pm_opp_put(opp); - - if (dfc->power_ops) { - if (dfc->power_ops->get_real_power) - power = get_total_power(dfc, freq, voltage); - else - power = get_dynamic_power(dfc, freq, voltage); - - dev_dbg(dev, "Power table: %lu MHz @ %lu mV: %lu = %lu mW\n", - freq / 1000000, voltage, power, power); - - power_table[i] = power; - } - - freq_table[i] = freq; + dfc->freq_table[i] = freq; } - if (dfc->power_ops) - dfc->power_table = power_table; - - dfc->freq_table = freq_table; - dfc->freq_table_size = num_opps; - return 0; - -free_tables: - kfree(freq_table); -free_power_table: - kfree(power_table); - - return ret; } /** @@ -488,9 +363,10 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, struct devfreq_cooling_power *dfc_power) { struct thermal_cooling_device *cdev; + struct device *dev = df->dev.parent; struct devfreq_cooling_device *dfc; char dev_name[THERMAL_NAME_LENGTH]; - int err; + int err, num_opps; dfc = kzalloc(sizeof(*dfc), GFP_KERNEL); if (!dfc) @@ -498,28 +374,44 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, dfc->devfreq = df; - if (dfc_power) { - dfc->power_ops = dfc_power; - + if (dev->em_pd) { devfreq_cooling_ops.get_requested_power = devfreq_cooling_get_requested_power; devfreq_cooling_ops.state2power = devfreq_cooling_state2power; devfreq_cooling_ops.power2state = devfreq_cooling_power2state; + + dfc->power_ops = dfc_power; + + num_opps = em_pd_nr_perf_states(dev->em_pd); + } else { + /* Backward compatibility for drivers which do not use IPA */ + dev_dbg(dev, "missing EM for cooling device\n"); + + num_opps = dev_pm_opp_get_opp_count(dev); + + err = devfreq_cooling_gen_tables(dfc, num_opps); + if (err) + goto free_dfc; } - err = devfreq_cooling_gen_tables(dfc); - if (err) + if (num_opps <= 0) { + err = -EINVAL; goto free_dfc; + } + + /* max_state is an index, not a counter */ + dfc->max_state = num_opps - 1; - err = dev_pm_qos_add_request(df->dev.parent, &dfc->req_max_freq, + err = dev_pm_qos_add_request(dev, &dfc->req_max_freq, DEV_PM_QOS_MAX_FREQUENCY, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); if (err < 0) - goto free_tables; + goto free_table; err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL); if (err < 0) goto remove_qos_req; + dfc->id = err; snprintf(dev_name, sizeof(dev_name), "thermal-devfreq-%d", dfc->id); @@ -528,7 +420,7 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, &devfreq_cooling_ops); if 
(IS_ERR(cdev)) { err = PTR_ERR(cdev); - dev_err(df->dev.parent, + dev_err(dev, "Failed to register devfreq cooling device (%d)\n", err); goto release_ida; @@ -540,12 +432,9 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, release_ida: ida_simple_remove(&devfreq_ida, dfc->id); - remove_qos_req: dev_pm_qos_remove_request(&dfc->req_max_freq); - -free_tables: - kfree(dfc->power_table); +free_table: kfree(dfc->freq_table); free_dfc: kfree(dfc); @@ -644,9 +533,7 @@ void devfreq_cooling_unregister(struct thermal_cooling_device *cdev) em_dev_unregister_perf_domain(dev); - kfree(dfc->power_table); kfree(dfc->freq_table); - kfree(dfc); } EXPORT_SYMBOL_GPL(devfreq_cooling_unregister); diff --git a/include/linux/devfreq_cooling.h b/include/linux/devfreq_cooling.h index 7a9fbcc7b265..14baa73fc2de 100644 --- a/include/linux/devfreq_cooling.h +++ b/include/linux/devfreq_cooling.h @@ -16,17 +16,6 @@ /** * struct devfreq_cooling_power - Devfreq cooling power ops - * @get_static_power: Take voltage, in mV, and return the static power - * in mW. If NULL, the static power is assumed - * to be 0. - * @get_dynamic_power: Take voltage, in mV, and frequency, in HZ, and - * return the dynamic power draw in mW. If NULL, - * a simple power model is used. - * @dyn_power_coeff: Coefficient for the simple dynamic power model in - * mW/(MHz mV mV). - * If get_dynamic_power() is NULL, then the - * dynamic power is calculated as - * @dyn_power_coeff * frequency * voltage^2 * @get_real_power: When this is set, the framework uses it to ask the * device driver for the actual power. * Some devices have more sophisticated methods @@ -46,14 +35,8 @@ * max total (static + dynamic) power value for each OPP. */ struct devfreq_cooling_power { - unsigned long (*get_static_power)(struct devfreq *devfreq, - unsigned long voltage); - unsigned long (*get_dynamic_power)(struct devfreq *devfreq, - unsigned long freq, - unsigned long voltage); int (*get_real_power)(struct devfreq *df, u32 *power, unsigned long freq, unsigned long voltage); - unsigned long dyn_power_coeff; }; #ifdef CONFIG_DEVFREQ_THERMAL -- cgit From d7203eedf4f68e9909fd489453168a9d26bf0c3d Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 10 Dec 2020 13:15:11 +0100 Subject: thermal/core: Add critical and hot ops Currently there is no way for a sensor to directly call an op in interrupt mode without calling thermal_zone_device_update, which assumes all the trip points are defined. A sensor may want to do something special if a trip point is hot or critical. This patch adds the critical and hot ops to the thermal zone device, so a sensor can directly invoke them or let the thermal framework call the sensor-specific ones.
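A hedged sketch of how a sensor driver might plug into the new ops (the driver and function names here are invented, and this is a fragment rather than a complete driver); note that the patch below falls back to thermal_zone_device_critical() when .critical is left NULL:

#include <linux/thermal.h>

static void my_sensor_hot(struct thermal_zone_device *tz)
{
        /* e.g. clamp performance before the critical trip is reached */
        dev_warn(&tz->device, "hot trip crossed, throttling\n");
}

static void my_sensor_critical(struct thermal_zone_device *tz)
{
        /* vendor-specific action, then the default orderly poweroff path */
        dev_emerg(&tz->device, "critical trip crossed\n");
        thermal_zone_device_critical(tz);
}

static struct thermal_zone_device_ops my_sensor_ops = {
        /* .get_temp and friends omitted for brevity */
        .hot      = my_sensor_hot,
        .critical = my_sensor_critical,
};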
Tested-by: Kai-Heng Feng Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20201210121514.25760-2-daniel.lezcano@linaro.org --- drivers/thermal/thermal_core.c | 43 ++++++++++++++++++++++++++---------------- include/linux/thermal.h | 3 +++ 2 files changed, 30 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 64de09861c05..4a291d205d5c 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -380,6 +380,25 @@ static void thermal_emergency_poweroff(void) msecs_to_jiffies(poweroff_delay_ms)); } +void thermal_zone_device_critical(struct thermal_zone_device *tz) +{ + dev_emerg(&tz->device, "%s: critical temperature reached, " + "shutting down\n", tz->type); + + mutex_lock(&poweroff_lock); + if (!power_off_triggered) { + /* + * Queue a backup emergency shutdown in the event of + * orderly_poweroff failure + */ + thermal_emergency_poweroff(); + orderly_poweroff(true); + power_off_triggered = true; + } + mutex_unlock(&poweroff_lock); +} +EXPORT_SYMBOL(thermal_zone_device_critical); + static void handle_critical_trips(struct thermal_zone_device *tz, int trip, enum thermal_trip_type trip_type) { @@ -396,22 +415,10 @@ static void handle_critical_trips(struct thermal_zone_device *tz, if (tz->ops->notify) tz->ops->notify(tz, trip, trip_type); - if (trip_type == THERMAL_TRIP_CRITICAL) { - dev_emerg(&tz->device, - "critical temperature reached (%d C), shutting down\n", - tz->temperature / 1000); - mutex_lock(&poweroff_lock); - if (!power_off_triggered) { - /* - * Queue a backup emergency shutdown in the event of - * orderly_poweroff failure - */ - thermal_emergency_poweroff(); - orderly_poweroff(true); - power_off_triggered = true; - } - mutex_unlock(&poweroff_lock); - } + if (trip_type == THERMAL_TRIP_HOT && tz->ops->hot) + tz->ops->hot(tz); + else if (trip_type == THERMAL_TRIP_CRITICAL) + tz->ops->critical(tz); } static void handle_thermal_trip(struct thermal_zone_device *tz, int trip) @@ -1336,6 +1343,10 @@ thermal_zone_device_register(const char *type, int trips, int mask, tz->id = id; strlcpy(tz->type, type, sizeof(tz->type)); + + if (!ops->critical) + ops->critical = thermal_zone_device_critical; + tz->ops = ops; tz->tzp = tzp; tz->device.class = &thermal_class; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index d07ea27e72a9..31b84404f047 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -79,6 +79,8 @@ struct thermal_zone_device_ops { enum thermal_trend *); int (*notify) (struct thermal_zone_device *, int, enum thermal_trip_type); + void (*hot)(struct thermal_zone_device *); + void (*critical)(struct thermal_zone_device *); }; struct thermal_cooling_device_ops { @@ -399,6 +401,7 @@ void thermal_cdev_update(struct thermal_cooling_device *); void thermal_notify_framework(struct thermal_zone_device *, int); int thermal_zone_device_enable(struct thermal_zone_device *tz); int thermal_zone_device_disable(struct thermal_zone_device *tz); +void thermal_zone_device_critical(struct thermal_zone_device *tz); #else static inline struct thermal_zone_device *thermal_zone_device_register( const char *type, int trips, int mask, void *devdata, -- cgit From b388fa50142510fb6477f130bb1b3f05a0a263a1 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 9 Nov 2020 09:41:21 +0000 Subject: Revert "genirq: Add fasteoi IPI flow" handle_percpu_devid_fasteoi_ipi() has no more users, and handle_percpu_devid_irq() can do all that 
it was supposed to do. Get rid of it. This reverts commit c5e5ec033c4ab25c53f1fd217849e75deb0bf7bf. Signed-off-by: Valentin Schneider Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201109094121.29975-6-valentin.schneider@arm.com --- include/linux/irq.h | 1 - kernel/irq/chip.c | 27 --------------------------- 2 files changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c54365309e97..ca26bec51cec 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -647,7 +647,6 @@ static inline int irq_set_parent(int irq, int parent_irq) */ extern void handle_level_irq(struct irq_desc *desc); extern void handle_fasteoi_irq(struct irq_desc *desc); -extern void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc); extern void handle_edge_irq(struct irq_desc *desc); extern void handle_edge_eoi_irq(struct irq_desc *desc); extern void handle_simple_irq(struct irq_desc *desc); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b9b9618e1aca..0ae308efa604 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -944,33 +944,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } -/** - * handle_percpu_devid_fasteoi_ipi - Per CPU local IPI handler with per cpu - * dev ids - * @desc: the interrupt description structure for this irq - * - * The biggest difference with the IRQ version is that the interrupt is - * EOIed early, as the IPI could result in a context switch, and we need to - * make sure the IPI can fire again. We also assume that the arch code has - * registered an action. If not, we are positively doomed. - */ -void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; - unsigned int irq = irq_desc_get_irq(desc); - irqreturn_t res; - - __kstat_incr_irqs_this_cpu(desc); - - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); - - trace_irq_handler_entry(irq, action); - res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); - trace_irq_handler_exit(irq, action, res); -} - /** * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu * dev ids -- cgit From 1d3aec89286254487df7641c30f1b14ad1d127a5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 2 Dec 2020 18:36:53 +0800 Subject: genirq/affinity: Add irq_update_affinity_desc() Add a function to allow the affinity of an interrupt to be switched to managed, such that interrupts allocated for platform devices may be managed. This new interface has certain limitations, and attempts to use it in the following circumstances will fail: - When the kernel is configured for generic IRQ reservation mode (config GENERIC_IRQ_RESERVATION_MODE), as it could conflict with managed vs. non-managed interrupt accounting.
- The interrupt is already started, which should not be the case during init - The interrupt is already configured as managed, which means double init Suggested-by: Thomas Gleixner Signed-off-by: John Garry Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1606905417-183214-2-git-send-email-john.garry@huawei.com --- include/linux/interrupt.h | 8 ++++++ kernel/irq/manage.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index ee8299eb1f52..870b3251e174 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -352,6 +352,8 @@ extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); +extern int irq_update_affinity_desc(unsigned int irq, + struct irq_affinity_desc *affinity); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); @@ -387,6 +389,12 @@ static inline int irq_set_affinity_hint(unsigned int irq, return -EINVAL; } +static inline int irq_update_affinity_desc(unsigned int irq, + struct irq_affinity_desc *affinity) +{ + return -EINVAL; +} + static inline int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c460e0496006..c826ba4141fe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -371,6 +371,76 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, return ret; } +/** + * irq_update_affinity_desc - Update affinity management for an interrupt + * @irq: The interrupt number to update + * @affinity: Pointer to the affinity descriptor + * + * This interface can be used to configure the affinity management of + * interrupts which have been allocated already. + * + * There are certain limitations on when it may be used - attempts to use it + * for when the kernel is configured for generic IRQ reservation mode (in + * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with + * managed/non-managed interrupt accounting. In addition, attempts to use it on + * an interrupt which is already started or which has already been configured + * as managed will also fail, as these mean invalid init state or double init. + */ +int irq_update_affinity_desc(unsigned int irq, + struct irq_affinity_desc *affinity) +{ + struct irq_desc *desc; + unsigned long flags; + bool activated; + int ret = 0; + + /* + * Supporting this with the reservation scheme used by x86 needs + * some more thought. Fail it for now. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) + return -EOPNOTSUPP; + + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return -EINVAL; + + /* Requires the interrupt to be shut down */ + if (irqd_is_started(&desc->irq_data)) { + ret = -EBUSY; + goto out_unlock; + } + + /* Interrupts which are already managed cannot be modified */ + if (irqd_affinity_is_managed(&desc->irq_data)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Deactivate the interrupt. That's required to undo + * anything an earlier activation has established. 
+ */ + activated = irqd_is_activated(&desc->irq_data); + if (activated) + irq_domain_deactivate_irq(&desc->irq_data); + + if (affinity->is_managed) { + irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); + irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); + } + + cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); + + /* Restore the activation state */ + if (activated) + irq_domain_activate_irq(&desc->irq_data, false); + +out_unlock: + irq_put_desc_busunlock(desc, flags); + return ret; +} + int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); -- cgit From 9806731db684a475ade1e95d166089b9edbd9da3 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 2 Dec 2020 18:36:54 +0800 Subject: resource: Add irqresource_disabled() Add a common function to set the fields of an irq resource to disabled, which mimics what is done in acpi_dev_irqresource_disabled(), with a view to replacing that function. Signed-off-by: John Garry Signed-off-by: Marc Zyngier Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/1606905417-183214-3-git-send-email-john.garry@huawei.com --- include/linux/ioport.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5135d4b86cd6..f9bf374f9633 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -307,6 +307,13 @@ struct resource *devm_request_free_mem_region(struct device *dev, struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name); +static inline void irqresource_disabled(struct resource *res, u32 irq) +{ + res->start = irq; + res->end = irq; + res->flags = IORESOURCE_IRQ | IORESOURCE_DISABLED | IORESOURCE_UNSET; +} + #ifdef CONFIG_IO_STRICT_DEVMEM void revoke_devmem(struct resource *res); #else -- cgit From e15f2fa959f2cce8a05e8e3a596e75d068cd42c5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 2 Dec 2020 18:36:56 +0800 Subject: driver core: platform: Add devm_platform_get_irqs_affinity() Drivers for multi-queue platform devices may also want managed interrupts for handling HW queue completion interrupts, so add support. The function accepts an affinity descriptor pointer, which covers all IRQs expected for the device. The function is devm class because the only current in-tree user also uses the devm method for requesting the interrupts; in addition, making it devm ensures the correct ordering of freeing the irq and disposing of the mapping.
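A hedged usage sketch for the new helper (the driver name and vector counts are invented; the call signature is the one added below). A multi-queue platform driver can spread its completion interrupts much like a PCI driver does with pci_alloc_irq_vectors_affinity():

#include <linux/interrupt.h>
#include <linux/platform_device.h>

static int my_mq_probe(struct platform_device *pdev)
{
        struct irq_affinity affd = {
                .pre_vectors = 1,       /* e.g. one non-managed misc interrupt */
        };
        int *irqs, nvec, i;

        nvec = devm_platform_get_irqs_affinity(pdev, &affd, 2, 16, &irqs);
        if (nvec < 0)
                return nvec;

        dev_info(&pdev->dev, "got %d vectors, first hw queue irq %d\n",
                 nvec, irqs[affd.pre_vectors]);

        for (i = affd.pre_vectors; i < nvec; i++) {
                /* irqs[i] now carries a managed affinity descriptor; */
                /* request it, e.g. with devm_request_irq() */
        }
        return 0;
}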
Signed-off-by: John Garry Signed-off-by: Marc Zyngier Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/1606905417-183214-5-git-send-email-john.garry@huawei.com --- drivers/base/platform.c | 121 ++++++++++++++++++++++++++++++++++++++++ include/linux/platform_device.h | 6 ++ 2 files changed, 127 insertions(+) (limited to 'include/linux')
diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 88aef93eb4dd..ea8add164b89 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -289,6 +291,125 @@ int platform_irq_count(struct platform_device *dev) } EXPORT_SYMBOL_GPL(platform_irq_count); +struct irq_affinity_devres { + unsigned int count; + unsigned int irq[]; +}; + +static void platform_disable_acpi_irq(struct platform_device *pdev, int index) +{ + struct resource *r; + + r = platform_get_resource(pdev, IORESOURCE_IRQ, index); + if (r) + irqresource_disabled(r, 0); +} + +static void devm_platform_get_irqs_affinity_release(struct device *dev, + void *res) +{ + struct irq_affinity_devres *ptr = res; + int i; + + for (i = 0; i < ptr->count; i++) { + irq_dispose_mapping(ptr->irq[i]); + + if (has_acpi_companion(dev)) + platform_disable_acpi_irq(to_platform_device(dev), i); + } +} + +/** + * devm_platform_get_irqs_affinity - devm method to get a set of IRQs for a + * device using an interrupt affinity descriptor + * @dev: platform device pointer + * @affd: affinity descriptor + * @minvec: minimum count of interrupt vectors + * @maxvec: maximum count of interrupt vectors + * @irqs: pointer holder for IRQ numbers + * + * Gets a set of IRQs for a platform device, and updates IRQ affinity according + * to the passed affinity descriptor. + * + * Return: Number of vectors on success, negative error number on failure.
+ */ +int devm_platform_get_irqs_affinity(struct platform_device *dev, + struct irq_affinity *affd, + unsigned int minvec, + unsigned int maxvec, + int **irqs) +{ + struct irq_affinity_devres *ptr; + struct irq_affinity_desc *desc; + size_t size; + int i, ret, nvec; + + if (!affd) + return -EPERM; + + if (maxvec < minvec) + return -ERANGE; + + nvec = platform_irq_count(dev); + + if (nvec < minvec) + return -ENOSPC; + + nvec = irq_calc_affinity_vectors(minvec, nvec, affd); + if (nvec < minvec) + return -ENOSPC; + + if (nvec > maxvec) + nvec = maxvec; + + size = sizeof(*ptr) + sizeof(unsigned int) * nvec; + ptr = devres_alloc(devm_platform_get_irqs_affinity_release, size, + GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + ptr->count = nvec; + + for (i = 0; i < nvec; i++) { + int irq = platform_get_irq(dev, i); + if (irq < 0) { + ret = irq; + goto err_free_devres; + } + ptr->irq[i] = irq; + } + + desc = irq_create_affinity_masks(nvec, affd); + if (!desc) { + ret = -ENOMEM; + goto err_free_devres; + } + + for (i = 0; i < nvec; i++) { + ret = irq_update_affinity_desc(ptr->irq[i], &desc[i]); + if (ret) { + dev_err(&dev->dev, "failed to update irq%d affinity descriptor (%d)\n", + ptr->irq[i], ret); + goto err_free_desc; + } + } + + devres_add(&dev->dev, ptr); + + kfree(desc); + + *irqs = ptr->irq; + + return nvec; + +err_free_desc: + kfree(desc); +err_free_devres: + devres_free(ptr); + return ret; +} +EXPORT_SYMBOL_GPL(devm_platform_get_irqs_affinity); + /** * platform_get_resource_byname - get a resource for a device by name * @dev: platform device
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 77a2aada106d..4d75633e6735 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -15,6 +15,7 @@ #define PLATFORM_DEVID_NONE (-1) #define PLATFORM_DEVID_AUTO (-2) +struct irq_affinity; struct mfd_cell; struct property_entry; struct platform_device_id; @@ -70,6 +71,11 @@ devm_platform_ioremap_resource_byname(struct platform_device *pdev, extern int platform_get_irq(struct platform_device *, unsigned int); extern int platform_get_irq_optional(struct platform_device *, unsigned int); extern int platform_irq_count(struct platform_device *); +extern int devm_platform_get_irqs_affinity(struct platform_device *dev, + struct irq_affinity *affd, + unsigned int minvec, + unsigned int maxvec, + int **irqs); extern struct resource *platform_get_resource_byname(struct platform_device *, unsigned int, const char *);
-- cgit From 426506a7e0f1902268c3edbdc7e5475624a9d18b Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 8 Dec 2020 11:04:24 +0200 Subject: dmaengine: ti: k3-udma-glue: Add function to get device pointer for DMA API Glue layer users should use the device of the DMA for DMA mapping and allocations, as it is the DMA which accesses the descriptors and buffers, not the clients. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Link: https://lore.kernel.org/r/20201208090440.31792-5-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma-glue.c | 14 ++++++++++++++ drivers/dma/ti/k3-udma-private.c | 6 ++++++ drivers/dma/ti/k3-udma.h | 1 + include/linux/dma/k3-udma-glue.h | 4 ++++ 4 files changed, 25 insertions(+) (limited to 'include/linux')
diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c index dfb65e382ab9..29d1524d1916 100644 --- a/drivers/dma/ti/k3-udma-glue.c +++ b/drivers/dma/ti/k3-udma-glue.c @@ -493,6 +493,13 @@ int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn) }
EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_irq); +struct device * + k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn) +{ + return xudma_get_device(tx_chn->common.udmax); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_dma_device); + static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) { const struct udma_tisci_rm *tisci_rm = rx_chn->common.tisci_rm; @@ -1201,3 +1208,10 @@ int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn, return flow->virq; } EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_irq); + +struct device * + k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn) +{ + return xudma_get_device(rx_chn->common.udmax); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_dma_device);
diff --git a/drivers/dma/ti/k3-udma-private.c b/drivers/dma/ti/k3-udma-private.c index aa24e554f7b4..8ff7a264be03 100644 --- a/drivers/dma/ti/k3-udma-private.c +++ b/drivers/dma/ti/k3-udma-private.c @@ -50,6 +50,12 @@ struct udma_dev *of_xudma_dev_get(struct device_node *np, const char *property) } EXPORT_SYMBOL(of_xudma_dev_get); +struct device *xudma_get_device(struct udma_dev *ud) +{ + return ud->dev; +} +EXPORT_SYMBOL(xudma_get_device); + u32 xudma_dev_get_psil_base(struct udma_dev *ud) { return ud->psil_base;
diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h index 09c4529e013d..d1cace0cb43b 100644 --- a/drivers/dma/ti/k3-udma.h +++ b/drivers/dma/ti/k3-udma.h @@ -112,6 +112,7 @@ int xudma_navss_psil_unpair(struct udma_dev *ud, u32 src_thread, u32 dst_thread); struct udma_dev *of_xudma_dev_get(struct device_node *np, const char *property); +struct device *xudma_get_device(struct udma_dev *ud); void xudma_dev_put(struct udma_dev *ud); u32 xudma_dev_get_psil_base(struct udma_dev *ud); struct udma_tisci_rm *xudma_dev_get_tisci_rm(struct udma_dev *ud);
diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index 5eb34ad973a7..d7c12f31377c 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -41,6 +41,8 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn, u32 k3_udma_glue_tx_get_hdesc_size(struct k3_udma_glue_tx_channel *tx_chn); u32 k3_udma_glue_tx_get_txcq_id(struct k3_udma_glue_tx_channel *tx_chn); int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn); +struct device * + k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn); enum { K3_UDMA_GLUE_SRC_TAG_LO_KEEP = 0, @@ -130,5 +132,7 @@ int k3_udma_glue_rx_flow_enable(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_idx); int k3_udma_glue_rx_flow_disable(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_idx); +struct device * + k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn); #endif /* K3_UDMA_GLUE_H_ */
-- cgit From 4f910c035f38053ac8eb63a672c78862c535cd0f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 8 Dec 2020 11:04:27 +0200 Subject: dmaengine: of-dma: Add support for optional router configuration callback Additional configuration of the DMA event router might be needed for a channel, and this cannot be done in the device_alloc_chan_resources callback since the router information is not yet available to the drivers at that point. If a channel needs additional configuration when a DMA router is in use, the driver can implement the new device_router_config callback.
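For illustration, a DMA driver could wire the callback up along these lines (a hedged sketch; foo_chan, to_foo_chan() and foo_chan_set_trigger() are made-up names, only device_router_config and the dma_chan fields come from this patch):

	static int foo_router_config(struct dma_chan *chan)
	{
		struct foo_chan *fchan = to_foo_chan(chan);

		/* unlike in device_alloc_chan_resources, chan->router and
		 * chan->route_data are valid at this point */
		return foo_chan_set_trigger(fchan, chan->route_data);
	}

	...
	dd->device_router_config = foo_router_config;

of_dma_router_xlate() invokes the callback right after route_allocate has run, and releases the channel again if the configuration fails.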
Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201208090440.31792-8-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- drivers/dma/of-dma.c | 10 ++++++++++ include/linux/dmaengine.h | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include/linux')
diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c index 8a4f608904b9..ec00b20ae8e4 100644 --- a/drivers/dma/of-dma.c +++ b/drivers/dma/of-dma.c @@ -75,8 +75,18 @@ static struct dma_chan *of_dma_router_xlate(struct of_phandle_args *dma_spec, ofdma->dma_router->route_free(ofdma->dma_router->dev, route_data); } else { + int ret = 0; + chan->router = ofdma->dma_router; chan->route_data = route_data; + + if (chan->device->device_router_config) + ret = chan->device->device_router_config(chan); + + if (ret) { + dma_release_channel(chan); + chan = ERR_PTR(ret); + } } /*
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 493a047ed0a2..aed44888cad3 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -805,6 +805,7 @@ struct dma_filter { * by tx_status * @device_alloc_chan_resources: allocate resources and return the * number of allocated descriptors + * @device_router_config: optional callback for DMA router configuration * @device_free_chan_resources: release DMA channel's resources * @device_prep_dma_memcpy: prepares a memcpy operation * @device_prep_dma_xor: prepares a xor operation @@ -879,6 +880,7 @@ struct dma_device { enum dma_residue_granularity residue_granularity; int (*device_alloc_chan_resources)(struct dma_chan *chan); + int (*device_router_config)(struct dma_chan *chan); void (*device_free_chan_resources)(struct dma_chan *chan); struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)(
-- cgit From ab650ef6d548153862119e1bf3bf267510707f48 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 8 Dec 2020 11:04:28 +0200 Subject: dmaengine: Add support for per channel coherency handling If the DMA device supports per channel coherency configuration (a channel can be configured to have a coherent or a non-coherent view), then a single device (the DMA controller's device) cannot be used for the DMA API for all channels, as channels can have different coherency. Introduce a chan_dma_dev flag in struct dma_chan_dev and a new helper to get the device pointer to be used for the DMA API for the given channel.
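In practice a client then maps buffers against the device returned by the helper, e.g. (illustrative only; buf and len stand in for the client's own data):

	struct device *dma_dev = dmaengine_get_dma_device(chan);
	dma_addr_t dma;

	dma = dma_map_single(dma_dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dma_dev, dma))
		return -ENOMEM;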
Client drivers should be updated to be able to support per channel coherency by: - dma_map_single(chan->device->dev, ptr, size, DMA_TO_DEVICE); + struct device *dma_dev = dmaengine_get_dma_device(chan); + + dma_map_single(dma_dev, ptr, size, DMA_TO_DEVICE); Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201208090440.31792-9-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux')
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index aed44888cad3..68130f5f599e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -357,11 +357,14 @@ struct dma_chan { * @chan: driver channel device * @device: sysfs device * @dev_id: parent dma_device dev_id + * @chan_dma_dev: The channel is using custom/different dma-mapping + * compared to the parent dma_device */ struct dma_chan_dev { struct dma_chan *chan; struct device device; int dev_id; + bool chan_dma_dev; }; /** @@ -1618,4 +1621,13 @@ dmaengine_get_direction_text(enum dma_transfer_direction dir) return "invalid"; } } + +static inline struct device *dmaengine_get_dma_device(struct dma_chan *chan) +{ + if (chan->dev->chan_dma_dev) + return &chan->dev->device; + + return chan->device->dev; +} + #endif /* DMAENGINE_H */
-- cgit From b9366e2577a38ca5322f326cff9752c2008597c6 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 8 Dec 2020 11:04:33 +0200 Subject: dmaengine: ti: k3-psil: Extend psil_endpoint_config for K3 PKTDMA Additional fields are needed for K3 PKTDMA to be able to handle the mapped channels (channels are locked to handle specific threads) and the flow ranges for these mapped threads. PKTDMA also introduces tflows for tx channels, which do not exist in the K3 UDMA architecture. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Link: https://lore.kernel.org/r/20201208090440.31792-14-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- include/linux/dma/k3-psil.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux')
diff --git a/include/linux/dma/k3-psil.h b/include/linux/dma/k3-psil.h index 1962f75fa2d3..36e22c5a0f29 100644 --- a/include/linux/dma/k3-psil.h +++ b/include/linux/dma/k3-psil.h @@ -50,6 +50,15 @@ enum psil_endpoint_type { * @channel_tpl: Desired throughput level for the channel * @pdma_acc32: ACC32 must be enabled on the PDMA side * @pdma_burst: BURST must be enabled on the PDMA side + * @mapped_channel_id: PKTDMA thread to channel mapping for mapped channels. + * The thread must be serviced by the specified channel if + * mapped_channel_id is >= 0 in case of PKTDMA + * @flow_start: PKTDMA flow range start of mapped channel. Unmapped + * channels use flow_id == chan_id + * @flow_num: PKTDMA flow count of mapped channel. Unmapped channels + * use flow_id == chan_id + * @default_flow_id: PKTDMA default (r)flow index of mapped channel. + * Must be within the flow range of the mapped channel.
*/ struct psil_endpoint_config { enum psil_endpoint_type ep_type; @@ -63,6 +72,13 @@ struct psil_endpoint_config { /* PDMA properties, valid for PSIL_EP_PDMA_* */ unsigned pdma_acc32:1; unsigned pdma_burst:1; + + /* PKTDMA mapped channel */ + int mapped_channel_id; + /* PKTDMA tflow and rflow ranges for mapped channel */ + u16 flow_start; + u16 flow_num; + u16 default_flow_id; }; int psil_set_new_ep_config(struct device *dev, const char *name,
-- cgit From fc373e47d72605cc3f5012ddda49d2dca430d51f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 8 Dec 2020 11:04:35 +0200 Subject: dmaengine: ti: Add support for k3 event routers In the k3 architecture a DMA channel (in TR mode) can be triggered by global events originating from different modules. The events used for triggering can be sent from any module which is connected to the PSI-L fabric, but the event number to be sent is DMA channel specific and is only known after the channel itself has been requested. The router operation needs to be split up: - route_allocate: configure the dma_spec for the DMA and store the configuration which is needed for the router's input - set_event: callback used by the DMA driver to set the event number for the channel and enable the routing Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201208090440.31792-16-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- include/linux/dma/k3-event-router.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/linux/dma/k3-event-router.h (limited to 'include/linux')
diff --git a/include/linux/dma/k3-event-router.h b/include/linux/dma/k3-event-router.h new file mode 100644 index 000000000000..e3f88b2f87be --- /dev/null +++ b/include/linux/dma/k3-event-router.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Texas Instruments Incorporated - https://www.ti.com + */ + +#ifndef K3_EVENT_ROUTER_ +#define K3_EVENT_ROUTER_ + +#include + +struct k3_event_route_data { + void *priv; + int (*set_event)(void *priv, u32 event); +}; + +#endif /* K3_EVENT_ROUTER_ */
-- cgit From d782298c6f6b854452965b56d91616dfb60490c5 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 8 Dec 2020 11:04:36 +0200 Subject: soc: ti: k3-ringacc: add AM64 DMA rings support. The DMAs in AM64 have built-in rings, compared to AM654/J721e/J7200 where a separate and generic ringacc is used. The ring SW interface is similar to ringacc, with some major architectural differences: the rings are part of the DMA (BCDMA or PKTDMA), and they are dual mode rings modeled as a pair of Ring objects which share a common configuration and memory buffer, but have separate real-time control register sets for each direction, mem2dev (forward) and dev2mem (reverse). The ringacc driver must be initialized for DMA rings use with k3_ringacc_dmarings_init() as the DMA rings are not an independent device the way ringacc is. AM64 rings must be requested only using k3_ringacc_request_rings_pair(), and the forward ring must always be initialized/configured. After this, any other ringacc API can be used without any caller changes.
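A BCDMA/PKTDMA driver would then set things up roughly as follows at probe time (a hedged sketch; tisci_handle, dma_dev_id, num_dma_rings and chan_id are placeholders for data the DMA driver already owns):

	struct k3_ringacc_init_data init_data = {
		.tisci = tisci_handle,
		.tisci_dev_id = dma_dev_id,
		.num_rings = num_dma_rings,
	};
	struct k3_ring *fwd_ring, *compl_ring;
	struct k3_ringacc *ringacc;
	int ret;

	ringacc = k3_ringacc_dmarings_init(pdev, &init_data);
	if (IS_ERR(ringacc))
		return PTR_ERR(ringacc);

	/* DMA rings can only be requested as a pair, by forward ring id;
	 * the compl_id argument is not used for DMA rings */
	ret = k3_ringacc_request_rings_pair(ringacc, chan_id, -1,
					    &fwd_ring, &compl_ring);

Only fwd_ring is subsequently configured with k3_ringacc_ring_cfg(); the reverse ring shares its memory and configuration.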
Signed-off-by: Grygorii Strashko Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201208090440.31792-17-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- drivers/soc/ti/k3-ringacc.c | 325 +++++++++++++++++++++++++++++++++++++- include/linux/soc/ti/k3-ringacc.h | 17 ++ 2 files changed, 335 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/ti/k3-ringacc.c b/drivers/soc/ti/k3-ringacc.c index 119164abcb41..c88c305ba367 100644 --- a/drivers/soc/ti/k3-ringacc.c +++ b/drivers/soc/ti/k3-ringacc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ static LIST_HEAD(k3_ringacc_list); static DEFINE_MUTEX(k3_ringacc_list_lock); #define K3_RINGACC_CFG_RING_SIZE_ELCNT_MASK GENMASK(19, 0) +#define K3_DMARING_CFG_RING_SIZE_ELCNT_MASK GENMASK(15, 0) /** * struct k3_ring_rt_regs - The RA realtime Control/Status Registers region @@ -43,7 +45,13 @@ struct k3_ring_rt_regs { u32 hwindx; }; -#define K3_RINGACC_RT_REGS_STEP 0x1000 +#define K3_RINGACC_RT_REGS_STEP 0x1000 +#define K3_DMARING_RT_REGS_STEP 0x2000 +#define K3_DMARING_RT_REGS_REVERSE_OFS 0x1000 +#define K3_RINGACC_RT_OCC_MASK GENMASK(20, 0) +#define K3_DMARING_RT_OCC_TDOWN_COMPLETE BIT(31) +#define K3_DMARING_RT_DB_ENTRY_MASK GENMASK(7, 0) +#define K3_DMARING_RT_DB_TDOWN_ACK BIT(31) /** * struct k3_ring_fifo_regs - The Ring Accelerator Queues Registers region @@ -122,6 +130,7 @@ struct k3_ring_state { u32 occ; u32 windex; u32 rindex; + u32 tdown_complete:1; }; /** @@ -143,6 +152,7 @@ struct k3_ring_state { * @use_count: Use count for shared rings * @proxy_id: RA Ring Proxy Id (only if @K3_RINGACC_RING_USE_PROXY) * @dma_dev: device to be used for DMA API (allocation, mapping) + * @asel: Address Space Select value for physical addresses */ struct k3_ring { struct k3_ring_rt_regs __iomem *rt; @@ -157,12 +167,15 @@ struct k3_ring { u32 flags; #define K3_RING_FLAG_BUSY BIT(1) #define K3_RING_FLAG_SHARED BIT(2) +#define K3_RING_FLAG_REVERSE BIT(3) struct k3_ring_state state; u32 ring_id; struct k3_ringacc *parent; u32 use_count; int proxy_id; struct device *dma_dev; + u32 asel; +#define K3_ADDRESS_ASEL_SHIFT 48 }; struct k3_ringacc_ops { @@ -188,6 +201,7 @@ struct k3_ringacc_ops { * @tisci_ring_ops: ti-sci rings ops * @tisci_dev_id: ti-sci device id * @ops: SoC specific ringacc operation + * @dma_rings: indicate DMA ring (dual ring within BCDMA/PKTDMA) */ struct k3_ringacc { struct device *dev; @@ -210,6 +224,7 @@ struct k3_ringacc { u32 tisci_dev_id; const struct k3_ringacc_ops *ops; + bool dma_rings; }; /** @@ -221,6 +236,21 @@ struct k3_ringacc_soc_data { unsigned dma_ring_reset_quirk:1; }; +static int k3_ringacc_ring_read_occ(struct k3_ring *ring) +{ + return readl(&ring->rt->occ) & K3_RINGACC_RT_OCC_MASK; +} + +static void k3_ringacc_ring_update_occ(struct k3_ring *ring) +{ + u32 val; + + val = readl(&ring->rt->occ); + + ring->state.occ = val & K3_RINGACC_RT_OCC_MASK; + ring->state.tdown_complete = !!(val & K3_DMARING_RT_OCC_TDOWN_COMPLETE); +} + static long k3_ringacc_ring_get_fifo_pos(struct k3_ring *ring) { return K3_RINGACC_FIFO_WINDOW_SIZE_BYTES - @@ -234,12 +264,24 @@ static void *k3_ringacc_get_elm_addr(struct k3_ring *ring, u32 idx) static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem); static int k3_ringacc_ring_pop_mem(struct k3_ring *ring, void *elem); +static int k3_dmaring_fwd_pop(struct k3_ring *ring, void *elem); +static int k3_dmaring_reverse_pop(struct k3_ring *ring, void *elem); static struct k3_ring_ops 
k3_ring_mode_ring_ops = { .push_tail = k3_ringacc_ring_push_mem, .pop_head = k3_ringacc_ring_pop_mem, }; +static struct k3_ring_ops k3_dmaring_fwd_ops = { + .push_tail = k3_ringacc_ring_push_mem, + .pop_head = k3_dmaring_fwd_pop, +}; + +static struct k3_ring_ops k3_dmaring_reverse_ops = { + /* Reverse side of the DMA ring can only be popped by SW */ + .pop_head = k3_dmaring_reverse_pop, +}; + static int k3_ringacc_ring_push_io(struct k3_ring *ring, void *elem); static int k3_ringacc_ring_pop_io(struct k3_ring *ring, void *elem); static int k3_ringacc_ring_push_head_io(struct k3_ring *ring, void *elem); @@ -342,6 +384,40 @@ error: } EXPORT_SYMBOL_GPL(k3_ringacc_request_ring); +static int k3_dmaring_request_dual_ring(struct k3_ringacc *ringacc, int fwd_id, + struct k3_ring **fwd_ring, + struct k3_ring **compl_ring) +{ + int ret = 0; + + /* + * DMA rings must be requested by ID, completion ring is the reverse + * side of the forward ring + */ + if (fwd_id < 0) + return -EINVAL; + + mutex_lock(&ringacc->req_lock); + + if (test_bit(fwd_id, ringacc->rings_inuse)) { + ret = -EBUSY; + goto error; + } + + *fwd_ring = &ringacc->rings[fwd_id]; + *compl_ring = &ringacc->rings[fwd_id + ringacc->num_rings]; + set_bit(fwd_id, ringacc->rings_inuse); + ringacc->rings[fwd_id].use_count++; + dev_dbg(ringacc->dev, "Giving ring#%d\n", fwd_id); + + mutex_unlock(&ringacc->req_lock); + return 0; + +error: + mutex_unlock(&ringacc->req_lock); + return ret; +} + int k3_ringacc_request_rings_pair(struct k3_ringacc *ringacc, int fwd_id, int compl_id, struct k3_ring **fwd_ring, @@ -352,6 +428,10 @@ int k3_ringacc_request_rings_pair(struct k3_ringacc *ringacc, if (!fwd_ring || !compl_ring) return -EINVAL; + if (ringacc->dma_rings) + return k3_dmaring_request_dual_ring(ringacc, fwd_id, + fwd_ring, compl_ring); + *fwd_ring = k3_ringacc_request_ring(ringacc, fwd_id, 0); if (!(*fwd_ring)) return -ENODEV; @@ -421,7 +501,7 @@ void k3_ringacc_ring_reset_dma(struct k3_ring *ring, u32 occ) goto reset; if (!occ) - occ = readl(&ring->rt->occ); + occ = k3_ringacc_ring_read_occ(ring); if (occ) { u32 db_ring_cnt, db_ring_cnt_cur; @@ -496,6 +576,13 @@ int k3_ringacc_ring_free(struct k3_ring *ring) ringacc = ring->parent; + /* + * DMA rings: rings shared memory and configuration, only forward ring + * is configured and reverse ring considered as slave. 
+ */ + if (ringacc->dma_rings && (ring->flags & K3_RING_FLAG_REVERSE)) + return 0; + dev_dbg(ring->parent->dev, "flags: 0x%08x\n", ring->flags); if (!test_bit(ring->ring_id, ringacc->rings_inuse)) @@ -517,6 +604,8 @@ int k3_ringacc_ring_free(struct k3_ring *ring) ring->flags = 0; ring->ops = NULL; ring->dma_dev = NULL; + ring->asel = 0; + if (ring->proxy_id != K3_RINGACC_PROXY_NOT_USED) { clear_bit(ring->proxy_id, ringacc->proxy_inuse); ring->proxy = NULL; @@ -581,6 +670,7 @@ static int k3_ringacc_ring_cfg_sci(struct k3_ring *ring) ring_cfg.count = ring->size; ring_cfg.mode = ring->mode; ring_cfg.size = ring->elm_size; + ring_cfg.asel = ring->asel; ret = ringacc->tisci_ring_ops->set_cfg(ringacc->tisci, &ring_cfg); if (ret) @@ -590,6 +680,90 @@ static int k3_ringacc_ring_cfg_sci(struct k3_ring *ring) return ret; } +static int k3_dmaring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) +{ + struct k3_ringacc *ringacc; + struct k3_ring *reverse_ring; + int ret = 0; + + if (cfg->elm_size != K3_RINGACC_RING_ELSIZE_8 || + cfg->mode != K3_RINGACC_RING_MODE_RING || + cfg->size & ~K3_DMARING_CFG_RING_SIZE_ELCNT_MASK) + return -EINVAL; + + ringacc = ring->parent; + + /* + * DMA rings: rings shared memory and configuration, only forward ring + * is configured and reverse ring considered as slave. + */ + if (ringacc->dma_rings && (ring->flags & K3_RING_FLAG_REVERSE)) + return 0; + + if (!test_bit(ring->ring_id, ringacc->rings_inuse)) + return -EINVAL; + + ring->size = cfg->size; + ring->elm_size = cfg->elm_size; + ring->mode = cfg->mode; + ring->asel = cfg->asel; + ring->dma_dev = cfg->dma_dev; + if (!ring->dma_dev) { + dev_warn(ringacc->dev, "dma_dev is not provided for ring%d\n", + ring->ring_id); + ring->dma_dev = ringacc->dev; + } + + memset(&ring->state, 0, sizeof(ring->state)); + + ring->ops = &k3_dmaring_fwd_ops; + + ring->ring_mem_virt = dma_alloc_coherent(ring->dma_dev, + ring->size * (4 << ring->elm_size), + &ring->ring_mem_dma, GFP_KERNEL); + if (!ring->ring_mem_virt) { + dev_err(ringacc->dev, "Failed to alloc ring mem\n"); + ret = -ENOMEM; + goto err_free_ops; + } + + ret = k3_ringacc_ring_cfg_sci(ring); + if (ret) + goto err_free_mem; + + ring->flags |= K3_RING_FLAG_BUSY; + + k3_ringacc_ring_dump(ring); + + /* DMA rings: configure reverse ring */ + reverse_ring = &ringacc->rings[ring->ring_id + ringacc->num_rings]; + reverse_ring->size = cfg->size; + reverse_ring->elm_size = cfg->elm_size; + reverse_ring->mode = cfg->mode; + reverse_ring->asel = cfg->asel; + memset(&reverse_ring->state, 0, sizeof(reverse_ring->state)); + reverse_ring->ops = &k3_dmaring_reverse_ops; + + reverse_ring->ring_mem_virt = ring->ring_mem_virt; + reverse_ring->ring_mem_dma = ring->ring_mem_dma; + reverse_ring->flags |= K3_RING_FLAG_BUSY; + k3_ringacc_ring_dump(reverse_ring); + + return 0; + +err_free_mem: + dma_free_coherent(ring->dma_dev, + ring->size * (4 << ring->elm_size), + ring->ring_mem_virt, + ring->ring_mem_dma); +err_free_ops: + ring->ops = NULL; + ring->proxy = NULL; + ring->dma_dev = NULL; + ring->asel = 0; + return ret; +} + int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) { struct k3_ringacc *ringacc; @@ -597,8 +771,12 @@ int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) if (!ring || !cfg) return -EINVAL; + ringacc = ring->parent; + if (ringacc->dma_rings) + return k3_dmaring_cfg(ring, cfg); + if (cfg->elm_size > K3_RINGACC_RING_ELSIZE_256 || cfg->mode >= K3_RINGACC_RING_MODE_INVALID || cfg->size & ~K3_RINGACC_CFG_RING_SIZE_ELCNT_MASK || @@ -705,7 +883,7 
@@ u32 k3_ringacc_ring_get_free(struct k3_ring *ring) return -EINVAL; if (!ring->state.free) - ring->state.free = ring->size - readl(&ring->rt->occ); + ring->state.free = ring->size - k3_ringacc_ring_read_occ(ring); return ring->state.free; } @@ -716,7 +894,7 @@ u32 k3_ringacc_ring_get_occ(struct k3_ring *ring) if (!ring || !(ring->flags & K3_RING_FLAG_BUSY)) return -EINVAL; - return readl(&ring->rt->occ); + return k3_ringacc_ring_read_occ(ring); } EXPORT_SYMBOL_GPL(k3_ringacc_ring_get_occ); @@ -892,6 +1070,72 @@ static int k3_ringacc_ring_pop_tail_io(struct k3_ring *ring, void *elem) K3_RINGACC_ACCESS_MODE_POP_HEAD); } +/* + * The element is 48 bits of address + ASEL bits in the ring. + * ASEL is used by the DMAs and should be removed for the kernel as it is not + * part of the physical memory address. + */ +static void k3_dmaring_remove_asel_from_elem(u64 *elem) +{ + *elem &= GENMASK_ULL(K3_ADDRESS_ASEL_SHIFT - 1, 0); +} + +static int k3_dmaring_fwd_pop(struct k3_ring *ring, void *elem) +{ + void *elem_ptr; + u32 elem_idx; + + /* + * DMA rings: forward ring is always tied DMA channel and HW does not + * maintain any state data required for POP operation and its unknown + * how much elements were consumed by HW. So, to actually + * do POP, the read pointer has to be recalculated every time. + */ + ring->state.occ = k3_ringacc_ring_read_occ(ring); + if (ring->state.windex >= ring->state.occ) + elem_idx = ring->state.windex - ring->state.occ; + else + elem_idx = ring->size - (ring->state.occ - ring->state.windex); + + elem_ptr = k3_ringacc_get_elm_addr(ring, elem_idx); + memcpy(elem, elem_ptr, (4 << ring->elm_size)); + k3_dmaring_remove_asel_from_elem(elem); + + ring->state.occ--; + writel(-1, &ring->rt->db); + + dev_dbg(ring->parent->dev, "%s: occ%d Windex%d Rindex%d pos_ptr%px\n", + __func__, ring->state.occ, ring->state.windex, elem_idx, + elem_ptr); + return 0; +} + +static int k3_dmaring_reverse_pop(struct k3_ring *ring, void *elem) +{ + void *elem_ptr; + + elem_ptr = k3_ringacc_get_elm_addr(ring, ring->state.rindex); + + if (ring->state.occ) { + memcpy(elem, elem_ptr, (4 << ring->elm_size)); + k3_dmaring_remove_asel_from_elem(elem); + + ring->state.rindex = (ring->state.rindex + 1) % ring->size; + ring->state.occ--; + writel(-1 & K3_DMARING_RT_DB_ENTRY_MASK, &ring->rt->db); + } else if (ring->state.tdown_complete) { + dma_addr_t *value = elem; + + *value = CPPI5_TDCM_MARKER; + writel(K3_DMARING_RT_DB_TDOWN_ACK, &ring->rt->db); + ring->state.tdown_complete = false; + } + + dev_dbg(ring->parent->dev, "%s: occ%d index%d pos_ptr%px\n", + __func__, ring->state.occ, ring->state.rindex, elem_ptr); + return 0; +} + static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem) { void *elem_ptr; @@ -899,6 +1143,11 @@ static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem) elem_ptr = k3_ringacc_get_elm_addr(ring, ring->state.windex); memcpy(elem_ptr, elem, (4 << ring->elm_size)); + if (ring->parent->dma_rings) { + u64 *addr = elem_ptr; + + *addr |= ((u64)ring->asel << K3_ADDRESS_ASEL_SHIFT); + } ring->state.windex = (ring->state.windex + 1) % ring->size; ring->state.free--; @@ -975,12 +1224,12 @@ int k3_ringacc_ring_pop(struct k3_ring *ring, void *elem) return -EINVAL; if (!ring->state.occ) - ring->state.occ = k3_ringacc_ring_get_occ(ring); + k3_ringacc_ring_update_occ(ring); dev_dbg(ring->parent->dev, "ring_pop: occ%d index%d\n", ring->state.occ, ring->state.rindex); - if (!ring->state.occ) + if (!ring->state.occ && !ring->state.tdown_complete) return -ENODATA; if 
(ring->ops && ring->ops->pop_head) @@ -998,7 +1247,7 @@ int k3_ringacc_ring_pop_tail(struct k3_ring *ring, void *elem) return -EINVAL; if (!ring->state.occ) - ring->state.occ = k3_ringacc_ring_get_occ(ring); + k3_ringacc_ring_update_occ(ring); dev_dbg(ring->parent->dev, "ring_pop_tail: occ%d index%d\n", ring->state.occ, ring->state.rindex); @@ -1203,6 +1452,68 @@ static const struct of_device_id k3_ringacc_of_match[] = { {}, }; +struct k3_ringacc *k3_ringacc_dmarings_init(struct platform_device *pdev, + struct k3_ringacc_init_data *data) +{ + struct device *dev = &pdev->dev; + struct k3_ringacc *ringacc; + void __iomem *base_rt; + struct resource *res; + int i; + + ringacc = devm_kzalloc(dev, sizeof(*ringacc), GFP_KERNEL); + if (!ringacc) + return ERR_PTR(-ENOMEM); + + ringacc->dev = dev; + ringacc->dma_rings = true; + ringacc->num_rings = data->num_rings; + ringacc->tisci = data->tisci; + ringacc->tisci_dev_id = data->tisci_dev_id; + + mutex_init(&ringacc->req_lock); + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ringrt"); + base_rt = devm_ioremap_resource(dev, res); + if (IS_ERR(base_rt)) + return base_rt; + + ringacc->rings = devm_kzalloc(dev, + sizeof(*ringacc->rings) * + ringacc->num_rings * 2, + GFP_KERNEL); + ringacc->rings_inuse = devm_kcalloc(dev, + BITS_TO_LONGS(ringacc->num_rings), + sizeof(unsigned long), GFP_KERNEL); + + if (!ringacc->rings || !ringacc->rings_inuse) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < ringacc->num_rings; i++) { + struct k3_ring *ring = &ringacc->rings[i]; + + ring->rt = base_rt + K3_DMARING_RT_REGS_STEP * i; + ring->parent = ringacc; + ring->ring_id = i; + ring->proxy_id = K3_RINGACC_PROXY_NOT_USED; + + ring = &ringacc->rings[ringacc->num_rings + i]; + ring->rt = base_rt + K3_DMARING_RT_REGS_STEP * i + + K3_DMARING_RT_REGS_REVERSE_OFS; + ring->parent = ringacc; + ring->ring_id = i; + ring->proxy_id = K3_RINGACC_PROXY_NOT_USED; + ring->flags = K3_RING_FLAG_REVERSE; + } + + ringacc->tisci_ring_ops = &ringacc->tisci->ops.rm_ring_ops; + + dev_info(dev, "Number of rings: %u\n", ringacc->num_rings); + + return ringacc; +} +EXPORT_SYMBOL_GPL(k3_ringacc_dmarings_init); + static int k3_ringacc_probe(struct platform_device *pdev) { const struct ringacc_match_data *match_data; diff --git a/include/linux/soc/ti/k3-ringacc.h b/include/linux/soc/ti/k3-ringacc.h index 658dc71d2901..39b022b92598 100644 --- a/include/linux/soc/ti/k3-ringacc.h +++ b/include/linux/soc/ti/k3-ringacc.h @@ -70,6 +70,7 @@ struct k3_ring; * @dma_dev: Master device which is using and accessing to the ring * memory when the mode is K3_RINGACC_RING_MODE_RING. Memory allocations * should be done using this device. 
+ * @asel: Address Space Select value for physical addresses */ struct k3_ring_cfg { u32 size; @@ -79,6 +80,7 @@ struct k3_ring_cfg { u32 flags; struct device *dma_dev; + u32 asel; }; #define K3_RINGACC_RING_ID_ANY (-1) @@ -250,4 +252,19 @@ int k3_ringacc_ring_pop_tail(struct k3_ring *ring, void *elem); u32 k3_ringacc_get_tisci_dev_id(struct k3_ring *ring); +/* DMA ring support */ +struct ti_sci_handle; + +/** + * struct struct k3_ringacc_init_data - Initialization data for DMA rings + */ +struct k3_ringacc_init_data { + const struct ti_sci_handle *tisci; + u32 tisci_dev_id; + u32 num_rings; +}; + +struct k3_ringacc *k3_ringacc_dmarings_init(struct platform_device *pdev, + struct k3_ringacc_init_data *data); + #endif /* __SOC_TI_K3_RINGACC_API_H_ */ -- cgit From 5b65781d06ea90ef2f8e51a13352c43c3daa8cdc Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Tue, 8 Dec 2020 11:04:40 +0200 Subject: dmaengine: ti: k3-udma-glue: Add support for K3 PKTDMA This commit adds support for PKTDMA in k3-udma glue driver. Use new psil_endpoint_config struct to get static data for a given channel or a flow during setup. Make sure that the RX flows being mapped to a RX channel is within the range of flows that is been allocated to that RX channel. Signed-off-by: Vignesh Raghavendra Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201208090440.31792-21-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma-glue.c | 291 ++++++++++++++++++++++++++++++++++----- drivers/dma/ti/k3-udma-private.c | 24 ++++ drivers/dma/ti/k3-udma.h | 4 + include/linux/dma/k3-udma-glue.h | 8 ++ 4 files changed, 289 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c index e6ebcd98c02a..4fdd9f06b723 100644 --- a/drivers/dma/ti/k3-udma-glue.c +++ b/drivers/dma/ti/k3-udma-glue.c @@ -22,6 +22,7 @@ struct k3_udma_glue_common { struct device *dev; + struct device chan_dev; struct udma_dev *udmax; const struct udma_tisci_rm *tisci_rm; struct k3_ringacc *ringacc; @@ -32,7 +33,8 @@ struct k3_udma_glue_common { bool epib; u32 psdata_size; u32 swdata_size; - u32 atype; + u32 atype_asel; + struct psil_endpoint_config *ep_config; }; struct k3_udma_glue_tx_channel { @@ -53,6 +55,8 @@ struct k3_udma_glue_tx_channel { bool tx_filt_einfo; bool tx_filt_pswords; bool tx_supr_tdpkt; + + int udma_tflow_id; }; struct k3_udma_glue_rx_flow { @@ -81,6 +85,16 @@ struct k3_udma_glue_rx_channel { u32 flows_ready; }; +static void k3_udma_chan_dev_release(struct device *dev) +{ + /* The struct containing the device is devm managed */ +} + +static struct class k3_udma_glue_devclass = { + .name = "k3_udma_glue_chan", + .dev_release = k3_udma_chan_dev_release, +}; + #define K3_UDMAX_TDOWN_TIMEOUT_US 1000 static int of_k3_udma_glue_parse(struct device_node *udmax_np, @@ -100,7 +114,6 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np, const char *name, struct k3_udma_glue_common *common, bool tx_chn) { - struct psil_endpoint_config *ep_config; struct of_phandle_args dma_spec; u32 thread_id; int ret = 0; @@ -117,15 +130,26 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np, &dma_spec)) return -ENOENT; + ret = of_k3_udma_glue_parse(dma_spec.np, common); + if (ret) + goto out_put_spec; + thread_id = dma_spec.args[0]; if (dma_spec.args_count == 2) { - if (dma_spec.args[1] > 2) { + if (dma_spec.args[1] > 2 && !xudma_is_pktdma(common->udmax)) { dev_err(common->dev, "Invalid channel atype: %u\n", dma_spec.args[1]); ret = 
-EINVAL; goto out_put_spec; } - common->atype = dma_spec.args[1]; + if (dma_spec.args[1] > 15 && xudma_is_pktdma(common->udmax)) { + dev_err(common->dev, "Invalid channel asel: %u\n", + dma_spec.args[1]); + ret = -EINVAL; + goto out_put_spec; + } + + common->atype_asel = dma_spec.args[1]; } if (tx_chn && !(thread_id & K3_PSIL_DST_THREAD_ID_OFFSET)) { @@ -139,25 +163,23 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np, } /* get psil endpoint config */ - ep_config = psil_get_ep_config(thread_id); - if (IS_ERR(ep_config)) { + common->ep_config = psil_get_ep_config(thread_id); + if (IS_ERR(common->ep_config)) { dev_err(common->dev, "No configuration for psi-l thread 0x%04x\n", thread_id); - ret = PTR_ERR(ep_config); + ret = PTR_ERR(common->ep_config); goto out_put_spec; } - common->epib = ep_config->needs_epib; - common->psdata_size = ep_config->psd_size; + common->epib = common->ep_config->needs_epib; + common->psdata_size = common->ep_config->psd_size; if (tx_chn) common->dst_thread = thread_id; else common->src_thread = thread_id; - ret = of_k3_udma_glue_parse(dma_spec.np, common); - out_put_spec: of_node_put(dma_spec.np); return ret; @@ -223,7 +245,7 @@ static int k3_udma_glue_cfg_tx_chn(struct k3_udma_glue_tx_channel *tx_chn) req.tx_supr_tdpkt = 1; req.tx_fetch_size = tx_chn->common.hdesc_size >> 2; req.txcq_qnum = k3_ringacc_get_ring_id(tx_chn->ringtxcq); - req.tx_atype = tx_chn->common.atype; + req.tx_atype = tx_chn->common.atype_asel; return tisci_rm->tisci_udmap_ops->tx_ch_cfg(tisci_rm->tisci, &req); } @@ -255,8 +277,14 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, tx_chn->common.psdata_size, tx_chn->common.swdata_size); + if (xudma_is_pktdma(tx_chn->common.udmax)) + tx_chn->udma_tchan_id = tx_chn->common.ep_config->mapped_channel_id; + else + tx_chn->udma_tchan_id = -1; + /* request and cfg UDMAP TX channel */ - tx_chn->udma_tchanx = xudma_tchan_get(tx_chn->common.udmax, -1); + tx_chn->udma_tchanx = xudma_tchan_get(tx_chn->common.udmax, + tx_chn->udma_tchan_id); if (IS_ERR(tx_chn->udma_tchanx)) { ret = PTR_ERR(tx_chn->udma_tchanx); dev_err(dev, "UDMAX tchanx get err %d\n", ret); @@ -264,11 +292,34 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, } tx_chn->udma_tchan_id = xudma_tchan_get_id(tx_chn->udma_tchanx); + tx_chn->common.chan_dev.class = &k3_udma_glue_devclass; + tx_chn->common.chan_dev.parent = xudma_get_device(tx_chn->common.udmax); + dev_set_name(&tx_chn->common.chan_dev, "tchan%d-0x%04x", + tx_chn->udma_tchan_id, tx_chn->common.dst_thread); + ret = device_register(&tx_chn->common.chan_dev); + if (ret) { + dev_err(dev, "Channel Device registration failed %d\n", ret); + tx_chn->common.chan_dev.parent = NULL; + goto err; + } + + if (xudma_is_pktdma(tx_chn->common.udmax)) { + /* prepare the channel device as coherent */ + tx_chn->common.chan_dev.dma_coherent = true; + dma_coerce_mask_and_coherent(&tx_chn->common.chan_dev, + DMA_BIT_MASK(48)); + } + atomic_set(&tx_chn->free_pkts, cfg->txcq_cfg.size); + if (xudma_is_pktdma(tx_chn->common.udmax)) + tx_chn->udma_tflow_id = tx_chn->common.ep_config->default_flow_id; + else + tx_chn->udma_tflow_id = tx_chn->udma_tchan_id; + /* request and cfg rings */ ret = k3_ringacc_request_rings_pair(tx_chn->common.ringacc, - tx_chn->udma_tchan_id, -1, + tx_chn->udma_tflow_id, -1, &tx_chn->ringtx, &tx_chn->ringtxcq); if (ret) { @@ -280,6 +331,12 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, cfg->tx_cfg.dma_dev = 
k3_udma_glue_tx_get_dma_device(tx_chn); cfg->txcq_cfg.dma_dev = cfg->tx_cfg.dma_dev; + /* Set the ASEL value for DMA rings of PKTDMA */ + if (xudma_is_pktdma(tx_chn->common.udmax)) { + cfg->tx_cfg.asel = tx_chn->common.atype_asel; + cfg->txcq_cfg.asel = tx_chn->common.atype_asel; + } + ret = k3_ringacc_ring_cfg(tx_chn->ringtx, &cfg->tx_cfg); if (ret) { dev_err(dev, "Failed to cfg ringtx %d\n", ret); @@ -331,6 +388,11 @@ void k3_udma_glue_release_tx_chn(struct k3_udma_glue_tx_channel *tx_chn) if (tx_chn->ringtx) k3_ringacc_ring_free(tx_chn->ringtx); + + if (tx_chn->common.chan_dev.parent) { + device_unregister(&tx_chn->common.chan_dev); + tx_chn->common.chan_dev.parent = NULL; + } } EXPORT_SYMBOL_GPL(k3_udma_glue_release_tx_chn); @@ -443,13 +505,10 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn, void *data, void (*cleanup)(void *data, dma_addr_t desc_dma)) { + struct device *dev = tx_chn->common.dev; dma_addr_t desc_dma; int occ_tx, i, ret; - /* reset TXCQ as it is not input for udma - expected to be empty */ - if (tx_chn->ringtxcq) - k3_ringacc_ring_reset(tx_chn->ringtxcq); - /* * TXQ reset need to be special way as it is input for udma and its * state cached by udma, so: @@ -458,17 +517,20 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn, * 3) reset TXQ in a special way */ occ_tx = k3_ringacc_ring_get_occ(tx_chn->ringtx); - dev_dbg(tx_chn->common.dev, "TX reset occ_tx %u\n", occ_tx); + dev_dbg(dev, "TX reset occ_tx %u\n", occ_tx); for (i = 0; i < occ_tx; i++) { ret = k3_ringacc_ring_pop(tx_chn->ringtx, &desc_dma); if (ret) { - dev_err(tx_chn->common.dev, "TX reset pop %d\n", ret); + if (ret != -ENODATA) + dev_err(dev, "TX reset pop %d\n", ret); break; } cleanup(data, desc_dma); } + /* reset TXCQ as it is not input for udma - expected to be empty */ + k3_ringacc_ring_reset(tx_chn->ringtxcq); k3_ringacc_ring_reset_dma(tx_chn->ringtx, occ_tx); } EXPORT_SYMBOL_GPL(k3_udma_glue_reset_tx_chn); @@ -487,7 +549,12 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_txcq_id); int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn) { - tx_chn->virq = k3_ringacc_get_ring_irq_num(tx_chn->ringtxcq); + if (xudma_is_pktdma(tx_chn->common.udmax)) { + tx_chn->virq = xudma_pktdma_tflow_get_irq(tx_chn->common.udmax, + tx_chn->udma_tflow_id); + } else { + tx_chn->virq = k3_ringacc_get_ring_irq_num(tx_chn->ringtxcq); + } return tx_chn->virq; } @@ -496,10 +563,36 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_irq); struct device * k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn) { + if (xudma_is_pktdma(tx_chn->common.udmax) && + (tx_chn->common.atype_asel == 14 || tx_chn->common.atype_asel == 15)) + return &tx_chn->common.chan_dev; + return xudma_get_device(tx_chn->common.udmax); } EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_dma_device); +void k3_udma_glue_tx_dma_to_cppi5_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(tx_chn->common.udmax) || + !tx_chn->common.atype_asel) + return; + + *addr |= (u64)tx_chn->common.atype_asel << K3_ADDRESS_ASEL_SHIFT; +} +EXPORT_SYMBOL_GPL(k3_udma_glue_tx_dma_to_cppi5_addr); + +void k3_udma_glue_tx_cppi5_to_dma_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(tx_chn->common.udmax) || + !tx_chn->common.atype_asel) + return; + + *addr &= (u64)GENMASK(K3_ADDRESS_ASEL_SHIFT - 1, 0); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_tx_cppi5_to_dma_addr); + static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) { const struct 
udma_tisci_rm *tisci_rm = rx_chn->common.tisci_rm; @@ -511,8 +604,6 @@ static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) req.valid_params = TI_SCI_MSG_VALUE_RM_UDMAP_CH_FETCH_SIZE_VALID | TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID | TI_SCI_MSG_VALUE_RM_UDMAP_CH_CHAN_TYPE_VALID | - TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID | - TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID | TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID; req.nav_id = tisci_rm->tisci_dev_id; @@ -524,13 +615,16 @@ static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) * req.rxcq_qnum = k3_ringacc_get_ring_id(rx_chn->flows[0].ringrx); */ req.rxcq_qnum = 0xFFFF; - if (rx_chn->flow_num && rx_chn->flow_id_base != rx_chn->udma_rchan_id) { + if (!xudma_is_pktdma(rx_chn->common.udmax) && rx_chn->flow_num && + rx_chn->flow_id_base != rx_chn->udma_rchan_id) { /* Default flow + extra ones */ + req.valid_params |= TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID | + TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID; req.flowid_start = rx_chn->flow_id_base; req.flowid_cnt = rx_chn->flow_num; } req.rx_chan_type = TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR; - req.rx_atype = rx_chn->common.atype; + req.rx_atype = rx_chn->common.atype_asel; ret = tisci_rm->tisci_udmap_ops->rx_ch_cfg(tisci_rm->tisci, &req); if (ret) @@ -584,10 +678,18 @@ static int k3_udma_glue_cfg_rx_flow(struct k3_udma_glue_rx_channel *rx_chn, goto err_rflow_put; } + if (xudma_is_pktdma(rx_chn->common.udmax)) { + rx_ringfdq_id = flow->udma_rflow_id + + xudma_get_rflow_ring_offset(rx_chn->common.udmax); + rx_ring_id = 0; + } else { + rx_ring_id = flow_cfg->ring_rxq_id; + rx_ringfdq_id = flow_cfg->ring_rxfdq0_id; + } + /* request and cfg rings */ ret = k3_ringacc_request_rings_pair(rx_chn->common.ringacc, - flow_cfg->ring_rxfdq0_id, - flow_cfg->ring_rxq_id, + rx_ringfdq_id, rx_ring_id, &flow->ringrxfdq, &flow->ringrx); if (ret) { @@ -599,6 +701,12 @@ static int k3_udma_glue_cfg_rx_flow(struct k3_udma_glue_rx_channel *rx_chn, flow_cfg->rx_cfg.dma_dev = k3_udma_glue_rx_get_dma_device(rx_chn); flow_cfg->rxfdq_cfg.dma_dev = flow_cfg->rx_cfg.dma_dev; + /* Set the ASEL value for DMA rings of PKTDMA */ + if (xudma_is_pktdma(rx_chn->common.udmax)) { + flow_cfg->rx_cfg.asel = rx_chn->common.atype_asel; + flow_cfg->rxfdq_cfg.asel = rx_chn->common.atype_asel; + } + ret = k3_ringacc_ring_cfg(flow->ringrx, &flow_cfg->rx_cfg); if (ret) { dev_err(dev, "Failed to cfg ringrx %d\n", ret); @@ -757,6 +865,7 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name, struct k3_udma_glue_rx_channel_cfg *cfg) { struct k3_udma_glue_rx_channel *rx_chn; + struct psil_endpoint_config *ep_cfg; int ret, i; if (cfg->flow_id_num <= 0) @@ -784,8 +893,16 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name, rx_chn->common.psdata_size, rx_chn->common.swdata_size); + ep_cfg = rx_chn->common.ep_config; + + if (xudma_is_pktdma(rx_chn->common.udmax)) + rx_chn->udma_rchan_id = ep_cfg->mapped_channel_id; + else + rx_chn->udma_rchan_id = -1; + /* request and cfg UDMAP RX channel */ - rx_chn->udma_rchanx = xudma_rchan_get(rx_chn->common.udmax, -1); + rx_chn->udma_rchanx = xudma_rchan_get(rx_chn->common.udmax, + rx_chn->udma_rchan_id); if (IS_ERR(rx_chn->udma_rchanx)) { ret = PTR_ERR(rx_chn->udma_rchanx); dev_err(dev, "UDMAX rchanx get err %d\n", ret); @@ -793,12 +910,48 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name, } rx_chn->udma_rchan_id = xudma_rchan_get_id(rx_chn->udma_rchanx); - rx_chn->flow_num = 
cfg->flow_id_num; - rx_chn->flow_id_base = cfg->flow_id_base; + rx_chn->common.chan_dev.class = &k3_udma_glue_devclass; + rx_chn->common.chan_dev.parent = xudma_get_device(rx_chn->common.udmax); + dev_set_name(&rx_chn->common.chan_dev, "rchan%d-0x%04x", + rx_chn->udma_rchan_id, rx_chn->common.src_thread); + ret = device_register(&rx_chn->common.chan_dev); + if (ret) { + dev_err(dev, "Channel Device registration failed %d\n", ret); + rx_chn->common.chan_dev.parent = NULL; + goto err; + } - /* Use RX channel id as flow id: target dev can't generate flow_id */ - if (cfg->flow_id_use_rxchan_id) - rx_chn->flow_id_base = rx_chn->udma_rchan_id; + if (xudma_is_pktdma(rx_chn->common.udmax)) { + /* prepare the channel device as coherent */ + rx_chn->common.chan_dev.dma_coherent = true; + dma_coerce_mask_and_coherent(&rx_chn->common.chan_dev, + DMA_BIT_MASK(48)); + } + + if (xudma_is_pktdma(rx_chn->common.udmax)) { + int flow_start = cfg->flow_id_base; + int flow_end; + + if (flow_start == -1) + flow_start = ep_cfg->flow_start; + + flow_end = flow_start + cfg->flow_id_num - 1; + if (flow_start < ep_cfg->flow_start || + flow_end > (ep_cfg->flow_start + ep_cfg->flow_num - 1)) { + dev_err(dev, "Invalid flow range requested\n"); + ret = -EINVAL; + goto err; + } + rx_chn->flow_id_base = flow_start; + } else { + rx_chn->flow_id_base = cfg->flow_id_base; + + /* Use RX channel id as flow id: target dev can't generate flow_id */ + if (cfg->flow_id_use_rxchan_id) + rx_chn->flow_id_base = rx_chn->udma_rchan_id; + } + + rx_chn->flow_num = cfg->flow_id_num; rx_chn->flows = devm_kcalloc(dev, rx_chn->flow_num, sizeof(*rx_chn->flows), GFP_KERNEL); @@ -888,6 +1041,24 @@ k3_udma_glue_request_remote_rx_chn(struct device *dev, const char *name, goto err; } + rx_chn->common.chan_dev.class = &k3_udma_glue_devclass; + rx_chn->common.chan_dev.parent = xudma_get_device(rx_chn->common.udmax); + dev_set_name(&rx_chn->common.chan_dev, "rchan_remote-0x%04x", + rx_chn->common.src_thread); + ret = device_register(&rx_chn->common.chan_dev); + if (ret) { + dev_err(dev, "Channel Device registration failed %d\n", ret); + rx_chn->common.chan_dev.parent = NULL; + goto err; + } + + if (xudma_is_pktdma(rx_chn->common.udmax)) { + /* prepare the channel device as coherent */ + rx_chn->common.chan_dev.dma_coherent = true; + dma_coerce_mask_and_coherent(&rx_chn->common.chan_dev, + DMA_BIT_MASK(48)); + } + ret = k3_udma_glue_allocate_rx_flows(rx_chn, cfg); if (ret) goto err; @@ -940,6 +1111,11 @@ void k3_udma_glue_release_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) if (!IS_ERR_OR_NULL(rx_chn->udma_rchanx)) xudma_rchan_put(rx_chn->common.udmax, rx_chn->udma_rchanx); + + if (rx_chn->common.chan_dev.parent) { + device_unregister(&rx_chn->common.chan_dev); + rx_chn->common.chan_dev.parent = NULL; + } } EXPORT_SYMBOL_GPL(k3_udma_glue_release_rx_chn); @@ -1151,12 +1327,10 @@ void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn, /* reset RXCQ as it is not input for udma - expected to be empty */ occ_rx = k3_ringacc_ring_get_occ(flow->ringrx); dev_dbg(dev, "RX reset flow %u occ_rx %u\n", flow_num, occ_rx); - if (flow->ringrx) - k3_ringacc_ring_reset(flow->ringrx); /* Skip RX FDQ in case one FDQ is used for the set of flows */ if (skip_fdq) - return; + goto do_reset; /* * RX FDQ reset need to be special way as it is input for udma and its @@ -1171,13 +1345,17 @@ void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn, for (i = 0; i < occ_rx; i++) { ret = k3_ringacc_ring_pop(flow->ringrxfdq, &desc_dma); if (ret) { - 
dev_err(dev, "RX reset pop %d\n", ret); + if (ret != -ENODATA) + dev_err(dev, "RX reset pop %d\n", ret); break; } cleanup(data, desc_dma); } k3_ringacc_ring_reset_dma(flow->ringrxfdq, occ_rx); + +do_reset: + k3_ringacc_ring_reset(flow->ringrx); } EXPORT_SYMBOL_GPL(k3_udma_glue_reset_rx_chn); @@ -1207,7 +1385,12 @@ int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn, flow = &rx_chn->flows[flow_num]; - flow->virq = k3_ringacc_get_ring_irq_num(flow->ringrx); + if (xudma_is_pktdma(rx_chn->common.udmax)) { + flow->virq = xudma_pktdma_rflow_get_irq(rx_chn->common.udmax, + flow->udma_rflow_id); + } else { + flow->virq = k3_ringacc_get_ring_irq_num(flow->ringrx); + } return flow->virq; } @@ -1216,6 +1399,38 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_irq); struct device * k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn) { + if (xudma_is_pktdma(rx_chn->common.udmax) && + (rx_chn->common.atype_asel == 14 || rx_chn->common.atype_asel == 15)) + return &rx_chn->common.chan_dev; + return xudma_get_device(rx_chn->common.udmax); } EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_dma_device); + +void k3_udma_glue_rx_dma_to_cppi5_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(rx_chn->common.udmax) || + !rx_chn->common.atype_asel) + return; + + *addr |= (u64)rx_chn->common.atype_asel << K3_ADDRESS_ASEL_SHIFT; +} +EXPORT_SYMBOL_GPL(k3_udma_glue_rx_dma_to_cppi5_addr); + +void k3_udma_glue_rx_cppi5_to_dma_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(rx_chn->common.udmax) || + !rx_chn->common.atype_asel) + return; + + *addr &= (u64)GENMASK(K3_ADDRESS_ASEL_SHIFT - 1, 0); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_rx_cppi5_to_dma_addr); + +static int __init k3_udma_glue_class_init(void) +{ + return class_register(&k3_udma_glue_devclass); +} +arch_initcall(k3_udma_glue_class_init); diff --git a/drivers/dma/ti/k3-udma-private.c b/drivers/dma/ti/k3-udma-private.c index 5436b19d656e..eb4795c089bd 100644 --- a/drivers/dma/ti/k3-udma-private.c +++ b/drivers/dma/ti/k3-udma-private.c @@ -157,3 +157,27 @@ void xudma_##res##rt_write(struct udma_##res *p, int reg, u32 val) \ EXPORT_SYMBOL(xudma_##res##rt_write) XUDMA_RT_IO_FUNCTIONS(tchan); XUDMA_RT_IO_FUNCTIONS(rchan); + +int xudma_is_pktdma(struct udma_dev *ud) +{ + return ud->match_data->type == DMA_TYPE_PKTDMA; +} +EXPORT_SYMBOL(xudma_is_pktdma); + +int xudma_pktdma_tflow_get_irq(struct udma_dev *ud, int udma_tflow_id) +{ + const struct udma_oes_offsets *oes = &ud->soc_data->oes; + + return ti_sci_inta_msi_get_virq(ud->dev, udma_tflow_id + + oes->pktdma_tchan_flow); +} +EXPORT_SYMBOL(xudma_pktdma_tflow_get_irq); + +int xudma_pktdma_rflow_get_irq(struct udma_dev *ud, int udma_rflow_id) +{ + const struct udma_oes_offsets *oes = &ud->soc_data->oes; + + return ti_sci_inta_msi_get_virq(ud->dev, udma_rflow_id + + oes->pktdma_rchan_flow); +} +EXPORT_SYMBOL(xudma_pktdma_rflow_get_irq); diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h index ccb19f286daf..d349c6d482ae 100644 --- a/drivers/dma/ti/k3-udma.h +++ b/drivers/dma/ti/k3-udma.h @@ -157,4 +157,8 @@ void xudma_rchanrt_write(struct udma_rchan *rchan, int reg, u32 val); bool xudma_rflow_is_gp(struct udma_dev *ud, int id); int xudma_get_rflow_ring_offset(struct udma_dev *ud); +int xudma_is_pktdma(struct udma_dev *ud); + +int xudma_pktdma_tflow_get_irq(struct udma_dev *ud, int udma_tflow_id); +int xudma_pktdma_rflow_get_irq(struct udma_dev *ud, int udma_rflow_id); #endif /* K3_UDMA_H_ */ diff --git 
a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index d7c12f31377c..e443be4d3b4b 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -43,6 +43,10 @@ u32 k3_udma_glue_tx_get_txcq_id(struct k3_udma_glue_tx_channel *tx_chn); int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn); struct device * k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn); +void k3_udma_glue_tx_dma_to_cppi5_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr); +void k3_udma_glue_tx_cppi5_to_dma_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr); enum { K3_UDMA_GLUE_SRC_TAG_LO_KEEP = 0, @@ -134,5 +138,9 @@ int k3_udma_glue_rx_flow_disable(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_idx); struct device * k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn); +void k3_udma_glue_rx_dma_to_cppi5_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr); +void k3_udma_glue_rx_cppi5_to_dma_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr); #endif /* K3_UDMA_GLUE_H_ */
-- cgit From e998879d4fb7991856916972168cf27c0d86ed12 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Thu, 10 Dec 2020 01:25:15 +0000 Subject: x86,swiotlb: Adjust SWIOTLB bounce buffer size for SEV guests For SEV, all DMA to and from the guest has to use shared (un-encrypted) pages. SEV uses SWIOTLB to make this happen without requiring changes to device drivers. However, depending on the workload being run, the default 64MB of SWIOTLB might not be enough and the guest may run out of buffers to use for DMA, resulting in I/O errors and/or performance degradation for high I/O workloads. Adjust the default size of SWIOTLB for SEV guests using a percentage of the total memory available to the guest for the SWIOTLB buffers. Add a new sev_setup_arch() function which is invoked from setup_arch() and calls into a new generic swiotlb function, swiotlb_adjust_size(), to do the SWIOTLB buffer adjustment.
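As a rough worked example of the sizing heuristic added below (figures are illustrative): a 4 GB SEV guest gets 4096 MB * 6 / 100 ~= 245 MB of SWIOTLB; a 512 MB guest computes ~30 MB, which clamp_val() raises back to the 64 MB default; and guests beyond roughly 17 GB saturate at the SZ_1G upper bound.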
v5 fixed build errors and warnings as Reported-by: kbuild test robot Signed-off-by: Ashish Kalra Co-developed-by: Borislav Petkov Signed-off-by: Borislav Petkov Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/mem_encrypt.h | 2 ++ arch/x86/kernel/setup.c | 6 ++++++ arch/x86/mm/mem_encrypt.c | 31 +++++++++++++++++++++++++++++++ include/linux/swiotlb.h | 8 ++++++++ kernel/dma/swiotlb.c | 20 ++++++++++++++++++-- 5 files changed, 65 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 2f62bbdd9d12..31c4df123aa0 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -37,6 +37,7 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); +void __init sev_setup_arch(void); void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); @@ -69,6 +70,7 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } +static inline void __init sev_setup_arch(void) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 84f581c91db4..874b2c17af41 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1054,6 +1054,12 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); + /* + * Needs to run after memblock setup because it needs the physical + * memory size. + */ + sev_setup_arch(); + reserve_bios_regions(); efi_fake_memmap(); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index bc0833713be9..c79e5736ab2b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -198,6 +198,37 @@ void __init sme_early_init(void) swiotlb_force = SWIOTLB_FORCE; } +void __init sev_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!sev_active()) + return; + + /* + * For SEV, all DMA has to occur via shared/unencrypted pages. + * SEV uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB for SEV guests using + * a percentage of guest memory for SWIOTLB buffers. + * Also, as the SWIOTLB bounce buffer memory is allocated + * from low memory, ensure that the adjusted size is within + * the limits of low available memory. 
+	 *
+	 * The percentage of guest memory used here for SWIOTLB buffers
+	 * is more of an approximation of the static adjustment, which
+	 * is 64MB for <1G, and ~128MB to 256MB for 1G-to-4G, i.e.,
+	 * roughly 6%.
+	 */
+	size = total_mem * 6 / 100;
+	size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+	swiotlb_adjust_size(size);
+}
+
static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
{
	pgprot_t old_prot, new_prot;

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index fbdc65782195..d9c9fc9ca5d2 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -30,6 +30,9 @@ enum swiotlb_force {
 */
#define IO_TLB_SHIFT 11

+/* default to 64MB */
+#define IO_TLB_DEFAULT_SIZE (64UL<<20)
+
extern void swiotlb_init(int verbose);
int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
extern unsigned long swiotlb_nr_tbl(void);
@@ -78,6 +81,7 @@ void __init swiotlb_exit(void);
unsigned int swiotlb_max_segment(void);
size_t swiotlb_max_mapping_size(struct device *dev);
bool is_swiotlb_active(void);
+void __init swiotlb_adjust_size(unsigned long new_size);
#else
#define swiotlb_force SWIOTLB_NO_FORCE
static inline bool is_swiotlb_buffer(phys_addr_t paddr)
@@ -100,6 +104,10 @@ static inline bool is_swiotlb_active(void)
{
	return false;
}
+
+static inline void swiotlb_adjust_size(unsigned long new_size)
+{
+}
#endif /* CONFIG_SWIOTLB */

extern void swiotlb_print_info(void);

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 781b9dca197c..7c42df6e6100 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -152,8 +152,6 @@ void swiotlb_set_max_segment(unsigned int val)
	max_segment = rounddown(val, PAGE_SIZE);
}

-/* default to 64MB */
-#define IO_TLB_DEFAULT_SIZE (64UL<<20)
unsigned long swiotlb_size_or_default(void)
{
	unsigned long size;
@@ -163,6 +161,24 @@ unsigned long swiotlb_size_or_default(void)
	return size ? size : (IO_TLB_DEFAULT_SIZE);
}

+void __init swiotlb_adjust_size(unsigned long new_size)
+{
+	unsigned long size;
+
+	/*
+	 * If the swiotlb parameter has not been specified, give
+	 * architectures such as those supporting memory encryption a
+	 * chance to adjust/expand the SWIOTLB size for their use.
+	 */
+	if (!io_tlb_nslabs) {
+		size = ALIGN(new_size, 1 << IO_TLB_SHIFT);
+		io_tlb_nslabs = size >> IO_TLB_SHIFT;
+		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+
+		pr_info("SWIOTLB bounce buffer size adjusted to %luMB\n", size >> 20);
+	}
+}
+
void swiotlb_print_info(void)
{
	unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
-- cgit
From 14dc3983b5dff513a90bd5a8cc90acaf7867c3d0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Fri, 11 Dec 2020 13:36:38 -0800
Subject: kbuild: avoid static_assert for genksyms

genksyms does not know or care about the _Static_assert() built-in, and sometimes falls back to ignoring the later symbols, which causes build errors and warnings such as:

WARNING: modpost: EXPORT symbol "ethtool_set_ethtool_phy_ops" [vmlinux] version generation failed, symbol will not be versioned.
ld: net/ethtool/common.o: relocation R_AARCH64_ABS32 against `__crc_ethtool_set_ethtool_phy_ops' can not be used when making a shared object
net/ethtool/common.o:(_ftrace_annotated_branch+0x0): dangerous relocation: unsupported relocation

Redefine static_assert for genksyms to avoid that.
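Concretely, the guard makes the assertion vanish for genksyms while real compilers still check it; a minimal sketch (illustrative only, reusing the static_assert macros from the diff that follows):

  #ifdef __GENKSYMS__
  #define _Static_assert(expr, ...)	/* expands to nothing for genksyms */
  #endif

  #define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr)
  #define __static_assert(expr, msg, ...) _Static_assert(expr, msg)

  /* A compiler sees a real compile-time check here; genksyms sees an
   * empty expansion and keeps parsing the symbols that follow. */
  static_assert(sizeof(long) >= 4, "long must be at least 32 bits");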
Link: https://lkml.kernel.org/r/20201203230955.1482058-1-arnd@kernel.org
Signed-off-by: Arnd Bergmann
Suggested-by: Ard Biesheuvel
Cc: Masahiro Yamada
Cc: Michal Marek
Cc: Kees Cook
Cc: Rikard Falkeborn
Cc: Marco Elver
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/build_bug.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h
index e3a0be2c90ad..7bb66e15b481 100644
--- a/include/linux/build_bug.h
+++ b/include/linux/build_bug.h
@@ -77,4 +77,9 @@
#define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr)
#define __static_assert(expr, msg, ...) _Static_assert(expr, msg)

+#ifdef __GENKSYMS__
+/* genksyms gets confused by _Static_assert */
+#define _Static_assert(expr, ...)
+#endif
+
#endif /* _LINUX_BUILD_BUG_H */
-- cgit
From 6e7b64b9dd6d96537d816ea07ec26b7dedd397b9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Fri, 11 Dec 2020 13:36:46 -0800
Subject: elfcore: fix building with clang

kernel/elfcore.c only contains weak symbols, which triggers a bug with clang in combination with recordmcount:

  Cannot find symbol for section 2: .text.
  kernel/elfcore.o: failed

Move the empty stubs into linux/elfcore.h as inline functions. As only two architectures use these, just use the architecture-specific Kconfig symbols to key off the declaration.

Link: https://lkml.kernel.org/r/20201204165742.3815221-2-arnd@kernel.org
Signed-off-by: Arnd Bergmann
Cc: Nathan Chancellor
Cc: Nick Desaulniers
Cc: Barret Rhoden
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/elfcore.h | 22 ++++++++++++++++++++++
 kernel/Makefile | 1 -
 kernel/elfcore.c | 26 --------------------------
 3 files changed, 22 insertions(+), 27 deletions(-)
 delete mode 100644 kernel/elfcore.c

(limited to 'include/linux')

diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h
index 46c3d691f677..de51c1bef27d 100644
--- a/include/linux/elfcore.h
+++ b/include/linux/elfcore.h
@@ -104,6 +104,7 @@ static inline int elf_core_copy_task_fpregs(struct task_struct *t, struct pt_reg
#endif
}

+#if defined(CONFIG_UM) || defined(CONFIG_IA64)
/*
 * These functions parameterize elf_core_dump in fs/binfmt_elf.c to write out
 * extra segments containing the gate DSO contents.
Dumping its
@@ -118,5 +119,26 @@
elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset);
extern int elf_core_write_extra_data(struct coredump_params *cprm);
extern size_t elf_core_extra_data_size(void);
+#else
+static inline Elf_Half elf_core_extra_phdrs(void)
+{
+	return 0;
+}
+
+static inline int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
+{
+	return 1;
+}
+
+static inline int elf_core_write_extra_data(struct coredump_params *cprm)
+{
+	return 1;
+}
+
+static inline size_t elf_core_extra_data_size(void)
+{
+	return 0;
+}
+#endif

#endif /* _LINUX_ELFCORE_H */

diff --git a/kernel/Makefile b/kernel/Makefile
index af601b9bda0e..6c9f19911be0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -97,7 +97,6 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_ELFCORE) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/

diff --git a/kernel/elfcore.c b/kernel/elfcore.c
deleted file mode 100644
index 57fb4dcff434..000000000000
--- a/kernel/elfcore.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include
-#include
-#include
-#include
-#include
-
-Elf_Half __weak elf_core_extra_phdrs(void)
-{
-	return 0;
-}
-
-int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
-{
-	return 1;
-}
-
-int __weak elf_core_write_extra_data(struct coredump_params *cprm)
-{
-	return 1;
-}
-
-size_t __weak elf_core_extra_data_size(void)
-{
-	return 0;
-}
-- cgit
From 2ab695aa8eb8f3226f68a2b91fc6103b56fcb57d Mon Sep 17 00:00:00 2001
From: Saravana Kannan
Date: Fri, 11 Dec 2020 12:26:29 -0800
Subject: ACPI: Use fwnode_init() to set up fwnode

Commit 01bb86b380a3 ("driver core: Add fwnode_init()") was supposed to fix up all instances of fwnode creation to use fwnode_init(). But it looks like this instance was missed. This causes a NULL pointer dereference during device_add() [1]. So, fix it.

[ 60.792324][ T1] Call trace:
[ 60.795495][ T1] device_add+0xf60/0x16b0
 __fw_devlink_link_to_consumers at drivers/base/core.c:1583
 (inlined by) fw_devlink_link_device at drivers/base/core.c:1726
 (inlined by) device_add at drivers/base/core.c:3088
[ 60.799813][ T1] platform_device_add+0x274/0x628
[ 60.804833][ T1] acpi_iort_init+0x9d8/0xc50
[ 60.809415][ T1] acpi_init+0x45c/0x4e8
[ 60.813556][ T1] do_one_initcall+0x170/0xb70
[ 60.818224][ T1] kernel_init_freeable+0x6a8/0x734
[ 60.823332][ T1] kernel_init+0x18/0x12c
[ 60.827566][ T1] ret_from_fork+0x10/0x1c
[ 60.838756][ T1] ---[ end trace fa5c8ce17a226d83 ]---

[1] - https://lore.kernel.org/linux-arm-kernel/02e7047071f0b54b046ac472adeeb3fafabc643c.camel@redhat.com/

Fixes: 01bb86b380a3 ("driver core: Add fwnode_init()")
Reported-by: Qian Cai
Suggested-by: Robin Murphy
Tested-by: Marc Zyngier
Acked-by: Rafael J.
Wysocki Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201211202629.2164655-1-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/acpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 39263c6b52e1..2630c2e953f7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -55,7 +55,7 @@ static inline struct fwnode_handle *acpi_alloc_fwnode_static(void) if (!fwnode) return NULL; - fwnode->ops = &acpi_static_fwnode_ops; + fwnode_init(fwnode, &acpi_static_fwnode_ops); return fwnode; } -- cgit From 98b89b649fce39dacb9dc036d6d0fdb8caff73f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 16:03:01 -0600 Subject: signal: kill JOBCTL_TASK_WORK It's no longer used, get rid of it. Signed-off-by: Jens Axboe --- include/linux/sched/jobctl.h | 4 +--- kernel/signal.c | 20 -------------------- 2 files changed, 1 insertion(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index d2b4204ba4d3..fa067de9f1a9 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -19,7 +19,6 @@ struct task_struct; #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ -#define JOBCTL_TASK_WORK_BIT 24 /* set by TWA_SIGNAL */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -29,10 +28,9 @@ struct task_struct; #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) -#define JOBCTL_TASK_WORK (1UL << JOBCTL_TASK_WORK_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) -#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK | JOBCTL_TASK_WORK) +#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask); extern void task_clear_jobctl_trapping(struct task_struct *task); diff --git a/kernel/signal.c b/kernel/signal.c index 923230ff6cfc..cf8b057ca2ac 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2556,26 +2556,6 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); - /* - * Make sure we can safely read ->jobctl() in task_work add. As Oleg - * states: - * - * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we - * roughly have - * - * task_work_add: get_signal: - * STORE(task->task_works, new_work); STORE(task->jobctl); - * mb(); mb(); - * LOAD(task->jobctl); LOAD(task->task_works); - * - * and we can rely on STORE-MB-LOAD [ in task_work_add]. - */ - smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK); - if (unlikely(current->task_works)) { - spin_unlock_irq(&sighand->siglock); - task_work_run(); - goto relock; - } /* * Every stopped thread goes here after wakeup. Check to see if -- cgit From e296dc4996b8094ccde45d19090d804c4103513e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 16:04:39 -0600 Subject: kernel: remove checking for TIF_NOTIFY_SIGNAL It's available everywhere now, no need to check or add dummy defines. 
Signed-off-by: Jens Axboe
---
 include/linux/entry-common.h | 4 ----
 include/linux/sched/signal.h | 2 --
 include/linux/tracehook.h | 4 ----
 kernel/signal.c | 2 --
 4 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index b9711e813ec2..abec3a5ae799 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -37,10 +37,6 @@
# define _TIF_UPROBE (0)
#endif

-#ifndef _TIF_NOTIFY_SIGNAL
-# define _TIF_NOTIFY_SIGNAL (0)
-#endif
-
/*
 * TIF flags handled in syscall_enter_from_user_mode()
 */

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index bd5afa076189..24b7b862e043 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -360,7 +360,6 @@ static inline int task_sigpending(struct task_struct *p)

static inline int signal_pending(struct task_struct *p)
{
-#if defined(TIF_NOTIFY_SIGNAL)
	/*
	 * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same
	 * behavior in terms of ensuring that we break out of wait loops
@@ -368,7 +367,6 @@ static inline int signal_pending(struct task_struct *p)
	 */
	if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)))
		return 1;
-#endif
	return task_sigpending(p);
}

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index f7d82e4fafd6..ee9ab7dbc8c3 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -205,12 +205,10 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 */
static inline void tracehook_notify_signal(void)
{
-#if defined(TIF_NOTIFY_SIGNAL)
	clear_thread_flag(TIF_NOTIFY_SIGNAL);
	smp_mb__after_atomic();
	if (current->task_works)
		task_work_run();
-#endif
}

/*
@@ -218,11 +216,9 @@ static inline void tracehook_notify_signal(void)
 */
static inline void set_notify_signal(struct task_struct *task)
{
-#if defined(TIF_NOTIFY_SIGNAL)
	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
	    !wake_up_state(task, TASK_INTERRUPTIBLE))
		kick_process(task);
-#endif
}

#endif /* */

diff --git a/kernel/signal.c b/kernel/signal.c
index cf8b057ca2ac..ccd530509201 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2535,14 +2535,12 @@ bool get_signal(struct ksignal *ksig)
	 * that the arch handlers don't all have to do it. If we get here
	 * without TIF_SIGPENDING, just exit after running signal work.
	 */
-#ifdef TIF_NOTIFY_SIGNAL
	if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
		if (test_thread_flag(TIF_NOTIFY_SIGNAL))
			tracehook_notify_signal();
		if (!task_sigpending(current))
			return false;
	}
-#endif

	if (unlikely(uprobe_deny_signal()))
		return false;
-- cgit
From 3cabca87b329cbcbdf295be0094adbd72c7b1f67 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Sat, 12 Dec 2020 18:29:20 +0100
Subject: ntp: Fix prototype in the !CONFIG_GENERIC_CMOS_UPDATE case

In the !CONFIG_GENERIC_CMOS_UPDATE case the update_persistent_clock64() function gets defined as a stub in ntp.c - make the prototype conditional on CONFIG_GENERIC_CMOS_UPDATE as well.
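The failure being avoided here is the usual static-vs-extern declaration conflict; a minimal sketch of the clash (illustrative only, assuming the ntp.c stub is a static function as the commit message describes):

  /* include/linux/timekeeping.h, before the fix: unconditional prototype */
  extern int update_persistent_clock64(struct timespec64 now);

  /* kernel/time/ntp.c, with !CONFIG_GENERIC_CMOS_UPDATE: local stub */
  static int update_persistent_clock64(struct timespec64 now)
  {
  	return -ENODEV;
  }

  /* gcc: error: static declaration of 'update_persistent_clock64'
   * follows non-static declaration -- hence making the prototype
   * conditional on CONFIG_GENERIC_CMOS_UPDATE. */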
Fixes: 76e87d96b30b5 ("ntp: Consolidate the RTC update implementation") Signed-off-by: Ingo Molnar Acked-by: Thomas Gleixner --- include/linux/timekeeping.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7f7e4a3f4394..929d3f3937c0 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -303,6 +303,8 @@ extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock, struct timespec64 *boot_offset); +#ifdef CONFIG_GENERIC_CMOS_UPDATE extern int update_persistent_clock64(struct timespec64 now); +#endif #endif -- cgit From 9a20f6f4e6ba9713605fbf7e7426ca22f1181545 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 4 Dec 2020 15:16:46 -0500 Subject: SUNRPC: Fixes for xdr_align_data() The main use case right now for xdr_align_data() is to shift the page data to the left, and in practice shrink the total XDR data buffer. This patch ensures that we fix up the accounting for the buffer length as we shift that data around. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 2 +- net/sunrpc/xdr.c | 174 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 133 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 9548d075e06d..2b4e44bb0654 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -252,7 +252,7 @@ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); -extern uint64_t xdr_align_data(struct xdr_stream *, uint64_t, uint32_t); +extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); /** diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 5833329c132c..c474339ba9ac 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -266,26 +266,6 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static void -_shift_data_left_tail(struct xdr_buf *buf, unsigned int pgto, size_t len) -{ - struct kvec *tail = buf->tail; - - if (len > tail->iov_len) - len = tail->iov_len; - - _copy_to_pages(buf->pages, - buf->page_base + pgto, - (char *)tail->iov_base, - len); - tail->iov_len -= len; - - if (tail->iov_len > 0) - memmove((char *)tail->iov_base, - tail->iov_base + len, - tail->iov_len); -} - /** * _shift_data_right_pages * @pages: vector of pages containing both the source and dest memory area. 
@@ -516,6 +496,109 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len) } while ((len -= zero) != 0); } +static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, + unsigned int len, unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len) + return; + if (len > tail->iov_len - base) + len = tail->iov_len - base; + /* Shift data into head */ + if (shift > buf->page_len + base) { + const struct kvec *head = buf->head; + unsigned int hdto = + head->iov_len + buf->page_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + buf->page_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + /* Shift data into pages */ + if (shift > base) { + unsigned int pgto = buf->page_len + base - shift; + unsigned int pglen = len; + + if (pgto + pglen > buf->page_len) + pglen = buf->page_len - pgto; + _copy_to_pages(buf->pages, buf->page_base + pgto, + tail->iov_base + base, pglen); + base += pglen; + len -= pglen; + if (!len) + return; + } + memmove(tail->iov_base + base - shift, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + unsigned int pgto; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + /* Shift data into head */ + if (shift > base) { + const struct kvec *head = buf->head; + unsigned int hdto = head->iov_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + _copy_from_pages(head->iov_base + hdto, buf->pages, + buf->page_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + pgto = base - shift; + _shift_data_left_pages(buf->pages, buf->page_base + pgto, + buf->page_base + base, len); +} + +static void xdr_buf_tail_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + xdr_buf_tail_copy_left(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift); + return; + } + xdr_buf_pages_copy_left(buf, base, len, shift); + len += base; + if (len <= buf->page_len) + return; + xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift); +} + /** * xdr_shrink_bufhead * @buf: xdr_buf @@ -1261,38 +1344,45 @@ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) } EXPORT_SYMBOL_GPL(xdr_read_pages); -uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length) +unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int from, bytes; - unsigned int shift = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, bytes, len; + unsigned int shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr_stream_remaining(xdr); - if (length < bytes) - bytes = length; + + if (from 
>= buf->page_len + buf->tail->iov_len) + return 0; + if (from + buf->head->iov_len >= buf->len) + return 0; + + len = buf->len - buf->head->iov_len; + + /* We only shift data left! */ + if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n", + from, offset)) + return 0; + if (WARN_ONCE(offset > buf->page_len, + "SUNRPC: buffer overflow. offset=%u, page_len=%u\n", + offset, buf->page_len)) + return 0; /* Move page data to the left */ - if (from > offset) { - shift = min_t(unsigned int, bytes, buf->page_len - from); - _shift_data_left_pages(buf->pages, - buf->page_base + offset, - buf->page_base + from, - shift); - bytes -= shift; + shift = from - offset; + xdr_buf_pages_shift_left(buf, from, len, shift); + xdr->buf->len -= shift; + xdr->nwords -= XDR_QUADLEN(shift); - /* Move tail data into the pages, if necessary */ - if (bytes > 0) - _shift_data_left_tail(buf, offset + shift, bytes); - } + bytes = xdr_stream_remaining(xdr); + if (length > bytes) + length = bytes; + bytes -= length; xdr->nwords -= XDR_QUADLEN(length); - xdr_set_page(xdr, from + length, xdr_stream_remaining(xdr)); + xdr_set_page(xdr, offset + length, bytes); return length; } EXPORT_SYMBOL_GPL(xdr_align_data); -- cgit From c4f2f591f02c392ea7de018d2733748bf4c7b5f5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 4 Dec 2020 17:15:09 -0500 Subject: SUNRPC: Fix xdr_expand_hole() We do want to try to grow the buffer if possible, but if that attempt fails, we still want to move the data and truncate the XDR message. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 2 +- net/sunrpc/xdr.c | 274 +++++++++++++++++++++++++++++---------------- 2 files changed, 180 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 2b4e44bb0654..178f499e2283 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -253,7 +253,7 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); -extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); +extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length); /** * xdr_stream_remaining - Return the number of bytes remaining in the stream diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index c474339ba9ac..e0906ed24374 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -334,46 +334,6 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static unsigned int -_shift_data_right_tail(struct xdr_buf *buf, unsigned int pgfrom, size_t len) -{ - struct kvec *tail = buf->tail; - unsigned int tailbuf_len; - unsigned int result = 0; - size_t copy; - - tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; - - /* Shift the tail first */ - if (tailbuf_len != 0) { - unsigned int free_space = tailbuf_len - tail->iov_len; - - if (len < free_space) - free_space = len; - if (len > free_space) - len = free_space; - - tail->iov_len += free_space; - copy = len; - - if (tail->iov_len > len) { - char *p = (char *)tail->iov_base + len; - memmove(p, tail->iov_base, tail->iov_len - free_space); - result += tail->iov_len - free_space; - } else - copy = 
tail->iov_len; - - /* Copy from the inlined pages into the tail */ - _copy_from_pages((char *)tail->iov_base, - buf->pages, - buf->page_base + pgfrom, - copy); - result += copy; - } - - return result; -} - /** * _copy_to_pages * @pages: array of pages @@ -464,18 +424,42 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) } EXPORT_SYMBOL_GPL(_copy_from_pages); +static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base, + unsigned int len) +{ + if (base >= iov->iov_len) + return; + if (len > iov->iov_len - base) + len = iov->iov_len - base; + memset(iov->iov_base + base, 0, len); +} + /** - * _zero_pages - * @pages: array of pages - * @pgbase: beginning page vector address + * xdr_buf_pages_zero + * @buf: xdr_buf + * @pgbase: beginning offset * @len: length */ -static void -_zero_pages(struct page **pages, size_t pgbase, size_t len) +static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase, + unsigned int len) { + struct page **pages = buf->pages; struct page **page; char *vpage; - size_t zero; + unsigned int zero; + + if (!len) + return; + if (pgbase >= buf->page_len) { + xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len); + return; + } + if (pgbase + len > buf->page_len) { + xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len); + len = buf->page_len - pgbase; + } + + pgbase += buf->page_base; page = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -496,6 +480,103 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len) } while ((len -= zero) != 0); } +static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len) +{ + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + unsigned int sum = head->iov_len + buf->page_len + tail->iov_len; + unsigned int free_space; + + if (sum > buf->len) { + free_space = min_t(unsigned int, sum - buf->len, len); + buf->len += free_space; + len -= free_space; + if (!len) + return; + } + + if (buf->buflen > sum) { + /* Expand the tail buffer */ + free_space = min_t(unsigned int, buf->buflen - sum, len); + tail->iov_len += free_space; + buf->len += free_space; + } +} + +static void xdr_buf_tail_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + + if (to >= tail->iov_len) + return; + if (len + to > tail->iov_len) + len = tail->iov_len - to; + memmove(tail->iov_base + to, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0; + unsigned int talen = 0, tato = 0; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + if (to >= buf->page_len) { + tato = to - buf->page_len; + if (tail->iov_len >= len + tato) + talen = len; + else if (tail->iov_len > tato) + talen = tail->iov_len - tato; + } else if (len + to >= buf->page_len) { + pglen = buf->page_len - to; + talen = len - pglen; + if (talen > tail->iov_len) + talen = tail->iov_len; + } else + pglen = len; + + _copy_from_pages(tail->iov_base + tato, buf->pages, + buf->page_base + base + pglen, talen); + _shift_data_right_pages(buf->pages, buf->page_base + to, + buf->page_base + base, pglen); +} + +static void xdr_buf_tail_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const 
struct kvec *tail = buf->tail; + + if (base >= tail->iov_len || !shift || !len) + return; + xdr_buf_tail_copy_right(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift); + return; + } + if (base + len > buf->page_len) + xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len, + shift); + xdr_buf_pages_copy_right(buf, base, len, shift); +} + static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { @@ -685,30 +766,33 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) } /** - * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes + * xdr_shrink_pagelen - shrinks buf->pages to @len bytes * @buf: xdr_buf - * @len: bytes to remove from buf->pages + * @len: new page buffer length * * The extra data is not lost, but is instead moved into buf->tail. * Returns the actual number of bytes moved. */ -static unsigned int -xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) { - unsigned int pglen = buf->page_len; - unsigned int result; - - if (len > buf->page_len) - len = buf-> page_len; - - result = _shift_data_right_tail(buf, pglen - len, len); - buf->page_len -= len; - buf->buflen -= len; - /* Have we truncated the message? */ - if (buf->len > buf->buflen) - buf->len = buf->buflen; + unsigned int shift, buflen = buf->len - buf->head->iov_len; - return result; + WARN_ON_ONCE(len > buf->page_len); + if (buf->head->iov_len >= buf->len || len > buflen) + buflen = len; + if (buf->page_len > buflen) { + buf->buflen -= buf->page_len - buflen; + buf->page_len = buflen; + } + if (len >= buf->page_len) + return 0; + shift = buf->page_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, len, buflen - len, shift); + buf->page_len = len; + buf->len -= shift; + buf->buflen -= shift; + return shift; } void @@ -728,6 +812,18 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr) } EXPORT_SYMBOL_GPL(xdr_stream_pos); +static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + unsigned int blen = xdr->buf->len; + + xdr->nwords = blen > pos ? 
XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0; +} + +static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len); +} + /** * xdr_page_pos - Return the current offset from the start of the xdr pages * @xdr: pointer to struct xdr_stream @@ -1291,7 +1387,7 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) struct xdr_buf *buf = xdr->buf; unsigned int nwords = XDR_QUADLEN(len); unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; if (xdr->nwords == 0) return 0; @@ -1305,9 +1401,8 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) len = buf->page_len; else if (nwords < xdr->nwords) { /* Truncate page data and move it into the tail */ - offset = buf->page_len - len; - copied = xdr_shrink_pagelen(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); + copied = xdr_shrink_pagelen(buf, len); + trace_rpc_xdr_alignment(xdr, len, copied); xdr->nwords = XDR_QUADLEN(buf->len - cur); } return len; @@ -1387,39 +1482,28 @@ unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, } EXPORT_SYMBOL_GPL(xdr_align_data); -uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t length) +unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int bytes; - unsigned int from; - unsigned int truncated = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, to, shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr_stream_remaining(xdr); - - if (offset + length + bytes > buf->page_len) { - unsigned int shift = (offset + length + bytes) - buf->page_len; - unsigned int res = _shift_data_right_tail(buf, from + bytes - shift, shift); - truncated = shift - res; - xdr->nwords -= XDR_QUADLEN(truncated); - bytes -= shift; - } - - /* Now move the page data over and zero pages */ - if (bytes > 0) - _shift_data_right_pages(buf->pages, - buf->page_base + offset + length, - buf->page_base + from, - bytes); - _zero_pages(buf->pages, buf->page_base + offset, length); - - buf->len += length - (from - offset) - truncated; - xdr_set_page(xdr, offset + length, xdr_stream_remaining(xdr)); + to = xdr_align_size(offset + length); + + /* Could the hole be behind us? */ + if (to > from) { + unsigned int buflen = buf->len - buf->head->iov_len; + shift = to - from; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, from, buflen, shift); + xdr_stream_page_set_pos(xdr, to); + } else if (to != from) + xdr_align_data(xdr, to, 0); + xdr_buf_pages_zero(buf, offset, length); + + xdr_set_page(xdr, to, xdr_stream_remaining(xdr)); return length; } EXPORT_SYMBOL_GPL(xdr_expand_hole); -- cgit From f8d0e60f1056687826abc1eded98f0ea067dfc4c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Dec 2020 22:56:18 -0500 Subject: SUNRPC: Cleanup - constify a number of xdr_buf helpers There are a number of xdr helpers for struct xdr_buf that do not change the structure itself. Mark those as taking const pointers for documentation purposes. 
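As a small illustration of the benefit (hypothetical caller, assuming the constified xdr_buf_pagecount() prototype from the diff below): a function that itself only holds a const pointer can now call the helper without casting the qualifier away.

  static unsigned int xdr_buf_npages(const struct xdr_buf *buf)
  {
  	/* before this change, the call needed a cast like
  	 * xdr_buf_pagecount((struct xdr_buf *)buf) to compile */
  	return xdr_buf_pagecount(buf);
  }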
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 22 +++++++++---------- net/sunrpc/xdr.c | 53 +++++++++++++++++++++------------------------- 2 files changed, 35 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 178f499e2283..68d49fdc4ee9 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -128,8 +128,8 @@ __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); void xdr_inline_pages(struct xdr_buf *, unsigned int, struct page **, unsigned int, unsigned int); -void xdr_terminate_string(struct xdr_buf *, const u32); -size_t xdr_buf_pagecount(struct xdr_buf *buf); +void xdr_terminate_string(const struct xdr_buf *, const u32); +size_t xdr_buf_pagecount(const struct xdr_buf *buf); int xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp); void xdr_free_bvec(struct xdr_buf *buf); @@ -182,14 +182,14 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) * XDR buffer helper functions */ extern void xdr_shift_buf(struct xdr_buf *, size_t); -extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); -extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); +extern void xdr_buf_from_iov(const struct kvec *, struct xdr_buf *); +extern int xdr_buf_subsegment(const struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); extern void xdr_buf_trim(struct xdr_buf *, unsigned int); -extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); -extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); +extern int read_bytes_from_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); +extern int write_bytes_to_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); -extern int xdr_encode_word(struct xdr_buf *, unsigned int, u32); -extern int xdr_decode_word(struct xdr_buf *, unsigned int, u32 *); +extern int xdr_encode_word(const struct xdr_buf *, unsigned int, u32); +extern int xdr_decode_word(const struct xdr_buf *, unsigned int, u32 *); struct xdr_array2_desc; typedef int (*xdr_xcode_elem_t)(struct xdr_array2_desc *desc, void *elem); @@ -200,9 +200,9 @@ struct xdr_array2_desc { xdr_xcode_elem_t xcode; }; -extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base, +extern int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); -extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, +extern int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len); @@ -251,7 +251,7 @@ extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buf extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); -extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length); diff --git 
a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index f0444bf5617c..2e91fbd70f11 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -123,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); * @len: length of string, in bytes * */ -void -xdr_terminate_string(struct xdr_buf *buf, const u32 len) +void xdr_terminate_string(const struct xdr_buf *buf, const u32 len) { char *kaddr; @@ -134,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); -size_t -xdr_buf_pagecount(struct xdr_buf *buf) +size_t xdr_buf_pagecount(const struct xdr_buf *buf) { if (!buf->page_len) return 0; @@ -1545,8 +1543,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page); static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; -void -xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf) { buf->head[0] = *iov; buf->tail[0] = empty_iov; @@ -1569,9 +1566,8 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov); * * Returns -1 if base of length are out of bounds. */ -int -xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, - unsigned int base, unsigned int len) +int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, + unsigned int base, unsigned int len) { subbuf->buflen = subbuf->len = len; if (base < buf->head[0].iov_len) { @@ -1659,7 +1655,8 @@ fix_len: } EXPORT_SYMBOL_GPL(xdr_buf_trim); -static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1676,7 +1673,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne } /* obj is assumed to point to allocated memory of size at least len: */ -int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1689,7 +1687,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u } EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); -static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1706,7 +1705,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned } /* obj is assumed to point to allocated memory of size at least len: */ -int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1719,8 +1719,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un } EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); -int -xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj) { __be32 raw; int status; @@ -1733,8 +1732,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) } EXPORT_SYMBOL_GPL(xdr_decode_word); -int -xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) +int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj) { __be32 raw = cpu_to_be32(obj); @@ -1743,9 +1741,8 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) 
EXPORT_SYMBOL_GPL(xdr_encode_word); /* Returns 0 on success, or else a negative error code. */ -static int -xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc, int encode) +static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc, int encode) { char *elem = NULL, *c; unsigned int copied = 0, todo, avail_here; @@ -1937,9 +1934,8 @@ out: return err; } -int -xdr_decode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if (base >= buf->len) return -EINVAL; @@ -1948,9 +1944,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_decode_array2); -int -xdr_encode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if ((unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->head->iov_len + buf->page_len + buf->tail->iov_len) @@ -1960,9 +1955,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_encode_array2); -int -xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, - int (*actor)(struct scatterlist *, void *), void *data) +int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, + unsigned int len, + int (*actor)(struct scatterlist *, void *), void *data) { int i, ret = 0; unsigned int page_len, thislen, page_offset; -- cgit From 7c03e2cda4a584cadc398e8f6641ca9988a39d52 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 Dec 2020 15:26:13 +0100 Subject: vfs: move cap_convert_nscap() call into vfs_setxattr() cap_convert_nscap() does permission checking as well as conversion of the xattr value conditionally based on fs's user-ns. This is needed by overlayfs and probably other layered fs (ecryptfs) and is what vfs_foo() is supposed to do anyway. 
Signed-off-by: Miklos Szeredi Acked-by: James Morris --- fs/xattr.c | 17 +++++++++++------ include/linux/capability.h | 2 +- security/commoncap.c | 3 +-- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/xattr.c b/fs/xattr.c index cd7a563e8bcd..fd57153b1f61 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -276,8 +276,16 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; + const void *orig_value = value; int error; + if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { + error = cap_convert_nscap(dentry, &value, size); + if (error < 0) + return error; + size = error; + } + retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(dentry, name, value, size, flags, @@ -289,6 +297,9 @@ retry_deleg: if (!error) goto retry_deleg; } + if (value != orig_value) + kfree(value); + return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); @@ -537,12 +548,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) posix_acl_fix_xattr_from_user(kvalue, size); - else if (strcmp(kname, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(d, &kvalue, size); - if (error < 0) - goto out; - size = error; - } } error = vfs_setxattr(d, kname, kvalue, size, flags); diff --git a/include/linux/capability.h b/include/linux/capability.h index 1e7fe311cabe..b2f698915c0f 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -270,6 +270,6 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); -extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); +extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */ diff --git a/security/commoncap.c b/security/commoncap.c index 59bf3c1674c8..bacc1111d871 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -473,7 +473,7 @@ static bool validheader(size_t size, const struct vfs_cap_data *cap) * * If all is ok, we return the new size, on error return < 0. */ -int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) +int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; uid_t nsrootid; @@ -516,7 +516,6 @@ int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) nscap->magic_etc = cpu_to_le32(nsmagic); memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); - kvfree(*ivalue); *ivalue = nscap; return newsize; } -- cgit From e0a6aa30504cb8179d07609fb6386705e8f00663 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 13 Dec 2020 09:39:40 +0100 Subject: efi: ia64: disable the capsule loader EFI capsule loading is a feature that was introduced into EFI long after its initial introduction on Itanium, and it is highly unlikely that IA64 systems are receiving firmware updates in the first place, let alone using EFI capsules. So let's disable capsule support altogether on IA64. This fixes a build error on IA64 due to a recent change that added an unconditional include of asm/efi.h, which IA64 does not provide. 
While at it, tweak the make rules a bit so that the EFI capsule component that is always builtin (even if the EFI capsule loader itself is built as a module) is omitted for all architectures if the module is not enabled in the build.

Cc: Tony Luck
Link: https://lore.kernel.org/linux-efi/20201214152200.38353-1-ardb@kernel.org
Signed-off-by: Ard Biesheuvel
---
 drivers/firmware/efi/Kconfig | 2 +-
 drivers/firmware/efi/Makefile | 5 ++++-
 include/linux/efi.h | 10 ++++------
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index b452cfa2100b..5ac2a37ed025 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -147,7 +147,7 @@ config EFI_BOOTLOADER_CONTROL

config EFI_CAPSULE_LOADER
	tristate "EFI capsule loader"
-	depends on EFI
+	depends on EFI && !IA64
	help
	  This option exposes a loader interface "/dev/efi_capsule_loader" for
	  users to load EFI capsules. This driver requires working runtime

diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index d6ca2da19339..467e94259679 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -12,7 +12,10 @@ KASAN_SANITIZE_runtime-wrappers.o := n

obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o tpm.o
-obj-$(CONFIG_EFI) += capsule.o memmap.o
+obj-$(CONFIG_EFI) += memmap.o
+ifneq ($(CONFIG_EFI_CAPSULE_LOADER),)
+obj-$(CONFIG_EFI) += capsule.o
+endif
obj-$(CONFIG_EFI_PARAMS_FROM_FDT) += fdtparams.o
obj-$(CONFIG_EFI_VARS) += efivars.o
obj-$(CONFIG_EFI_ESRT) += esrt.o

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 1cd5d91d8ca1..763b816ba19c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -817,12 +817,6 @@ static inline bool efi_enabled(int feature)
static inline void
efi_reboot(enum reboot_mode reboot_mode, const char *__unused) {}

-static inline bool
-efi_capsule_pending(int *reset_type)
-{
-	return false;
-}
-
static inline bool efi_soft_reserve_enabled(void)
{
	return false;
@@ -1038,6 +1032,7 @@ bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
bool efivar_variable_is_removable(efi_guid_t vendor, const char *name,
				  size_t len);

+#if IS_ENABLED(CONFIG_EFI_CAPSULE_LOADER)
extern bool efi_capsule_pending(int *reset_type);

extern int efi_capsule_supported(efi_guid_t guid, u32 flags,
@@ -1045,6 +1040,9 @@ extern int efi_capsule_supported(efi_guid_t guid, u32 flags,

extern int efi_capsule_update(efi_capsule_header_t *capsule,
			      phys_addr_t *pages);
+#else
+static inline bool efi_capsule_pending(int *reset_type) { return false; }
+#endif

#ifdef CONFIG_EFI_RUNTIME_MAP
int efi_runtime_map_init(struct kobject *);
-- cgit
From 50c9132ddfb2024e96900407beeec660cf9848bd Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Fri, 25 Sep 2020 07:55:39 -0400
Subject: ceph: add new RECOVER mount_state when recovering session

When recovering a session (a la recover_session=clean), we want to do all of the operations that we do on a forced umount, but changing the mount state to SHUTDOWN can cause queued MDS requests to fail when the session comes back. Most of those can idle until the session is recovered in this situation.

Reserve the SHUTDOWN state for forced umount, and add a new RECOVER state for the forced-reconnect situation. Change several tests for equality with SHUTDOWN to test for that state or RECOVER.
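The rewritten tests rely on the enum ordering: CEPH_MOUNT_RECOVER is added directly after CEPH_MOUNT_SHUTDOWN, so ">= CEPH_MOUNT_SHUTDOWN" matches exactly the two forced-teardown states. A standalone sketch (the first enumerator is assumed for illustration; the rest match the libceph.h diff below):

  #include <stdio.h>

  enum {
  	CEPH_MOUNT_MOUNTED,	/* assumed earlier state, for illustration */
  	CEPH_MOUNT_UNMOUNTING,
  	CEPH_MOUNT_UNMOUNTED,
  	CEPH_MOUNT_SHUTDOWN,	/* forced umount */
  	CEPH_MOUNT_RECOVER,	/* new: forced reconnect */
  };

  /* mirrors the "READ_ONCE(...) >= CEPH_MOUNT_SHUTDOWN" tests */
  static int is_forced_teardown(int mount_state)
  {
  	return mount_state >= CEPH_MOUNT_SHUTDOWN;
  }

  int main(void)
  {
  	printf("MOUNTED=%d SHUTDOWN=%d RECOVER=%d\n",
  	       is_forced_teardown(CEPH_MOUNT_MOUNTED),
  	       is_forced_teardown(CEPH_MOUNT_SHUTDOWN),
  	       is_forced_teardown(CEPH_MOUNT_RECOVER));
  	return 0;	/* prints: MOUNTED=0 SHUTDOWN=1 RECOVER=1 */
  }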
Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 ++-- fs/ceph/caps.c | 2 +- fs/ceph/inode.c | 2 +- fs/ceph/mds_client.c | 4 ++-- fs/ceph/super.c | 14 ++++++++++---- include/linux/ceph/libceph.h | 1 + 6 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 35c83f65475b..e10b07edc95c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -840,7 +840,7 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited( "writepage_start %p %lld forced umount\n", @@ -1264,7 +1264,7 @@ ceph_find_incompatible(struct page *page) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { dout(" page %p forced umount\n", page); return ERR_PTR(-EIO); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8552d1082b0e..c74d8182fb48 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2747,7 +2747,7 @@ again: goto out_unlock; } - if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { dout("get_cap_refs %p forced umount\n", inode); ret = -EIO; goto out_unlock; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 526faf4778ce..02b11a4a4d39 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1888,7 +1888,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_lock(&ci->i_truncate_mutex); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", inode, ceph_ino(inode)); mapping_set_error(inode->i_mapping, -EIO); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8f1d7500a7ec..a2d6ef808f70 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1595,7 +1595,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = fsc->mdsc; - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { if (inode->i_data.nrpages > 0) invalidate = true; if (ci->i_wrbuffer_ref > 0) @@ -4678,7 +4678,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 33ba6f0aa55c..9b1b7f4cfdd4 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -831,6 +831,13 @@ static void destroy_caches(void) ceph_fscache_unregister(); } +static void __ceph_umount_begin(struct ceph_fs_client *fsc) +{ + ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); + ceph_mdsc_force_umount(fsc->mdsc); + fsc->filp_gen++; // invalidate open files +} + /* * ceph_umount_begin - initiate forced umount. Tear down the * mount, skipping steps that may hang while waiting for server(s). 
@@ -843,9 +850,7 @@ static void ceph_umount_begin(struct super_block *sb)
	if (!fsc)
		return;
	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
-	ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
-	ceph_mdsc_force_umount(fsc->mdsc);
-	fsc->filp_gen++; // invalidate open files
+	__ceph_umount_begin(fsc);
}

static const struct super_operations ceph_super_ops = {
@@ -1234,7 +1239,8 @@ int ceph_force_reconnect(struct super_block *sb)
	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
	int err = 0;

-	ceph_umount_begin(sb);
+	fsc->mount_state = CEPH_MOUNT_RECOVER;
+	__ceph_umount_begin(fsc);

	/* Make sure all page caches get invalidated.
	 * see remove_session_caps_cb() */

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index c8645f0b797d..eb5a7ca13f9c 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -104,6 +104,7 @@ enum {
	CEPH_MOUNT_UNMOUNTING,
	CEPH_MOUNT_UNMOUNTED,
	CEPH_MOUNT_SHUTDOWN,
+	CEPH_MOUNT_RECOVER,
};

static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
-- cgit
From 36c9478d6069994848c8897755b4380aa0a29dd3 Mon Sep 17 00:00:00 2001
From: "Liu, Changcheng"
Date: Tue, 10 Nov 2020 21:20:08 +0800
Subject: libceph: remove unused port macros

1. The monitor's default port is defined by CEPH_MON_PORT.
2. CEPH_PORT_START and CEPH_PORT_LAST are not needed.

Signed-off-by: Changcheng Liu
Reviewed-by: Ilya Dryomov
Signed-off-by: Ilya Dryomov
---
 include/linux/ceph/msgr.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 9e50aede46c8..46939485f2c3 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -8,15 +8,6 @@

#define CEPH_MON_PORT 6789 /* default monitor port */

-/*
- * client-side processes will try to bind to ports in this
- * range, simply for the benefit of tools like nmap or wireshark
- * that would like to identify the protocol.
- */
-#define CEPH_PORT_FIRST 6789
-#define CEPH_PORT_START 6800 /* non-monitors start here */
-#define CEPH_PORT_LAST 6900
-
/*
 * tcp connection banner. include a protocol version. and adjust
 * whenever the wire protocol changes. try to keep this string length
-- cgit
From 968cd14edc3acff251f98bdc1eb15f13f05dd5fb Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Wed, 9 Dec 2020 10:52:20 +0800
Subject: ceph: set osdmap epoch for setxattr

When setting a file/dir layout, the MDS may need data pool info, so it needs to check the osdmap. At present, if the MDS doesn't find the specified data pool, it will try to get the latest osdmap. If the client passes the osd epoch with the setxattr request, the MDS only needs to check that epoch of the osdmap.
URL: https://tracker.ceph.com/issues/48504
Signed-off-by: Xiubo Li
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 fs/ceph/mds_client.c | 2 +-
 fs/ceph/xattr.c | 3 +++
 include/linux/ceph/ceph_fs.h | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 70d347989603..75034f7d8f46 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2533,7 +2533,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
		goto out_free2;
	}

-	msg->hdr.version = cpu_to_le16(2);
+	msg->hdr.version = cpu_to_le16(3);
	msg->hdr.tid = cpu_to_le64(req->r_tid);

	head = msg->front.iov_base;

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index cd8c7aaa23a0..24997982de01 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1022,6 +1022,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
	struct ceph_pagelist *pagelist = NULL;
	int op = CEPH_MDS_OP_SETXATTR;
	int err;
@@ -1060,6 +1061,8 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,

	if (op == CEPH_MDS_OP_SETXATTR) {
		req->r_args.setxattr.flags = cpu_to_le32(flags);
+		req->r_args.setxattr.osdmap_epoch =
+			cpu_to_le32(osdc->osdmap->epoch);
		req->r_pagelist = pagelist;
		pagelist = NULL;
	}

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 455e9b9e2adf..c0f1b921ec69 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -424,6 +424,7 @@ union ceph_mds_request_args {
	} __attribute__ ((packed)) open;
	struct {
		__le32 flags;
+		__le32 osdmap_epoch; /* used for setting file/dir layouts */
	} __attribute__ ((packed)) setxattr;
	struct {
		struct ceph_file_layout_legacy layout;
-- cgit
From 4f1ddb1ea874c7703528a8c21b77b7f2462ee247 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Wed, 9 Dec 2020 10:12:59 -0500
Subject: ceph: implement updated ceph_mds_request_head structure

When we added the btime feature in mainline ceph, we had to extend struct ceph_mds_request_args so that it could be set. Implement the same in the kernel client.

Rename ceph_mds_request_head with an _old extension, and add a union ceph_mds_request_args_ext to allow for the extended size of the new header format. Add the appropriate code to handle both formats in create_request_message() and key the behavior on whether the peer supports CEPH_FEATURE_FS_BTIME.

The gid_list field in the payload is now populated from the saved credential. For now, we don't add any support for setting the btime via setattr, but this does enable us to add that in the future.
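The compatibility trick in the code below is that the new header is the old one with a version word prepended, so the legacy fields can still be addressed through the first legacy member. A standalone sketch (field set abbreviated and illustrative only; packed wire layout and unaligned access assumed, as on x86):

  #include <stdint.h>
  #include <stdio.h>

  struct head_old {			/* legacy wire layout (abbreviated) */
  	uint64_t oldest_client_tid;
  	uint32_t mdsmap_epoch;
  } __attribute__((packed));

  struct head_new {			/* extended: version word up front */
  	uint16_t version;
  	uint64_t oldest_client_tid;
  	uint32_t mdsmap_epoch;
  	/* ... new fields follow the legacy ones ... */
  } __attribute__((packed));

  int main(void)
  {
  	struct head_new h = { .version = 1, .oldest_client_tid = 42,
  			      .mdsmap_epoch = 7 };
  	/* same trick as find_old_request_head() in the diff below */
  	struct head_old *old = (struct head_old *)&h.oldest_client_tid;

  	printf("tid=%llu epoch=%u\n",
  	       (unsigned long long)old->oldest_client_tid, old->mdsmap_epoch);
  	return 0;	/* prints: tid=42 epoch=7 */
  }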
[ idryomov: break unnecessarily long lines ] Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 75 ++++++++++++++++++++++++++++++++++++-------- include/linux/ceph/ceph_fs.h | 32 ++++++++++++++++++- 2 files changed, 93 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 14d0a11b7d88..a256d95ec99a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2478,21 +2478,24 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, /* * called under mdsc->mutex */ -static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, +static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_mds_request *req, - int mds, bool drop_cap_releases) + bool drop_cap_releases) { + int mds = session->s_mds; + struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_msg *msg; - struct ceph_mds_request_head *head; + struct ceph_mds_request_head_old *head; const char *path1 = NULL; const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; bool freepath1 = false, freepath2 = false; - int len; + int len, i; u16 releases; void *p, *end; int ret; + bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, @@ -2514,14 +2517,23 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free1; } - len = sizeof(*head) + - pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + + if (legacy) { + /* Old style */ + len = sizeof(*head); + } else { + /* New style: add gid_list and any later fields */ + len = sizeof(struct ceph_mds_request_head) + sizeof(u32) + + (sizeof(u64) * req->r_cred->group_info->ngroups); + } + + len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + sizeof(struct ceph_timespec); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); + if (req->r_dentry_drop) len += pathlen1; if (req->r_old_dentry_drop) @@ -2533,11 +2545,25 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } - msg->hdr.version = cpu_to_le16(3); msg->hdr.tid = cpu_to_le64(req->r_tid); - head = msg->front.iov_base; - p = msg->front.iov_base + sizeof(*head); + /* + * The old ceph_mds_request_header didn't contain a version field, and + * one was added when we moved the message version from 3->4. 
+ */ + if (legacy) { + msg->hdr.version = cpu_to_le16(3); + head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*head); + } else { + struct ceph_mds_request_head *new_head = msg->front.iov_base; + + msg->hdr.version = cpu_to_le16(4); + new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); + head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; + p = msg->front.iov_base + sizeof(*new_head); + } + end = msg->front.iov_base + msg->front.iov_len; head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); @@ -2590,6 +2616,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_copy(&p, &ts, sizeof(ts)); } + /* gid list */ + if (!legacy) { + ceph_encode_32(&p, req->r_cred->group_info->ngroups); + for (i = 0; i < req->r_cred->group_info->ngroups; i++) + ceph_encode_64(&p, from_kgid(&init_user_ns, + req->r_cred->group_info->gid[i])); + } + if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); msg = ERR_PTR(-ERANGE); @@ -2633,6 +2667,18 @@ static void complete_request(struct ceph_mds_client *mdsc, complete_all(&req->r_completion); } +static struct ceph_mds_request_head_old * +find_old_request_head(void *p, u64 features) +{ + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); + struct ceph_mds_request_head *new_head; + + if (legacy) + return (struct ceph_mds_request_head_old *)p; + new_head = (struct ceph_mds_request_head *)p; + return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; +} + /* * called under mdsc->mutex */ @@ -2642,7 +2688,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, { int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; - struct ceph_mds_request_head *rhead; + struct ceph_mds_request_head_old *rhead; struct ceph_msg *msg; int flags = 0; @@ -2661,6 +2707,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { void *p; + /* * Replay. Do not regenerate message (and rebuild * paths, etc.); just use the original message. @@ -2668,7 +2715,8 @@ static int __prepare_send_request(struct ceph_mds_session *session, * d_move mangles the src name. 
*/ msg = req->r_request; - rhead = msg->front.iov_base; + rhead = find_old_request_head(msg->front.iov_base, + session->s_con.peer_features); flags = le32_to_cpu(rhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; @@ -2699,14 +2747,15 @@ static int __prepare_send_request(struct ceph_mds_session *session, ceph_msg_put(req->r_request); req->r_request = NULL; } - msg = create_request_message(mdsc, req, mds, drop_cap_releases); + msg = create_request_message(session, req, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); return PTR_ERR(msg); } req->r_request = msg; - rhead = msg->front.iov_base; + rhead = find_old_request_head(msg->front.iov_base, + session->s_con.peer_features); rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index c0f1b921ec69..d44d98033d58 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -446,11 +446,25 @@ union ceph_mds_request_args { } __attribute__ ((packed)) lookupino; } __attribute__ ((packed)); +union ceph_mds_request_args_ext { + union ceph_mds_request_args old; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + struct ceph_timespec btime; + } __attribute__ ((packed)) setattr_ext; +}; + #define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ #define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ -struct ceph_mds_request_head { +struct ceph_mds_request_head_old { __le64 oldest_client_tid; __le32 mdsmap_epoch; /* on client */ __le32 flags; /* CEPH_MDS_FLAG_* */ @@ -463,6 +477,22 @@ struct ceph_mds_request_head { union ceph_mds_request_args args; } __attribute__ ((packed)); +#define CEPH_MDS_REQUEST_HEAD_VERSION 1 + +struct ceph_mds_request_head { + __le16 version; /* struct version */ + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* count retry, fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args_ext args; +} __attribute__ ((packed)); + /* cap/lease release record */ struct ceph_mds_request_release { __le64 ino, cap_id; /* ino and unique cap id */ -- cgit From 418af5b3bfc4f1ef4854e83c5be8a0bdce51e95c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 29 Oct 2020 14:49:10 +0100 Subject: libceph: lower exponential backoff delay The current setting allows the backoff to climb up to 5 minutes. This is too high -- it becomes hard to tell whether the client is stuck on something or just in backoff. In userspace, ms_max_backoff is defaulted to 15 seconds. Let's do the same. 
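With BASE_DELAY_INTERVAL at HZ/4 and MAX_DELAY_INTERVAL at 15 * HZ, the
fault path now backs off through roughly 0.25s, 0.5s, 1s, 2s, 4s, 8s
and then a 15s cap. A minimal user-space sketch of the doubling-with-cap
logic, expressed in milliseconds to stay HZ-independent:

	#include <stdio.h>

	int main(void)
	{
		const unsigned long base = 250, max = 15000;	/* ms */
		unsigned long delay = 0;
		int i;

		for (i = 1; i <= 8; i++) {
			if (!delay) {
				delay = base;
			} else if (delay < max) {
				delay *= 2;
				if (delay > max)
					delay = max;	/* clamp at the cap */
			}
			printf("attempt %d: %lu ms\n", i, delay);
		}
		return 0;
	}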
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 4 ++-- net/ceph/messenger.c | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 60b324efd1c4..b47c7cc4c90a 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -241,8 +241,8 @@ struct ceph_msg { }; /* ceph connection fault delay defaults, for exponential backoff */ -#define BASE_DELAY_INTERVAL (HZ/2) -#define MAX_DELAY_INTERVAL (5 * 60 * HZ) +#define BASE_DELAY_INTERVAL (HZ / 4) +#define MAX_DELAY_INTERVAL (15 * HZ) /* * A single connection with another host. diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 214ae2d17a90..f3eb66bab988 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2812,6 +2812,9 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) return -ENOENT; } + if (delay >= HZ) + delay = round_jiffies_relative(delay); + dout("%s %p %lu\n", __func__, con, delay); if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); @@ -2871,7 +2874,7 @@ static bool con_backoff(struct ceph_connection *con) if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) return false; - ret = queue_con_delay(con, round_jiffies_relative(con->delay)); + ret = queue_con_delay(con, con->delay); if (ret) { dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); @@ -3018,10 +3021,13 @@ static void con_fault(struct ceph_connection *con) } else { /* retry after a delay. */ con->state = CON_STATE_PREOPEN; - if (con->delay == 0) + if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; - else if (con->delay < MAX_DELAY_INTERVAL) + } else if (con->delay < MAX_DELAY_INTERVAL) { con->delay *= 2; + if (con->delay > MAX_DELAY_INTERVAL) + con->delay = MAX_DELAY_INTERVAL; + } con_flag_set(con, CON_FLAG_BACKOFF); queue_con(con); } -- cgit From 5cd8da3a1ca2160b8f9c2ff6a96762e66410ea38 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 13 Oct 2020 17:23:22 +0200 Subject: libceph: drop msg->ack_stamp field It is set in process_ack() but never used. Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 1 - net/ceph/messenger.c | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index b47c7cc4c90a..6f77e70db855 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -235,7 +235,6 @@ struct ceph_msg { bool more_to_follow; bool needs_out_seq; int front_alloc_len; - unsigned long ack_stamp; /* tx: when we were acked */ struct ceph_msgpool *pool; }; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index a6d93280d3e9..29b00b2cecac 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2279,7 +2279,6 @@ static void process_ack(struct ceph_connection *con) break; dout("got ack for seq %llu type %d at %p\n", seq, le16_to_cpu(m->hdr.type), m); - m->ack_stamp = jiffies; ceph_msg_remove(m); } -- cgit From 30be780a87211de75b93935c20a0913e46744a3f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Nov 2020 14:11:26 +0100 Subject: libceph: make con->state an int unsigned long is a leftover from when con->state used to be a set of bits managed with set_bit(), clear_bit(), etc. Save a bit of memory. 
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 6f77e70db855..f053de4f46dd 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -257,13 +257,13 @@ struct ceph_connection { struct ceph_messenger *msgr; + int state; atomic_t sock_state; struct socket *sock; struct ceph_entity_addr peer_addr; /* peer address */ struct ceph_entity_addr peer_addr_for_me; unsigned long flags; - unsigned long state; const char *error_msg; /* error message, if any */ struct ceph_entity_name peer_name; /* peer name */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index bb79a59daf42..9c92f101aa88 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -372,7 +372,7 @@ static void ceph_sock_data_ready(struct sock *sk) } if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("%s on %p state = %lu, queueing work\n", __func__, + dout("%s %p state = %d, queueing work\n", __func__, con, con->state); queue_con(con); } @@ -406,7 +406,7 @@ static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; - dout("%s %p state = %lu sk_state = %u\n", __func__, + dout("%s %p state = %d sk_state = %u\n", __func__, con, con->state, sk->sk_state); switch (sk->sk_state) { @@ -2582,7 +2582,7 @@ static int try_write(struct ceph_connection *con) { int ret = 1; - dout("try_write start %p state %lu\n", con, con->state); + dout("try_write start %p state %d\n", con, con->state); if (con->state != CON_STATE_PREOPEN && con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && @@ -2600,7 +2600,7 @@ static int try_write(struct ceph_connection *con) BUG_ON(con->in_msg); con->in_tag = CEPH_MSGR_TAG_READY; - dout("try_write initiating connect on %p new state %lu\n", + dout("try_write initiating connect on %p new state %d\n", con, con->state); ret = ceph_tcp_connect(con); if (ret < 0) { @@ -2679,7 +2679,7 @@ static int try_read(struct ceph_connection *con) int ret = -1; more: - dout("try_read start on %p state %lu\n", con, con->state); + dout("try_read start %p state %d\n", con, con->state); if (con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && con->state != CON_STATE_OPEN) @@ -2876,11 +2876,7 @@ static bool con_sock_closed(struct ceph_connection *con) CASE(OPEN); CASE(STANDBY); default: - pr_warn("%s con %p unrecognized state %lu\n", - __func__, con, con->state); - con->error_msg = "unrecognized con state"; BUG(); - break; } #undef CASE @@ -2998,7 +2994,7 @@ static void ceph_con_workfn(struct work_struct *work) */ static void con_fault(struct ceph_connection *con) { - dout("fault %p state %lu to peer %s\n", + dout("fault %p state %d to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), -- cgit From 6d7f62bfb5b5da6b0b37174c1fd32545f3b5b90d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Nov 2020 14:59:02 +0100 Subject: libceph: rename and export con->state states In preparation for msgr2, rename msgr1 specific states and move the defines to the header file. Also drop state transition comments. They don't cover all possible transitions (e.g. NEGOTIATING -> STANDBY, etc) and currently do more harm than good. 
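For reference, the normal msgr1 lifecycle through the renamed states is
CLOSED -> PREOPEN -> V1_BANNER -> V1_CONNECT_MSG -> OPEN, with faults
diverting to STANDBY (and back via PREOPEN) or to CLOSED. A sketch of a
debugging helper built on the new names (not part of the patch):

	static const char *ceph_con_state_name(int state)
	{
		switch (state) {
		case CEPH_CON_S_CLOSED:		return "CLOSED";
		case CEPH_CON_S_PREOPEN:	return "PREOPEN";
		case CEPH_CON_S_V1_BANNER:	return "V1_BANNER";
		case CEPH_CON_S_V1_CONNECT_MSG:	return "V1_CONNECT_MSG";
		case CEPH_CON_S_OPEN:		return "OPEN";
		case CEPH_CON_S_STANDBY:	return "STANDBY";
		default:			return "???";
		}
	}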
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 12 +++++- net/ceph/messenger.c | 90 ++++++++++++++++++------------------------ 2 files changed, 50 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index f053de4f46dd..e6be85b67c6c 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -239,6 +239,16 @@ struct ceph_msg { struct ceph_msgpool *pool; }; +/* + * connection states + */ +#define CEPH_CON_S_CLOSED 1 +#define CEPH_CON_S_PREOPEN 2 +#define CEPH_CON_S_V1_BANNER 3 +#define CEPH_CON_S_V1_CONNECT_MSG 4 +#define CEPH_CON_S_OPEN 5 +#define CEPH_CON_S_STANDBY 6 + /* ceph connection fault delay defaults, for exponential backoff */ #define BASE_DELAY_INTERVAL (HZ / 4) #define MAX_DELAY_INTERVAL (15 * HZ) @@ -257,7 +267,7 @@ struct ceph_connection { struct ceph_messenger *msgr; - int state; + int state; /* CEPH_CON_S_* */ atomic_t sock_state; struct socket *sock; struct ceph_entity_addr peer_addr; /* peer address */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9c92f101aa88..adeb69ba6747 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -82,16 +82,6 @@ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ -/* - * connection states - */ -#define CON_STATE_CLOSED 1 /* -> PREOPEN */ -#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ -#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ -#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ -#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ -#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ - /* * ceph_connection flag bits */ @@ -674,7 +664,7 @@ void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); @@ -698,8 +688,8 @@ void ceph_con_open(struct ceph_connection *con, mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(addr)); - WARN_ON(con->state != CON_STATE_CLOSED); - con->state = CON_STATE_PREOPEN; + WARN_ON(con->state != CEPH_CON_S_CLOSED); + con->state = CEPH_CON_S_PREOPEN; con->peer_name.type = (__u8) entity_type; con->peer_name.num = cpu_to_le64(entity_num); @@ -739,7 +729,7 @@ void ceph_con_init(struct ceph_connection *con, void *private, INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, ceph_con_workfn); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; } EXPORT_SYMBOL(ceph_con_init); @@ -2183,7 +2173,7 @@ static int process_connect(struct ceph_connection *con) if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); - if (con->state != CON_STATE_NEGOTIATING) + if (con->state != CEPH_CON_S_V1_CONNECT_MSG) return -EAGAIN; break; @@ -2232,8 +2222,8 @@ static int process_connect(struct ceph_connection *con) return -1; } - WARN_ON(con->state != CON_STATE_NEGOTIATING); - con->state = CON_STATE_OPEN; + WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); + con->state = CEPH_CON_S_OPEN; con->auth_retry = 0; /* we authenticated; clear flag */ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; @@ -2583,16 +2573,16 @@ static int try_write(struct ceph_connection *con) int ret = 1; dout("try_write start %p state %d\n", con, con->state); - if (con->state 
!= CON_STATE_PREOPEN && - con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) + if (con->state != CEPH_CON_S_PREOPEN && + con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) return 0; /* open the socket first? */ - if (con->state == CON_STATE_PREOPEN) { + if (con->state == CEPH_CON_S_PREOPEN) { BUG_ON(con->sock); - con->state = CON_STATE_CONNECTING; + con->state = CEPH_CON_S_V1_BANNER; con_out_kvec_reset(con); prepare_write_banner(con); @@ -2646,7 +2636,7 @@ more: } do_next: - if (con->state == CON_STATE_OPEN) { + if (con->state == CEPH_CON_S_OPEN) { if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { prepare_write_keepalive(con); goto more; @@ -2680,9 +2670,9 @@ static int try_read(struct ceph_connection *con) more: dout("try_read start %p state %d\n", con, con->state); - if (con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) + if (con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) return 0; BUG_ON(!con->sock); @@ -2690,8 +2680,7 @@ more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); - if (con->state == CON_STATE_CONNECTING) { - dout("try_read connecting\n"); + if (con->state == CEPH_CON_S_V1_BANNER) { ret = read_partial_banner(con); if (ret <= 0) goto out; @@ -2699,7 +2688,7 @@ more: if (ret < 0) goto out; - con->state = CON_STATE_NEGOTIATING; + con->state = CEPH_CON_S_V1_CONNECT_MSG; /* * Received banner is good, exchange connection info. @@ -2715,8 +2704,7 @@ more: goto out; } - if (con->state == CON_STATE_NEGOTIATING) { - dout("try_read negotiating\n"); + if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { ret = read_partial_connect(con); if (ret <= 0) goto out; @@ -2726,7 +2714,7 @@ more: goto more; } - WARN_ON(con->state != CON_STATE_OPEN); + WARN_ON(con->state != CEPH_CON_S_OPEN); if (con->in_base_pos < 0) { /* @@ -2760,7 +2748,7 @@ more: break; case CEPH_MSGR_TAG_CLOSE: con_close_socket(con); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; goto out; default: goto bad_tag; @@ -2785,7 +2773,7 @@ more: if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; process_message(con); - if (con->state == CON_STATE_OPEN) + if (con->state == CEPH_CON_S_OPEN) prepare_read_tag(con); goto more; } @@ -2864,15 +2852,15 @@ static bool con_sock_closed(struct ceph_connection *con) return false; #define CASE(x) \ - case CON_STATE_ ## x: \ + case CEPH_CON_S_ ## x: \ con->error_msg = "socket closed (con state " #x ")"; \ break; switch (con->state) { CASE(CLOSED); CASE(PREOPEN); - CASE(CONNECTING); - CASE(NEGOTIATING); + CASE(V1_BANNER); + CASE(V1_CONNECT_MSG); CASE(OPEN); CASE(STANDBY); default: @@ -2943,16 +2931,16 @@ static void ceph_con_workfn(struct work_struct *work) dout("%s: con %p BACKOFF\n", __func__, con); break; } - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("%s: con %p STANDBY\n", __func__, con); break; } - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("%s: con %p CLOSED\n", __func__, con); BUG_ON(con->sock); break; } - if (con->state == CON_STATE_PREOPEN) { + if (con->state == CEPH_CON_S_PREOPEN) { dout("%s: con %p PREOPEN\n", __func__, con); BUG_ON(con->sock); } @@ -3001,15 +2989,15 @@ static void con_fault(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; - 
WARN_ON(con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN); + WARN_ON(con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN); ceph_con_reset_protocol(con); if (con_flag_test(con, CON_FLAG_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; return; } @@ -3022,10 +3010,10 @@ static void con_fault(struct ceph_connection *con) !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con->state = CON_STATE_STANDBY; + con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. */ - con->state = CON_STATE_PREOPEN; + con->state = CEPH_CON_S_PREOPEN; if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; } else if (con->delay < MAX_DELAY_INTERVAL) { @@ -3092,9 +3080,9 @@ static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); - con->state = CON_STATE_PREOPEN; + con->state = CEPH_CON_S_PREOPEN; con->connect_seq++; WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); @@ -3115,7 +3103,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) mutex_lock(&con->mutex); - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("con_send %p closed, dropping %p\n", con, msg); ceph_msg_put(msg); mutex_unlock(&con->mutex); @@ -3456,7 +3444,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (con->state != CON_STATE_OPEN) { + if (con->state != CEPH_CON_S_OPEN) { if (msg) ceph_msg_put(msg); return -EAGAIN; -- cgit From 3fefd43e741a5b8d55aeb9115ff488ad2cad439b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Nov 2020 14:56:36 +0100 Subject: libceph: rename and export con->flags bits In preparation for msgr2, move the defines to the header file. 
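The flag word remains an unsigned long and the con_flag_*() helpers
wrap the standard atomic bitops, so the typical producer pattern looks
like the sketch below (mirroring ceph_con_keepalive() with the wrappers
inlined; nothing here is added by the patch):

	static void request_keepalive(struct ceph_connection *con)
	{
		set_bit(CEPH_CON_F_KEEPALIVE_PENDING, &con->flags);
		/* only kick the workqueue if a write wasn't already pending */
		if (!test_and_set_bit(CEPH_CON_F_WRITE_PENDING, &con->flags))
			queue_con(con);
	}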
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 13 ++++++- net/ceph/messenger.c | 77 +++++++++++++++++++----------------------- 2 files changed, 46 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index e6be85b67c6c..b6962a0fd76f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -249,6 +249,17 @@ struct ceph_msg { #define CEPH_CON_S_OPEN 5 #define CEPH_CON_S_STANDBY 6 +/* + * ceph_connection flag bits + */ +#define CEPH_CON_F_LOSSYTX 0 /* we can close channel or drop + messages on errors */ +#define CEPH_CON_F_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ +#define CEPH_CON_F_WRITE_PENDING 2 /* we have data ready to send */ +#define CEPH_CON_F_SOCK_CLOSED 3 /* socket state changed to closed */ +#define CEPH_CON_F_BACKOFF 4 /* need to retry queuing delayed + work */ + /* ceph connection fault delay defaults, for exponential backoff */ #define BASE_DELAY_INTERVAL (HZ / 4) #define MAX_DELAY_INTERVAL (15 * HZ) @@ -273,7 +284,7 @@ struct ceph_connection { struct ceph_entity_addr peer_addr; /* peer address */ struct ceph_entity_addr peer_addr_for_me; - unsigned long flags; + unsigned long flags; /* CEPH_CON_F_* */ const char *error_msg; /* error message, if any */ struct ceph_entity_name peer_name; /* peer name */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index adeb69ba6747..ee87dc3af959 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -82,24 +82,14 @@ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ -/* - * ceph_connection flag bits - */ -#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop - * messages on errors */ -#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ -#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ -#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ -#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ - static bool con_flag_valid(unsigned long con_flag) { switch (con_flag) { - case CON_FLAG_LOSSYTX: - case CON_FLAG_KEEPALIVE_PENDING: - case CON_FLAG_WRITE_PENDING: - case CON_FLAG_SOCK_CLOSED: - case CON_FLAG_BACKOFF: + case CEPH_CON_F_LOSSYTX: + case CEPH_CON_F_KEEPALIVE_PENDING: + case CEPH_CON_F_WRITE_PENDING: + case CEPH_CON_F_SOCK_CLOSED: + case CEPH_CON_F_BACKOFF: return true; default: return false; @@ -380,7 +370,7 @@ static void ceph_sock_write_space(struct sock *sk) * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ - if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { + if (con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -406,7 +396,7 @@ static void ceph_sock_state_change(struct sock *sk) case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); - con_flag_set(con, CON_FLAG_SOCK_CLOSED); + con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: @@ -597,7 +587,7 @@ static int con_close_socket(struct ceph_connection *con) * received a socket close event before we had the chance to * shut the socket down. 
*/ - con_flag_clear(con, CON_FLAG_SOCK_CLOSED); + con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; @@ -666,10 +656,10 @@ void ceph_con_close(struct ceph_connection *con) dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); con->state = CEPH_CON_S_CLOSED; - con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ - con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con_flag_clear(con, CON_FLAG_BACKOFF); + con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next connect */ + con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); + con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + con_flag_clear(con, CEPH_CON_F_BACKOFF); ceph_con_reset_protocol(con); ceph_con_reset_session(con); @@ -1365,7 +1355,7 @@ static void prepare_write_message(struct ceph_connection *con) prepare_write_message_footer(con); } - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1386,7 +1376,7 @@ static void prepare_write_ack(struct ceph_connection *con) &con->out_temp_ack); con->out_more = 1; /* more will follow.. eventually.. */ - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1404,7 +1394,7 @@ static void prepare_write_seq(struct ceph_connection *con) con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1425,7 +1415,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) } else { con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); } - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1464,7 +1454,7 @@ static void prepare_write_banner(struct ceph_connection *con) &con->msgr->my_enc_addr); con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } static void __prepare_write_connect(struct ceph_connection *con) @@ -1475,7 +1465,7 @@ static void __prepare_write_connect(struct ceph_connection *con) con->auth->authorizer_buf); con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } static int prepare_write_connect(struct ceph_connection *con) @@ -2236,7 +2226,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - con_flag_set(con, CON_FLAG_LOSSYTX); + con_flag_set(con, CEPH_CON_F_LOSSYTX); con->delay = 0; /* reset backoff memory */ @@ -2637,7 +2627,8 @@ more: do_next: if (con->state == CEPH_CON_S_OPEN) { - if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { + if (con_flag_test_and_clear(con, + CEPH_CON_F_KEEPALIVE_PENDING)) { prepare_write_keepalive(con); goto more; } @@ -2653,7 +2644,7 @@ do_next: } /* Nothing to do! 
*/ - con_flag_clear(con, CON_FLAG_WRITE_PENDING); + con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); dout("try_write nothing else to write.\n"); ret = 0; out: @@ -2848,7 +2839,7 @@ static void cancel_con(struct ceph_connection *con) static bool con_sock_closed(struct ceph_connection *con) { - if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) + if (!con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ @@ -2875,7 +2866,7 @@ static bool con_backoff(struct ceph_connection *con) { int ret; - if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) + if (!con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; ret = queue_con_delay(con, con->delay); @@ -2883,7 +2874,7 @@ static bool con_backoff(struct ceph_connection *con) dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); - con_flag_set(con, CON_FLAG_BACKOFF); + con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; @@ -2995,7 +2986,7 @@ static void con_fault(struct ceph_connection *con) ceph_con_reset_protocol(con); - if (con_flag_test(con, CON_FLAG_LOSSYTX)) { + if (con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); con->state = CEPH_CON_S_CLOSED; return; @@ -3007,9 +2998,9 @@ static void con_fault(struct ceph_connection *con) /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { + !con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); + con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. */ @@ -3021,7 +3012,7 @@ static void con_fault(struct ceph_connection *con) if (con->delay > MAX_DELAY_INTERVAL) con->delay = MAX_DELAY_INTERVAL; } - con_flag_set(con, CON_FLAG_BACKOFF); + con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } @@ -3084,8 +3075,8 @@ static void clear_standby(struct ceph_connection *con) dout("clear_standby %p and ++connect_seq\n", con); con->state = CEPH_CON_S_PREOPEN; con->connect_seq++; - WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); - WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); + WARN_ON(con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); + WARN_ON(con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } @@ -3126,7 +3117,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -3222,10 +3213,10 @@ void ceph_con_keepalive(struct ceph_connection *con) dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); - con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING); + con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); -- cgit From 699921d9e68ff3d9f8645488c12f4689c6533d70 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Nov 2020 14:37:06 +0100 Subject: libceph: export zero_page In preparation for msgr2, make zero_page global. 
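The zero page lets the messenger transmit runs of zeroes without
allocating anything, e.g. when skipping over the wire bytes of a
revoked message. A sketch of that consumption pattern (loosely
mirroring write_partial_skip(), with details elided):

	/* Sketch: replace up to one page worth of revoked message bytes
	 * with zeroes on the wire. */
	static int skip_one_chunk(struct ceph_connection *con)
	{
		int size = min_t(int, con->out_skip, (int)PAGE_SIZE);
		int ret = ceph_tcp_sendpage(con->sock, ceph_zero_page,
					    0, size, MSG_MORE);

		if (ret > 0)
			con->out_skip -= ret;
		return ret;
	}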
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 1 + net/ceph/messenger.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index b6962a0fd76f..513ed5f90bff 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -343,6 +343,7 @@ struct ceph_connection { unsigned long delay; /* current delay interval */ }; +extern struct page *ceph_zero_page; extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ee87dc3af959..d3880fbe8424 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -164,7 +164,7 @@ static void con_fault(struct ceph_connection *con); static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); -static struct page *zero_page; /* used in certain error cases */ +struct page *ceph_zero_page; /* used in certain error cases */ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { @@ -234,9 +234,9 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - BUG_ON(zero_page == NULL); - put_page(zero_page); - zero_page = NULL; + BUG_ON(!ceph_zero_page); + put_page(ceph_zero_page); + ceph_zero_page = NULL; ceph_msgr_slab_exit(); } @@ -246,9 +246,9 @@ int __init ceph_msgr_init(void) if (ceph_msgr_slab_init()) return -ENOMEM; - BUG_ON(zero_page != NULL); - zero_page = ZERO_PAGE(0); - get_page(zero_page); + BUG_ON(ceph_zero_page); + ceph_zero_page = ZERO_PAGE(0); + get_page(ceph_zero_page); /* * The number of active work items is limited by the number of @@ -1645,7 +1645,8 @@ static int write_partial_skip(struct ceph_connection *con) if (size == con->out_skip) more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more); + ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, + more); if (ret <= 0) goto out; con->out_skip -= ret; -- cgit From 6503e0b69c9d4d78b5450db01e79328f8ed4ef21 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Nov 2020 16:29:47 +0100 Subject: libceph: export remaining protocol independent infrastructure In preparation for msgr2, make all protocol independent functions in messenger.c global. 
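Among the newly exported helpers is the message data cursor; a sketch
of how a protocol implementation walks a message's data items with it
(illustrative, assuming the message carries data and the full
data_length is to be traversed):

	static void walk_msg_data(struct ceph_msg *msg)
	{
		struct ceph_msg_data_cursor cursor;
		size_t off, len;
		bool last;

		ceph_msg_data_cursor_init(&cursor, msg, msg->data_length);
		while (cursor.total_resid) {
			struct page *page = ceph_msg_data_next(&cursor, &off,
							       &len, &last);
			/* ... process bytes [off, off + len) of page ... */
			ceph_msg_data_advance(&cursor, len);
		}
	}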
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 39 +++++++++- net/ceph/messenger.c | 157 ++++++++++++++++++++--------------------- 2 files changed, 113 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 513ed5f90bff..93815f1a42b5 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -345,13 +345,50 @@ struct ceph_connection { extern struct page *ceph_zero_page; +void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag); +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag); +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag); +bool ceph_con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag); +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag); + +void ceph_encode_my_addr(struct ceph_messenger *msgr); + +int ceph_tcp_connect(struct ceph_connection *con); +int ceph_con_close_socket(struct ceph_connection *con); +void ceph_con_reset_session(struct ceph_connection *con); + +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt); +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq); +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq); + +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length); +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece); +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes); + +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length); + +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr); +int ceph_addr_port(const struct ceph_entity_addr *addr); +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p); + +void ceph_con_process_message(struct ceph_connection *con); +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip); +void ceph_con_get_out_msg(struct ceph_connection *con); + + extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count); - extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); extern void ceph_msgr_flush(void); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d3880fbe8424..85d20372f923 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -96,37 +96,37 @@ static bool con_flag_valid(unsigned long con_flag) } } -static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); clear_bit(con_flag, &con->flags); } -static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); set_bit(con_flag, &con->flags); } -static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_bit(con_flag, &con->flags); } -static bool con_flag_test_and_clear(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_clear(struct 
ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_clear_bit(con_flag, &con->flags); } -static bool con_flag_test_and_set(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); @@ -199,7 +199,7 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) } EXPORT_SYMBOL(ceph_pr_addr); -static void encode_my_addr(struct ceph_messenger *msgr) +void ceph_encode_my_addr(struct ceph_messenger *msgr) { memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); ceph_encode_banner_addr(&msgr->my_enc_addr); @@ -370,7 +370,7 @@ static void ceph_sock_write_space(struct sock *sk) * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ - if (con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { + if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -396,7 +396,7 @@ static void ceph_sock_state_change(struct sock *sk) case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); - con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); + ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: @@ -430,13 +430,15 @@ static void set_sock_callbacks(struct socket *sock, /* * initiate connection to a remote socket. */ -static int ceph_tcp_connect(struct ceph_connection *con) +int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; + dout("%s con %p peer_addr %s\n", __func__, con, + ceph_pr_addr(&con->peer_addr)); BUG_ON(con->sock); /* sock_create_kern() allocates with GFP_KERNEL */ @@ -454,8 +456,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", ceph_pr_addr(&con->peer_addr)); - con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); @@ -570,11 +570,11 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, /* * Shutdown/close the socket for the given connection. */ -static int con_close_socket(struct ceph_connection *con) +int ceph_con_close_socket(struct ceph_connection *con) { int rc = 0; - dout("con_close_socket on %p sock %p\n", con, con->sock); + dout("%s con %p sock %p\n", __func__, con, con->sock); if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); @@ -587,7 +587,7 @@ static int con_close_socket(struct ceph_connection *con) * received a socket close event before we had the chance to * shut the socket down. 
*/ - con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); + ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; @@ -597,7 +597,7 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); - con_close_socket(con); + ceph_con_close_socket(con); if (con->in_msg) { WARN_ON(con->in_msg->con != con); ceph_msg_put(con->in_msg); @@ -631,7 +631,7 @@ static void ceph_msg_remove_list(struct list_head *head) } } -static void ceph_con_reset_session(struct ceph_connection *con) +void ceph_con_reset_session(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); @@ -656,10 +656,11 @@ void ceph_con_close(struct ceph_connection *con) dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); con->state = CEPH_CON_S_CLOSED; - con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next connect */ - con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); - con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); - con_flag_clear(con, CEPH_CON_F_BACKOFF); + ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next + connect */ + ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); ceph_con_reset_protocol(con); ceph_con_reset_session(con); @@ -728,7 +729,7 @@ EXPORT_SYMBOL(ceph_con_init); * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. */ -static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) { u32 ret; @@ -743,7 +744,7 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) /* * Discard messages that have been acked by the server. */ -static void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) { struct ceph_msg *msg; u64 seq; @@ -768,8 +769,7 @@ static void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) * reconnect_seq. This avoids gratuitously resending messages that * the server had received and handled prior to reconnect. */ -static void ceph_con_discard_requeued(struct ceph_connection *con, - u64 reconnect_seq) +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) { struct ceph_msg *msg; u64 seq; @@ -1150,8 +1150,8 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) cursor->need_crc = true; } -static void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, - struct ceph_msg *msg, size_t length) +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length) { BUG_ON(!length); BUG_ON(length > msg->data_length); @@ -1168,9 +1168,9 @@ static void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. */ -static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece) +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece) { struct page *page; @@ -1209,8 +1209,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, * Returns true if the result moves the cursor on to the next piece * of the data item. 
*/ -static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, - size_t bytes) +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { bool new_piece; @@ -1284,8 +1283,6 @@ static void prepare_write_message_footer(struct ceph_connection *con) con->out_msg_done = true; } -static void ceph_con_get_out_msg(struct ceph_connection *con); - /* * Prepare headers for the next outgoing message. */ @@ -1355,7 +1352,7 @@ static void prepare_write_message(struct ceph_connection *con) prepare_write_message_footer(con); } - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1376,7 +1373,7 @@ static void prepare_write_ack(struct ceph_connection *con) &con->out_temp_ack); con->out_more = 1; /* more will follow.. eventually.. */ - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1394,7 +1391,7 @@ static void prepare_write_seq(struct ceph_connection *con) con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1415,7 +1412,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) } else { con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); } - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1454,7 +1451,7 @@ static void prepare_write_banner(struct ceph_connection *con) &con->msgr->my_enc_addr); con->out_more = 0; - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } static void __prepare_write_connect(struct ceph_connection *con) @@ -1465,12 +1462,12 @@ static void __prepare_write_connect(struct ceph_connection *con) con->auth->authorizer_buf); con->out_more = 0; - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } static int prepare_write_connect(struct ceph_connection *con) { - unsigned int global_seq = get_global_seq(con->msgr, 0); + unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); int proto; int ret; @@ -1549,9 +1546,8 @@ out: return ret; /* done! 
*/ } -static u32 ceph_crc32c_page(u32 crc, struct page *page, - unsigned int page_offset, - unsigned int length) +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length) { char *kaddr; @@ -1813,7 +1809,7 @@ static int verify_hello(struct ceph_connection *con) return 0; } -static bool addr_is_blank(struct ceph_entity_addr *addr) +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { struct sockaddr_storage ss = addr->in_addr; /* align */ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; @@ -1829,7 +1825,7 @@ static bool addr_is_blank(struct ceph_entity_addr *addr) } } -static int addr_port(struct ceph_entity_addr *addr) +int ceph_addr_port(const struct ceph_entity_addr *addr) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1840,7 +1836,7 @@ static int addr_port(struct ceph_entity_addr *addr) return 0; } -static void addr_set_port(struct ceph_entity_addr *addr, int p) +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1998,7 +1994,7 @@ int ceph_parse_ips(const char *c, const char *end, port = CEPH_MON_PORT; } - addr_set_port(&addr[i], port); + ceph_addr_set_port(&addr[i], port); addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); @@ -2037,7 +2033,7 @@ static int process_banner(struct ceph_connection *con) */ if (memcmp(&con->peer_addr, &con->actual_peer_addr, sizeof(con->peer_addr)) != 0 && - !(addr_is_blank(&con->actual_peer_addr) && + !(ceph_addr_is_blank(&con->actual_peer_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warn("wrong peer, want %s/%u, got %s/%u\n", ceph_pr_addr(&con->peer_addr), @@ -2051,12 +2047,12 @@ static int process_banner(struct ceph_connection *con) /* * did we learn our address? */ - if (addr_is_blank(my_addr)) { + if (ceph_addr_is_blank(my_addr)) { memcpy(&my_addr->in_addr, &con->peer_addr_for_me.in_addr, sizeof(con->peer_addr_for_me.in_addr)); - addr_set_port(my_addr, 0); - encode_my_addr(con->msgr); + ceph_addr_set_port(my_addr, 0); + ceph_encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", ceph_pr_addr(my_addr)); } @@ -2192,8 +2188,8 @@ static int process_connect(struct ceph_connection *con) dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", con->peer_global_seq, le32_to_cpu(con->in_reply.global_seq)); - get_global_seq(con->msgr, - le32_to_cpu(con->in_reply.global_seq)); + ceph_get_global_seq(con->msgr, + le32_to_cpu(con->in_reply.global_seq)); con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -2227,7 +2223,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - con_flag_set(con, CEPH_CON_F_LOSSYTX); + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); con->delay = 0; /* reset backoff memory */ @@ -2351,9 +2347,6 @@ static int read_partial_msg_data(struct ceph_connection *con) /* * read (part of) a message. */ -static int ceph_con_in_msg_alloc(struct ceph_connection *con, - struct ceph_msg_header *hdr, int *skip); - static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; @@ -2515,7 +2508,7 @@ static int read_partial_message(struct ceph_connection *con) * be careful not to do anything that waits on other incoming messages or it * may deadlock. 
*/ -static void process_message(struct ceph_connection *con) +void ceph_con_process_message(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; @@ -2628,7 +2621,7 @@ more: do_next: if (con->state == CEPH_CON_S_OPEN) { - if (con_flag_test_and_clear(con, + if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) { prepare_write_keepalive(con); goto more; @@ -2645,7 +2638,7 @@ do_next: } /* Nothing to do! */ - con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); dout("try_write nothing else to write.\n"); ret = 0; out: @@ -2739,7 +2732,7 @@ more: prepare_read_keepalive_ack(con); break; case CEPH_MSGR_TAG_CLOSE: - con_close_socket(con); + ceph_con_close_socket(con); con->state = CEPH_CON_S_CLOSED; goto out; default: @@ -2764,7 +2757,7 @@ more: } if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; - process_message(con); + ceph_con_process_message(con); if (con->state == CEPH_CON_S_OPEN) prepare_read_tag(con); goto more; @@ -2840,7 +2833,7 @@ static void cancel_con(struct ceph_connection *con) static bool con_sock_closed(struct ceph_connection *con) { - if (!con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ @@ -2867,7 +2860,7 @@ static bool con_backoff(struct ceph_connection *con) { int ret; - if (!con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; ret = queue_con_delay(con, con->delay); @@ -2875,7 +2868,7 @@ static bool con_backoff(struct ceph_connection *con) dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); - con_flag_set(con, CEPH_CON_F_BACKOFF); + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; @@ -2987,7 +2980,7 @@ static void con_fault(struct ceph_connection *con) ceph_con_reset_protocol(con); - if (con_flag_test(con, CEPH_CON_F_LOSSYTX)) { + if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); con->state = CEPH_CON_S_CLOSED; return; @@ -2999,9 +2992,9 @@ static void con_fault(struct ceph_connection *con) /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { + !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. 
*/ @@ -3013,7 +3006,7 @@ static void con_fault(struct ceph_connection *con) if (con->delay > MAX_DELAY_INTERVAL) con->delay = MAX_DELAY_INTERVAL; } - con_flag_set(con, CEPH_CON_F_BACKOFF); + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } @@ -3023,7 +3016,7 @@ void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; msgr->inst.addr.nonce = cpu_to_le32(nonce); - encode_my_addr(msgr); + ceph_encode_my_addr(msgr); } /* @@ -3037,7 +3030,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr, if (myaddr) { memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, sizeof(msgr->inst.addr.in_addr)); - addr_set_port(&msgr->inst.addr, 0); + ceph_addr_set_port(&msgr->inst.addr, 0); } msgr->inst.addr.type = 0; @@ -3047,7 +3040,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr, get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); } while (!msgr->inst.addr.nonce); - encode_my_addr(msgr); + ceph_encode_my_addr(msgr); atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); @@ -3076,8 +3069,8 @@ static void clear_standby(struct ceph_connection *con) dout("clear_standby %p and ++connect_seq\n", con); con->state = CEPH_CON_S_PREOPEN; con->connect_seq++; - WARN_ON(con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); - WARN_ON(con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } @@ -3118,7 +3111,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ - if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -3214,10 +3207,10 @@ void ceph_con_keepalive(struct ceph_connection *con) dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); - con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); - if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); @@ -3423,8 +3416,8 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) * On error (ENOMEM, EAGAIN, ...), * - con->in_msg == NULL */ -static int ceph_con_in_msg_alloc(struct ceph_connection *con, - struct ceph_msg_header *hdr, int *skip) +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) { int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; @@ -3470,7 +3463,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, return ret; } -static void ceph_con_get_out_msg(struct ceph_connection *con) +void ceph_con_get_out_msg(struct ceph_connection *con) { struct ceph_msg *msg; -- cgit From 566050e17e53db283d4e26b73b4b50556f97ce7b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 12 Nov 2020 12:55:39 +0100 Subject: libceph: separate msgr1 protocol implementation In preparation for msgr2, define internal messenger <-> protocol interface (as opposed to external messenger <-> client interface, which is struct ceph_connection_operations) consisting of try_read(), try_write(), revoke(), revoke_incoming(), opened(), reset_session() and reset_protocol() ops. 
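Put differently, these seven entry points form the internal vtable that
a second protocol implementation can fill in; a hypothetical sketch of
them as an explicit ops table (illustrative only -- the patch keeps
them as plain ceph_con_v1_*() functions):

	struct ceph_con_proto_ops {
		int  (*try_read)(struct ceph_connection *con);
		int  (*try_write)(struct ceph_connection *con);
		void (*revoke)(struct ceph_connection *con);
		void (*revoke_incoming)(struct ceph_connection *con);
		bool (*opened)(struct ceph_connection *con);
		void (*reset_session)(struct ceph_connection *con);
		void (*reset_protocol)(struct ceph_connection *con);
	};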
The semantics are exactly the same as they are now. Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 8 +++ net/ceph/messenger.c | 138 ++++++++++++++++++++++++++--------------- 2 files changed, 96 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 93815f1a42b5..8cc8b08eb3dd 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -382,6 +382,14 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); void ceph_con_get_out_msg(struct ceph_connection *con); +int ceph_con_v1_try_read(struct ceph_connection *con); +int ceph_con_v1_try_write(struct ceph_connection *con); +void ceph_con_v1_revoke(struct ceph_connection *con); +void ceph_con_v1_revoke_incoming(struct ceph_connection *con); +bool ceph_con_v1_opened(struct ceph_connection *con); +void ceph_con_v1_reset_session(struct ceph_connection *con); +void ceph_con_v1_reset_protocol(struct ceph_connection *con); + extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 85d20372f923..4ca7d9b594c7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -593,6 +593,11 @@ int ceph_con_close_socket(struct ceph_connection *con) return rc; } +void ceph_con_v1_reset_protocol(struct ceph_connection *con) +{ + con->out_skip = 0; +} + static void ceph_con_reset_protocol(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); @@ -609,7 +614,7 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) con->out_msg = NULL; } - con->out_skip = 0; + ceph_con_v1_reset_protocol(con); } /* @@ -631,6 +636,12 @@ static void ceph_msg_remove_list(struct list_head *head) } } +void ceph_con_v1_reset_session(struct ceph_connection *con) +{ + con->connect_seq = 0; + con->peer_global_seq = 0; +} + void ceph_con_reset_session(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); @@ -643,8 +654,7 @@ void ceph_con_reset_session(struct ceph_connection *con) con->in_seq = 0; con->in_seq_acked = 0; - con->connect_seq = 0; - con->peer_global_seq = 0; + ceph_con_v1_reset_session(con); } /* @@ -692,12 +702,17 @@ void ceph_con_open(struct ceph_connection *con, } EXPORT_SYMBOL(ceph_con_open); +bool ceph_con_v1_opened(struct ceph_connection *con) +{ + return con->connect_seq; +} + /* * return true if this connection ever successfully opened */ bool ceph_con_opened(struct ceph_connection *con) { - return con->connect_seq > 0; + return ceph_con_v1_opened(con); } /* @@ -2552,7 +2567,7 @@ static int read_keepalive_ack(struct ceph_connection *con) * Write something to the socket. Called in a worker thread when the * socket appears to be writeable and we have something ready to send. */ -static int try_write(struct ceph_connection *con) +int ceph_con_v1_try_write(struct ceph_connection *con) { int ret = 1; @@ -2649,7 +2664,7 @@ out: /* * Read what we can from the socket. 
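* Called from the connection workfn; a negative return other than
 * -EAGAIN makes the worker fault the connection.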
*/ -static int try_read(struct ceph_connection *con) +int ceph_con_v1_try_read(struct ceph_connection *con) { int ret = -1; @@ -2930,7 +2945,7 @@ static void ceph_con_workfn(struct work_struct *work) BUG_ON(con->sock); } - ret = try_read(con); + ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -2940,7 +2955,7 @@ static void ceph_con_workfn(struct work_struct *work) break; } - ret = try_write(con); + ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -3116,6 +3131,29 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) } EXPORT_SYMBOL(ceph_con_send); +void ceph_con_v1_revoke(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + WARN_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + WARN_ON(!msg->data_length); + con->out_skip += sizeof_footer(con); + } + /* data, middle, front */ + if (msg->data_length) + con->out_skip += msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, + con->out_kvec_bytes, con->out_skip); +} + /* * Revoke a message that was previously queued for send */ @@ -3129,39 +3167,50 @@ void ceph_msg_revoke(struct ceph_msg *msg) } mutex_lock(&con->mutex); - if (!list_empty(&msg->list_head)) { - dout("%s %p msg %p - was on queue\n", __func__, con, msg); - list_del_init(&msg->list_head); - msg->hdr.seq = 0; - - ceph_msg_put(msg); + if (list_empty(&msg->list_head)) { + WARN_ON(con->out_msg == msg); + dout("%s con %p msg %p not linked\n", __func__, con, msg); + mutex_unlock(&con->mutex); + return; } - if (con->out_msg == msg) { - BUG_ON(con->out_skip); - /* footer */ - if (con->out_msg_done) { - con->out_skip += con_out_kvec_skip(con); - } else { - BUG_ON(!msg->data_length); - con->out_skip += sizeof_footer(con); - } - /* data, middle, front */ - if (msg->data_length) - con->out_skip += msg->cursor.total_resid; - if (msg->middle) - con->out_skip += con_out_kvec_skip(con); - con->out_skip += con_out_kvec_skip(con); - dout("%s %p msg %p - was sending, will write %d skip %d\n", - __func__, con, msg, con->out_kvec_bytes, con->out_skip); - msg->hdr.seq = 0; + dout("%s con %p msg %p was linked\n", __func__, con, msg); + msg->hdr.seq = 0; + ceph_msg_remove(msg); + + if (con->out_msg == msg) { + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was sending\n", __func__, con, msg); + ceph_con_v1_revoke(con); + ceph_msg_put(con->out_msg); con->out_msg = NULL; - ceph_msg_put(msg); + } else { + dout("%s con %p msg %p not current, out_msg %p\n", __func__, + con, msg, con->out_msg); } - mutex_unlock(&con->mutex); } +void ceph_con_v1_revoke_incoming(struct ceph_connection *con) +{ + unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); + unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); + unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); + + /* skip rest of message */ + con->in_base_pos = con->in_base_pos - + sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - + sizeof(struct ceph_msg_footer); + + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + + dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos); +} + /* * Revoke a message that we may be reading data into */ @@ -3176,25 +3225,14 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) mutex_lock(&con->mutex); if (con->in_msg == msg) { - unsigned 
int front_len = le32_to_cpu(con->in_hdr.front_len); - unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); - unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); - - /* skip rest of message */ - dout("%s %p msg %p revoked\n", __func__, con, msg); - con->in_base_pos = con->in_base_pos - - sizeof(struct ceph_msg_header) - - front_len - - middle_len - - data_len - - sizeof(struct ceph_msg_footer); + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was recving\n", __func__, con, msg); + ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; } else { - dout("%s %p in_msg %p msg %p no-op\n", - __func__, con, con->in_msg, msg); + dout("%s con %p msg %p not current, in_msg %p\n", __func__, + con, msg, con->in_msg); } mutex_unlock(&con->mutex); } -- cgit From 2f713615ddd9d805b6c5e79c52e0e11af99d2bf1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 12 Nov 2020 15:48:06 +0100 Subject: libceph: move msgr1 protocol implementation to its own file A pure move, no other changes. Note that ceph_tcp_recv{msg,page}() and ceph_tcp_send{msg,page}() helpers are also moved. msgr2 will bring its own, more efficient, variants based on iov_iter. Switching msgr1 to them was considered but decided against to avoid subtle regressions. Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 1 + net/ceph/Makefile | 3 +- net/ceph/messenger.c | 1495 --------------------------------------- net/ceph/messenger_v1.c | 1502 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1505 insertions(+), 1496 deletions(-) create mode 100644 net/ceph/messenger_v1.c (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 8cc8b08eb3dd..b5268127c55e 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -382,6 +382,7 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); void ceph_con_get_out_msg(struct ceph_connection *con); +/* messenger_v1.c */ int ceph_con_v1_try_read(struct ceph_connection *con); int ceph_con_v1_try_write(struct ceph_connection *con); void ceph_con_v1_revoke(struct ceph_connection *con); diff --git a/net/ceph/Makefile b/net/ceph/Makefile index ce09bb4fb249..df02bd8d6c7b 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -14,4 +14,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_strings.o ceph_hash.o \ - pagevec.o snapshot.o string_table.o + pagevec.o snapshot.o string_table.o \ + messenger_v1.o diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4ca7d9b594c7..544cfdbe52d6 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -137,12 +137,6 @@ bool ceph_con_flag_test_and_set(struct ceph_connection *con, static struct kmem_cache *ceph_msg_cache; -/* static tag bytes (protocol control messages) */ -static char tag_msg = CEPH_MSGR_TAG_MSG; -static char tag_ack = CEPH_MSGR_TAG_ACK; -static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; -static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; - #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; #endif @@ -477,96 +471,6 @@ int ceph_tcp_connect(struct ceph_connection *con) return 0; } -/* - * If @buf is NULL, discard up to @len bytes. 
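- * (MSG_TRUNC below makes recvmsg drop the data without copying
- * it anywhere.)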
- */ -static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) -{ - struct kvec iov = {buf, len}; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (!buf) - msg.msg_flags |= MSG_TRUNC; - - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -static int ceph_tcp_recvpage(struct socket *sock, struct page *page, - int page_offset, size_t length) -{ - struct bio_vec bvec = { - .bv_page = page, - .bv_offset = page_offset, - .bv_len = length - }; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - BUG_ON(page_offset + length > PAGE_SIZE); - iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * write something. @more is true if caller will be sending more data - * shortly. - */ -static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, - size_t kvlen, size_t len, bool more) -{ - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (more) - msg.msg_flags |= MSG_MORE; - else - msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - - r = kernel_sendmsg(sock, &msg, iov, kvlen, len); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST - */ -static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int more) -{ - ssize_t (*sendpage)(struct socket *sock, struct page *page, - int offset, size_t size, int flags); - int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; - int ret; - - /* - * sendpage cannot properly handle pages with page_count == 0, - * we need to fall back to sendmsg if that's the case. - * - * Same goes for slab pages: skb_can_coalesce() allows - * coalescing neighboring slab objects into a single frag which - * triggers one of hardened usercopy checks. - */ - if (sendpage_ok(page)) - sendpage = sock->ops->sendpage; - else - sendpage = sock_no_sendpage; - - ret = sendpage(sock, page, offset, size, flags); - if (ret == -EAGAIN) - ret = 0; - - return ret; -} - /* * Shutdown/close the socket for the given connection. */ @@ -593,11 +497,6 @@ int ceph_con_close_socket(struct ceph_connection *con) return rc; } -void ceph_con_v1_reset_protocol(struct ceph_connection *con) -{ - con->out_skip = 0; -} - static void ceph_con_reset_protocol(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); @@ -636,12 +535,6 @@ static void ceph_msg_remove_list(struct list_head *head) } } -void ceph_con_v1_reset_session(struct ceph_connection *con) -{ - con->connect_seq = 0; - con->peer_global_seq = 0; -} - void ceph_con_reset_session(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); @@ -702,11 +595,6 @@ void ceph_con_open(struct ceph_connection *con, } EXPORT_SYMBOL(ceph_con_open); -bool ceph_con_v1_opened(struct ceph_connection *con) -{ - return con->connect_seq; -} - /* * return true if this connection ever successfully opened */ @@ -739,7 +627,6 @@ void ceph_con_init(struct ceph_connection *con, void *private, } EXPORT_SYMBOL(ceph_con_init); - /* * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. 
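* (Used when initiating a connection, and bumped when the peer
 * replies RETRY_GLOBAL with a higher global_seq.)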
@@ -805,50 +692,6 @@ void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) } } -static void con_out_kvec_reset(struct ceph_connection *con) -{ - BUG_ON(con->out_skip); - - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - con->out_kvec_cur = &con->out_kvec[0]; -} - -static void con_out_kvec_add(struct ceph_connection *con, - size_t size, void *data) -{ - int index = con->out_kvec_left; - - BUG_ON(con->out_skip); - BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); - - con->out_kvec[index].iov_len = size; - con->out_kvec[index].iov_base = data; - con->out_kvec_left++; - con->out_kvec_bytes += size; -} - -/* - * Chop off a kvec from the end. Return residual number of bytes for - * that kvec, i.e. how many bytes would have been written if the kvec - * hadn't been nuked. - */ -static int con_out_kvec_skip(struct ceph_connection *con) -{ - int off = con->out_kvec_cur - con->out_kvec; - int skip = 0; - - if (con->out_kvec_bytes > 0) { - skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; - BUG_ON(con->out_kvec_bytes < skip); - BUG_ON(!con->out_kvec_left); - con->out_kvec_bytes -= skip; - con->out_kvec_left--; - } - - return skip; -} - #ifdef CONFIG_BLOCK /* @@ -1260,307 +1103,6 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) cursor->need_crc = new_piece; } -static size_t sizeof_footer(struct ceph_connection *con) -{ - return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? - sizeof(struct ceph_msg_footer) : - sizeof(struct ceph_msg_footer_old); -} - -static void prepare_message_data(struct ceph_msg *msg, u32 data_len) -{ - /* Initialize data cursor */ - - ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); -} - -/* - * Prepare footer for currently outgoing message, and finish things - * off. Assumes out_kvec* are already valid.. we just add on to the end. - */ -static void prepare_write_message_footer(struct ceph_connection *con) -{ - struct ceph_msg *m = con->out_msg; - - m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; - - dout("prepare_write_message_footer %p\n", con); - con_out_kvec_add(con, sizeof_footer(con), &m->footer); - if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { - if (con->ops->sign_message) - con->ops->sign_message(m); - else - m->footer.sig = 0; - } else { - m->old_footer.flags = m->footer.flags; - } - con->out_more = m->more_to_follow; - con->out_msg_done = true; -} - -/* - * Prepare headers for the next outgoing message. - */ -static void prepare_write_message(struct ceph_connection *con) -{ - struct ceph_msg *m; - u32 crc; - - con_out_kvec_reset(con); - con->out_msg_done = false; - - /* Sneak an ack in there first? If we can get it into the same - * TCP packet that's a good thing. 
*/ - if (con->in_seq > con->in_seq_acked) { - con->in_seq_acked = con->in_seq; - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - } - - ceph_con_get_out_msg(con); - m = con->out_msg; - - dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", - m, con->out_seq, le16_to_cpu(m->hdr.type), - le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - m->data_length); - WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); - WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); - - /* tag + hdr + front + middle */ - con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); - con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); - - if (m->middle) - con_out_kvec_add(con, m->middle->vec.iov_len, - m->middle->vec.iov_base); - - /* fill in hdr crc and finalize hdr */ - crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); - con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); - - /* fill in front and middle crc, footer */ - crc = crc32c(0, m->front.iov_base, m->front.iov_len); - con->out_msg->footer.front_crc = cpu_to_le32(crc); - if (m->middle) { - crc = crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len); - con->out_msg->footer.middle_crc = cpu_to_le32(crc); - } else - con->out_msg->footer.middle_crc = 0; - dout("%s front_crc %u middle_crc %u\n", __func__, - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - con->out_msg->footer.flags = 0; - - /* is there a data payload? */ - con->out_msg->footer.data_crc = 0; - if (m->data_length) { - prepare_message_data(con->out_msg, m->data_length); - con->out_more = 1; /* data + footer will follow */ - } else { - /* no, queue up footer too and be done */ - prepare_write_message_footer(con); - } - - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -/* - * Prepare an ack. - */ -static void prepare_write_ack(struct ceph_connection *con) -{ - dout("prepare_write_ack %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - con->out_more = 1; /* more will follow.. eventually.. */ - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -/* - * Prepare to share the seq during handshake - */ -static void prepare_write_seq(struct ceph_connection *con) -{ - dout("prepare_write_seq %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -/* - * Prepare to write keepalive byte. 
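- * With KEEPALIVE2 the tag is followed by a timestamp, which the
- * peer echoes back (see read_keepalive_ack()).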
- */ -static void prepare_write_keepalive(struct ceph_connection *con) -{ - dout("prepare_write_keepalive %p\n", con); - con_out_kvec_reset(con); - if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec64 now; - - ktime_get_real_ts64(&now); - con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); - ceph_encode_timespec64(&con->out_temp_keepalive2, &now); - con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), - &con->out_temp_keepalive2); - } else { - con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); - } - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -/* - * Connection negotiation. - */ - -static int get_connect_authorizer(struct ceph_connection *con) -{ - struct ceph_auth_handshake *auth; - int auth_proto; - - if (!con->ops->get_authorizer) { - con->auth = NULL; - con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; - con->out_connect.authorizer_len = 0; - return 0; - } - - auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); - if (IS_ERR(auth)) - return PTR_ERR(auth); - - con->auth = auth; - con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); - con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); - return 0; -} - -/* - * We connected to a peer and are saying hello. - */ -static void prepare_write_banner(struct ceph_connection *con) -{ - con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); - con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), - &con->msgr->my_enc_addr); - - con->out_more = 0; - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -static void __prepare_write_connect(struct ceph_connection *con) -{ - con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); - if (con->auth) - con_out_kvec_add(con, con->auth->authorizer_buf_len, - con->auth->authorizer_buf); - - con->out_more = 0; - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -static int prepare_write_connect(struct ceph_connection *con) -{ - unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); - int proto; - int ret; - - switch (con->peer_name.type) { - case CEPH_ENTITY_TYPE_MON: - proto = CEPH_MONC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_OSD: - proto = CEPH_OSDC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_MDS: - proto = CEPH_MDSC_PROTOCOL; - break; - default: - BUG(); - } - - dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, - con->connect_seq, global_seq, proto); - - con->out_connect.features = - cpu_to_le64(from_msgr(con->msgr)->supported_features); - con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = cpu_to_le32(global_seq); - con->out_connect.protocol_version = cpu_to_le32(proto); - con->out_connect.flags = 0; - - ret = get_connect_authorizer(con); - if (ret) - return ret; - - __prepare_write_connect(con); - return 0; -} - -/* - * write as much of pending kvecs to the socket as we can. 
- * 1 -> done - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_kvec(struct ceph_connection *con) -{ - int ret; - - dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); - while (con->out_kvec_bytes > 0) { - ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, - con->out_kvec_left, con->out_kvec_bytes, - con->out_more); - if (ret <= 0) - goto out; - con->out_kvec_bytes -= ret; - if (con->out_kvec_bytes == 0) - break; /* done */ - - /* account for full iov entries consumed */ - while (ret >= con->out_kvec_cur->iov_len) { - BUG_ON(!con->out_kvec_left); - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; - } - /* and for a partially-consumed entry */ - if (ret) { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; - } - } - con->out_kvec_left = 0; - ret = 1; -out: - dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, - con->out_kvec_bytes, con->out_kvec_left, ret); - return ret; /* done! */ -} - u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, unsigned int length) { @@ -1573,256 +1115,6 @@ u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, return crc; } -/* - * Write as much message data payload as we can. If we finish, queue - * up the footer. - * 1 -> done, footer is now queued in out_kvec[]. - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_message_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->out_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - u32 crc; - - dout("%s %p msg %p\n", __func__, con, msg); - - if (!msg->num_data_items) - return -EINVAL; - - /* - * Iterate through each page that contains data to be - * written, and send as much as possible for each. - * - * If we are calculating the data crc (the default), we will - * need to map the page. If we have no pages, they have - * been revoked, so use the zero page. - */ - crc = do_datacrc ? 
le32_to_cpu(msg->footer.data_crc) : 0; - while (cursor->total_resid) { - struct page *page; - size_t page_offset; - size_t length; - int ret; - - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - if (length == cursor->total_resid) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, - more); - if (ret <= 0) { - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - - return ret; - } - if (do_datacrc && cursor->need_crc) - crc = ceph_crc32c_page(crc, page, page_offset, length); - ceph_msg_data_advance(cursor, (size_t)ret); - } - - dout("%s %p msg %p done\n", __func__, con, msg); - - /* prepare and queue up footer, too */ - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - else - msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - con_out_kvec_reset(con); - prepare_write_message_footer(con); - - return 1; /* must return > 0 to indicate success */ -} - -/* - * write some zeros - */ -static int write_partial_skip(struct ceph_connection *con) -{ - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - int ret; - - dout("%s %p %d left\n", __func__, con, con->out_skip); - while (con->out_skip > 0) { - size_t size = min(con->out_skip, (int) PAGE_SIZE); - - if (size == con->out_skip) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, - more); - if (ret <= 0) - goto out; - con->out_skip -= ret; - } - ret = 1; -out: - return ret; -} - -/* - * Prepare to read connection handshake, or an ack. - */ -static void prepare_read_banner(struct ceph_connection *con) -{ - dout("prepare_read_banner %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_connect(struct ceph_connection *con) -{ - dout("prepare_read_connect %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_ack(struct ceph_connection *con) -{ - dout("prepare_read_ack %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_seq(struct ceph_connection *con) -{ - dout("prepare_read_seq %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_SEQ; -} - -static void prepare_read_tag(struct ceph_connection *con) -{ - dout("prepare_read_tag %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_READY; -} - -static void prepare_read_keepalive_ack(struct ceph_connection *con) -{ - dout("prepare_read_keepalive_ack %p\n", con); - con->in_base_pos = 0; -} - -/* - * Prepare to read a message. 
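- * The ceph_msg itself is allocated later, in read_partial_message(),
- * once the header has been read and validated.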
- */ -static int prepare_read_message(struct ceph_connection *con) -{ - dout("prepare_read_message %p\n", con); - BUG_ON(con->in_msg != NULL); - con->in_base_pos = 0; - con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; - return 0; -} - - -static int read_partial(struct ceph_connection *con, - int end, int size, void *object) -{ - while (con->in_base_pos < end) { - int left = end - con->in_base_pos; - int have = size - left; - int ret = ceph_tcp_recvmsg(con->sock, object + have, left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } - return 1; -} - - -/* - * Read all or part of the connect-side handshake on a new connection - */ -static int read_partial_banner(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_banner %p at %d\n", con, con->in_base_pos); - - /* peer's banner */ - size = strlen(CEPH_BANNER); - end = size; - ret = read_partial(con, end, size, con->in_banner); - if (ret <= 0) - goto out; - - size = sizeof (con->actual_peer_addr); - end += size; - ret = read_partial(con, end, size, &con->actual_peer_addr); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->actual_peer_addr); - - size = sizeof (con->peer_addr_for_me); - end += size; - ret = read_partial(con, end, size, &con->peer_addr_for_me); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->peer_addr_for_me); - -out: - return ret; -} - -static int read_partial_connect(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - - size = sizeof (con->in_reply); - end = size; - ret = read_partial(con, end, size, &con->in_reply); - if (ret <= 0) - goto out; - - if (con->auth) { - size = le32_to_cpu(con->in_reply.authorizer_len); - if (size > con->auth->authorizer_reply_buf_len) { - pr_err("authorizer reply too big: %d > %zu\n", size, - con->auth->authorizer_reply_buf_len); - ret = -EINVAL; - goto out; - } - - end += size; - ret = read_partial(con, end, size, - con->auth->authorizer_reply_buf); - if (ret <= 0) - goto out; - } - - dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", - con, (int)con->in_reply.tag, - le32_to_cpu(con->in_reply.connect_seq), - le32_to_cpu(con->in_reply.global_seq)); -out: - return ret; -} - -/* - * Verify the hello banner looks okay. - */ -static int verify_hello(struct ceph_connection *con) -{ - if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { - pr_err("connect to %s got bad banner\n", - ceph_pr_addr(&con->peer_addr)); - con->error_msg = "protocol error, bad banner"; - return -1; - } - return 0; -} bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { @@ -2032,492 +1324,6 @@ bad: return ret; } -static int process_banner(struct ceph_connection *con) -{ - struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; - - dout("process_banner on %p\n", con); - - if (verify_hello(con) < 0) - return -1; - - /* - * Make sure the other end is who we wanted. note that the other - * end may not yet know their ip address, so if it's 0.0.0.0, give - * them the benefit of the doubt. 
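- * (In that case only the nonce still has to match.)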
- */ - if (memcmp(&con->peer_addr, &con->actual_peer_addr, - sizeof(con->peer_addr)) != 0 && - !(ceph_addr_is_blank(&con->actual_peer_addr) && - con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warn("wrong peer, want %s/%u, got %s/%u\n", - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->peer_addr.nonce), - ceph_pr_addr(&con->actual_peer_addr), - le32_to_cpu(con->actual_peer_addr.nonce)); - con->error_msg = "wrong peer at address"; - return -1; - } - - /* - * did we learn our address? - */ - if (ceph_addr_is_blank(my_addr)) { - memcpy(&my_addr->in_addr, - &con->peer_addr_for_me.in_addr, - sizeof(con->peer_addr_for_me.in_addr)); - ceph_addr_set_port(my_addr, 0); - ceph_encode_my_addr(con->msgr); - dout("process_banner learned my addr is %s\n", - ceph_pr_addr(my_addr)); - } - - return 0; -} - -static int process_connect(struct ceph_connection *con) -{ - u64 sup_feat = from_msgr(con->msgr)->supported_features; - u64 req_feat = from_msgr(con->msgr)->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); - int ret; - - dout("process_connect on %p tag %d\n", con, (int)con->in_tag); - - if (con->auth) { - int len = le32_to_cpu(con->in_reply.authorizer_len); - - /* - * Any connection that defines ->get_authorizer() - * should also define ->add_authorizer_challenge() and - * ->verify_authorizer_reply(). - * - * See get_connect_authorizer(). - */ - if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { - ret = con->ops->add_authorizer_challenge( - con, con->auth->authorizer_reply_buf, len); - if (ret < 0) - return ret; - - con_out_kvec_reset(con); - __prepare_write_connect(con); - prepare_read_connect(con); - return 0; - } - - if (len) { - ret = con->ops->verify_authorizer_reply(con); - if (ret < 0) { - con->error_msg = "bad authorize reply"; - return ret; - } - } - } - - switch (con->in_reply.tag) { - case CEPH_MSGR_TAG_FEATURES: - pr_err("%s%lld %s feature set mismatch," - " my %llx < server's %llx, missing %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - sup_feat, server_feat, server_feat & ~sup_feat); - con->error_msg = "missing required protocol features"; - return -1; - - case CEPH_MSGR_TAG_BADPROTOVER: - pr_err("%s%lld %s protocol version mismatch," - " my %d != server's %d\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->out_connect.protocol_version), - le32_to_cpu(con->in_reply.protocol_version)); - con->error_msg = "protocol version mismatch"; - return -1; - - case CEPH_MSGR_TAG_BADAUTHORIZER: - con->auth_retry++; - dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, - con->auth_retry); - if (con->auth_retry == 2) { - con->error_msg = "connect authorization failure"; - return -1; - } - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RESETSESSION: - /* - * If we connected with a large connect_seq but the peer - * has no record of a session with us (no connection, or - * connect_seq == 0), they will send RESETSESION to indicate - * that they must have reset their session, and may have - * dropped messages. 
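- * We respond by zeroing our sequence numbers, notifying the client
- * via ->peer_reset() and redoing the handshake.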
- */ - dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_reply.connect_seq)); - pr_info("%s%lld %s session reset\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr)); - ceph_con_reset_session(con); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - - /* Tell ceph about it. */ - mutex_unlock(&con->mutex); - if (con->ops->peer_reset) - con->ops->peer_reset(con); - mutex_lock(&con->mutex); - if (con->state != CEPH_CON_S_V1_CONNECT_MSG) - return -EAGAIN; - break; - - case CEPH_MSGR_TAG_RETRY_SESSION: - /* - * If we sent a smaller connect_seq than the peer has, try - * again with a larger value. - */ - dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", - le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_reply.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RETRY_GLOBAL: - /* - * If we sent a smaller global_seq than the peer has, try - * again with a larger value. - */ - dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.global_seq)); - ceph_get_global_seq(con->msgr, - le32_to_cpu(con->in_reply.global_seq)); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_SEQ: - case CEPH_MSGR_TAG_READY: - if (req_feat & ~server_feat) { - pr_err("%s%lld %s protocol feature mismatch," - " my required %llx > server's %llx, need %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - req_feat, server_feat, req_feat & ~server_feat); - con->error_msg = "missing required protocol features"; - return -1; - } - - WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); - con->state = CEPH_CON_S_OPEN; - con->auth_retry = 0; /* we authenticated; clear flag */ - con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); - con->connect_seq++; - con->peer_features = server_feat; - dout("process_connect got READY gseq %d cseq %d (%d)\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.connect_seq), - con->connect_seq); - WARN_ON(con->connect_seq != - le32_to_cpu(con->in_reply.connect_seq)); - - if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); - - con->delay = 0; /* reset backoff memory */ - - if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { - prepare_write_seq(con); - prepare_read_seq(con); - } else { - prepare_read_tag(con); - } - break; - - case CEPH_MSGR_TAG_WAIT: - /* - * If there is a connection race (we are opening - * connections to each other), one of us may just have - * to WAIT. This shouldn't happen if we are the - * client. - */ - con->error_msg = "protocol error, got WAIT as client"; - return -1; - - default: - con->error_msg = "protocol error, garbage tag during connect"; - return -1; - } - return 0; -} - - -/* - * read (part of) an ack - */ -static int read_partial_ack(struct ceph_connection *con) -{ - int size = sizeof (con->in_temp_ack); - int end = size; - - return read_partial(con, end, size, &con->in_temp_ack); -} - -/* - * We can finally discard anything that's been acked. 
- */ -static void process_ack(struct ceph_connection *con) -{ - u64 ack = le64_to_cpu(con->in_temp_ack); - - if (con->in_tag == CEPH_MSGR_TAG_ACK) - ceph_con_discard_sent(con, ack); - else - ceph_con_discard_requeued(con, ack); - - prepare_read_tag(con); -} - - -static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, - unsigned int sec_len, u32 *crc) -{ - int ret, left; - - BUG_ON(!section); - - while (section->iov_len < sec_len) { - BUG_ON(section->iov_base == NULL); - left = sec_len - section->iov_len; - ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + - section->iov_len, left); - if (ret <= 0) - return ret; - section->iov_len += ret; - } - if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, section->iov_len); - - return 1; -} - -static int read_partial_msg_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - struct page *page; - size_t page_offset; - size_t length; - u32 crc = 0; - int ret; - - if (!msg->num_data_items) - return -EIO; - - if (do_datacrc) - crc = con->in_data_crc; - while (cursor->total_resid) { - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); - if (ret <= 0) { - if (do_datacrc) - con->in_data_crc = crc; - - return ret; - } - - if (do_datacrc) - crc = ceph_crc32c_page(crc, page, page_offset, ret); - ceph_msg_data_advance(cursor, (size_t)ret); - } - if (do_datacrc) - con->in_data_crc = crc; - - return 1; /* must return > 0 to indicate success */ -} - -/* - * read (part of) a message. - */ -static int read_partial_message(struct ceph_connection *con) -{ - struct ceph_msg *m = con->in_msg; - int size; - int end; - int ret; - unsigned int front_len, middle_len, data_len; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); - u64 seq; - u32 crc; - - dout("read_partial_message con %p msg %p\n", con, m); - - /* header */ - size = sizeof (con->in_hdr); - end = size; - ret = read_partial(con, end, size, &con->in_hdr); - if (ret <= 0) - return ret; - - crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); - if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr crc %u != expected %u\n", - crc, con->in_hdr.crc); - return -EBADMSG; - } - - front_len = le32_to_cpu(con->in_hdr.front_len); - if (front_len > CEPH_MSG_MAX_FRONT_LEN) - return -EIO; - middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) - return -EIO; - data_len = le32_to_cpu(con->in_hdr.data_len); - if (data_len > CEPH_MSG_MAX_DATA_LEN) - return -EIO; - - /* verify seq# */ - seq = le64_to_cpu(con->in_hdr.seq); - if ((s64)seq - (s64)con->in_seq < 1) { - pr_info("skipping %s%lld %s seq %lld expected %lld\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - seq, con->in_seq + 1); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - return 1; - } else if ((s64)seq - (s64)con->in_seq > 1) { - pr_err("read_partial_message bad seq %lld expected %lld\n", - seq, con->in_seq + 1); - con->error_msg = "bad message sequence # for incoming message"; - return -EBADE; - } - - /* allocate message? 
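(the client's alloc_msg hook may tell us to skip it instead)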
*/ - if (!con->in_msg) { - int skip = 0; - - dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - front_len, data_len); - ret = ceph_con_in_msg_alloc(con, &con->in_hdr, &skip); - if (ret < 0) - return ret; - - BUG_ON(!con->in_msg ^ skip); - if (skip) { - /* skip this message */ - dout("alloc_msg said skip message\n"); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - return 1; - } - - BUG_ON(!con->in_msg); - BUG_ON(con->in_msg->con != con); - m = con->in_msg; - m->front.iov_len = 0; /* haven't read it yet */ - if (m->middle) - m->middle->vec.iov_len = 0; - - /* prepare for data payload, if any */ - - if (data_len) - prepare_message_data(con->in_msg, data_len); - } - - /* front */ - ret = read_partial_message_section(con, &m->front, front_len, - &con->in_front_crc); - if (ret <= 0) - return ret; - - /* middle */ - if (m->middle) { - ret = read_partial_message_section(con, &m->middle->vec, - middle_len, - &con->in_middle_crc); - if (ret <= 0) - return ret; - } - - /* (page) data */ - if (data_len) { - ret = read_partial_msg_data(con); - if (ret <= 0) - return ret; - } - - /* footer */ - size = sizeof_footer(con); - end += size; - ret = read_partial(con, end, size, &m->footer); - if (ret <= 0) - return ret; - - if (!need_sign) { - m->footer.flags = m->old_footer.flags; - m->footer.sig = 0; - } - - dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", - m, front_len, m->footer.front_crc, middle_len, - m->footer.middle_crc, data_len, m->footer.data_crc); - - /* crc ok? */ - if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { - pr_err("read_partial_message %p front crc %u != exp. %u\n", - m, con->in_front_crc, m->footer.front_crc); - return -EBADMSG; - } - if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { - pr_err("read_partial_message %p middle crc %u != exp %u\n", - m, con->in_middle_crc, m->footer.middle_crc); - return -EBADMSG; - } - if (do_datacrc && - (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && - con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { - pr_err("read_partial_message %p data crc %u != exp. %u\n", m, - con->in_data_crc, le32_to_cpu(m->footer.data_crc)); - return -EBADMSG; - } - - if (need_sign && con->ops->check_message_signature && - con->ops->check_message_signature(m)) { - pr_err("read_partial_message %p signature check failed\n", m); - return -EBADMSG; - } - - return 1; /* done! */ -} - /* * Process message. This happens in the worker thread. The callback should * be careful not to do anything that waits on other incoming messages or it @@ -2551,263 +1357,6 @@ void ceph_con_process_message(struct ceph_connection *con) mutex_lock(&con->mutex); } -static int read_keepalive_ack(struct ceph_connection *con) -{ - struct ceph_timespec ceph_ts; - size_t size = sizeof(ceph_ts); - int ret = read_partial(con, size, size, &ceph_ts); - if (ret <= 0) - return ret; - ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); - prepare_read_tag(con); - return 1; -} - -/* - * Write something to the socket. Called in a worker thread when the - * socket appears to be writeable and we have something ready to send. - */ -int ceph_con_v1_try_write(struct ceph_connection *con) -{ - int ret = 1; - - dout("try_write start %p state %d\n", con, con->state); - if (con->state != CEPH_CON_S_PREOPEN && - con->state != CEPH_CON_S_V1_BANNER && - con->state != CEPH_CON_S_V1_CONNECT_MSG && - con->state != CEPH_CON_S_OPEN) - return 0; - - /* open the socket first? 
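(PREOPEN means ceph_con_open() was called but no socket exists yet)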
*/ - if (con->state == CEPH_CON_S_PREOPEN) { - BUG_ON(con->sock); - con->state = CEPH_CON_S_V1_BANNER; - - con_out_kvec_reset(con); - prepare_write_banner(con); - prepare_read_banner(con); - - BUG_ON(con->in_msg); - con->in_tag = CEPH_MSGR_TAG_READY; - dout("try_write initiating connect on %p new state %d\n", - con, con->state); - ret = ceph_tcp_connect(con); - if (ret < 0) { - con->error_msg = "connect error"; - goto out; - } - } - -more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); - BUG_ON(!con->sock); - - /* kvec data queued? */ - if (con->out_kvec_left) { - ret = write_partial_kvec(con); - if (ret <= 0) - goto out; - } - if (con->out_skip) { - ret = write_partial_skip(con); - if (ret <= 0) - goto out; - } - - /* msg pages? */ - if (con->out_msg) { - if (con->out_msg_done) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; /* we're done with this one */ - goto do_next; - } - - ret = write_partial_message_data(con); - if (ret == 1) - goto more; /* we need to send the footer, too! */ - if (ret == 0) - goto out; - if (ret < 0) { - dout("try_write write_partial_message_data err %d\n", - ret); - goto out; - } - } - -do_next: - if (con->state == CEPH_CON_S_OPEN) { - if (ceph_con_flag_test_and_clear(con, - CEPH_CON_F_KEEPALIVE_PENDING)) { - prepare_write_keepalive(con); - goto more; - } - /* is anything else pending? */ - if (!list_empty(&con->out_queue)) { - prepare_write_message(con); - goto more; - } - if (con->in_seq > con->in_seq_acked) { - prepare_write_ack(con); - goto more; - } - } - - /* Nothing to do! */ - ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); - dout("try_write nothing else to write.\n"); - ret = 0; -out: - dout("try_write done on %p ret %d\n", con, ret); - return ret; -} - -/* - * Read what we can from the socket. - */ -int ceph_con_v1_try_read(struct ceph_connection *con) -{ - int ret = -1; - -more: - dout("try_read start %p state %d\n", con, con->state); - if (con->state != CEPH_CON_S_V1_BANNER && - con->state != CEPH_CON_S_V1_CONNECT_MSG && - con->state != CEPH_CON_S_OPEN) - return 0; - - BUG_ON(!con->sock); - - dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, - con->in_base_pos); - - if (con->state == CEPH_CON_S_V1_BANNER) { - ret = read_partial_banner(con); - if (ret <= 0) - goto out; - ret = process_banner(con); - if (ret < 0) - goto out; - - con->state = CEPH_CON_S_V1_CONNECT_MSG; - - /* - * Received banner is good, exchange connection info. - * Do not reset out_kvec, as sending our banner raced - * with receiving peer banner after connect completed. - */ - ret = prepare_write_connect(con); - if (ret < 0) - goto out; - prepare_read_connect(con); - - /* Send connection info before awaiting response */ - goto out; - } - - if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { - ret = read_partial_connect(con); - if (ret <= 0) - goto out; - ret = process_connect(con); - if (ret < 0) - goto out; - goto more; - } - - WARN_ON(con->state != CEPH_CON_S_OPEN); - - if (con->in_base_pos < 0) { - /* - * skipping + discarding content. - */ - ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); - if (ret <= 0) - goto out; - dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); - con->in_base_pos += ret; - if (con->in_base_pos) - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) { - /* - * what's next? 
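- * read a single tag byte to find out.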
- */ - ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); - if (ret <= 0) - goto out; - dout("try_read got tag %d\n", (int)con->in_tag); - switch (con->in_tag) { - case CEPH_MSGR_TAG_MSG: - prepare_read_message(con); - break; - case CEPH_MSGR_TAG_ACK: - prepare_read_ack(con); - break; - case CEPH_MSGR_TAG_KEEPALIVE2_ACK: - prepare_read_keepalive_ack(con); - break; - case CEPH_MSGR_TAG_CLOSE: - ceph_con_close_socket(con); - con->state = CEPH_CON_S_CLOSED; - goto out; - default: - goto bad_tag; - } - } - if (con->in_tag == CEPH_MSGR_TAG_MSG) { - ret = read_partial_message(con); - if (ret <= 0) { - switch (ret) { - case -EBADMSG: - con->error_msg = "bad crc/signature"; - fallthrough; - case -EBADE: - ret = -EIO; - break; - case -EIO: - con->error_msg = "io error"; - break; - } - goto out; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) - goto more; - ceph_con_process_message(con); - if (con->state == CEPH_CON_S_OPEN) - prepare_read_tag(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_ACK || - con->in_tag == CEPH_MSGR_TAG_SEQ) { - /* - * the final handshake seq exchange is semantically - * equivalent to an ACK - */ - ret = read_partial_ack(con); - if (ret <= 0) - goto out; - process_ack(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { - ret = read_keepalive_ack(con); - if (ret <= 0) - goto out; - goto more; - } - -out: - dout("try_read done on %p ret %d\n", con, ret); - return ret; - -bad_tag: - pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); - con->error_msg = "protocol error, garbage tag"; - ret = -1; - goto out; -} - - /* * Atomically queue work on a connection after the specified delay. * Bump @con reference to avoid races with connection teardown. @@ -3026,7 +1575,6 @@ static void con_fault(struct ceph_connection *con) } } - void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; @@ -3131,29 +1679,6 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) } EXPORT_SYMBOL(ceph_con_send); -void ceph_con_v1_revoke(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->out_msg; - - WARN_ON(con->out_skip); - /* footer */ - if (con->out_msg_done) { - con->out_skip += con_out_kvec_skip(con); - } else { - WARN_ON(!msg->data_length); - con->out_skip += sizeof_footer(con); - } - /* data, middle, front */ - if (msg->data_length) - con->out_skip += msg->cursor.total_resid; - if (msg->middle) - con->out_skip += con_out_kvec_skip(con); - con->out_skip += con_out_kvec_skip(con); - - dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, - con->out_kvec_bytes, con->out_skip); -} - /* * Revoke a message that was previously queued for send */ @@ -3191,26 +1716,6 @@ void ceph_msg_revoke(struct ceph_msg *msg) mutex_unlock(&con->mutex); } -void ceph_con_v1_revoke_incoming(struct ceph_connection *con) -{ - unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); - unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); - unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); - - /* skip rest of message */ - con->in_base_pos = con->in_base_pos - - sizeof(struct ceph_msg_header) - - front_len - - middle_len - - data_len - - sizeof(struct ceph_msg_footer); - - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - - dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos); -} - /* * Revoke a message that we may be reading data into */ diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c new file mode 100644 index 
000000000000..899038a9678e --- /dev/null +++ b/net/ceph/messenger_v1.c @@ -0,0 +1,1502 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* static tag bytes (protocol control messages) */ +static char tag_msg = CEPH_MSGR_TAG_MSG; +static char tag_ack = CEPH_MSGR_TAG_ACK; +static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; + +/* + * If @buf is NULL, discard up to @len bytes. + */ +static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) +{ + struct kvec iov = {buf, len}; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (!buf) + msg.msg_flags |= MSG_TRUNC; + + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = page_offset, + .bv_len = length + }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + BUG_ON(page_offset + length > PAGE_SIZE); + iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * write something. @more is true if caller will be sending more data + * shortly. + */ +static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, + size_t kvlen, size_t len, bool more) +{ + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (more) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST + */ +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int more) +{ + ssize_t (*sendpage)(struct socket *sock, struct page *page, + int offset, size_t size, int flags); + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; + int ret; + + /* + * sendpage cannot properly handle pages with page_count == 0, + * we need to fall back to sendmsg if that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag which + * triggers one of hardened usercopy checks. + */ + if (sendpage_ok(page)) + sendpage = sock->ops->sendpage; + else + sendpage = sock_no_sendpage; + + ret = sendpage(sock, page, offset, size, flags); + if (ret == -EAGAIN) + ret = 0; + + return ret; +} + +static void con_out_kvec_reset(struct ceph_connection *con) +{ + BUG_ON(con->out_skip); + + con->out_kvec_left = 0; + con->out_kvec_bytes = 0; + con->out_kvec_cur = &con->out_kvec[0]; +} + +static void con_out_kvec_add(struct ceph_connection *con, + size_t size, void *data) +{ + int index = con->out_kvec_left; + + BUG_ON(con->out_skip); + BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + + con->out_kvec[index].iov_len = size; + con->out_kvec[index].iov_base = data; + con->out_kvec_left++; + con->out_kvec_bytes += size; +} + +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. 
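+ * Used by ceph_con_v1_revoke() to work out how many already-queued
+ * bytes must be skipped (zeroed out) on the wire.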
+ */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int off = con->out_kvec_cur - con->out_kvec; + int skip = 0; + + if (con->out_kvec_bytes > 0) { + skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; + BUG_ON(con->out_kvec_bytes < skip); + BUG_ON(!con->out_kvec_left); + con->out_kvec_bytes -= skip; + con->out_kvec_left--; + } + + return skip; +} + +static size_t sizeof_footer(struct ceph_connection *con) +{ + return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? + sizeof(struct ceph_msg_footer) : + sizeof(struct ceph_msg_footer_old); +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ + /* Initialize data cursor */ + + ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); +} + +/* + * Prepare footer for currently outgoing message, and finish things + * off. Assumes out_kvec* are already valid.. we just add on to the end. + */ +static void prepare_write_message_footer(struct ceph_connection *con) +{ + struct ceph_msg *m = con->out_msg; + + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + + dout("prepare_write_message_footer %p\n", con); + con_out_kvec_add(con, sizeof_footer(con), &m->footer); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { + if (con->ops->sign_message) + con->ops->sign_message(m); + else + m->footer.sig = 0; + } else { + m->old_footer.flags = m->footer.flags; + } + con->out_more = m->more_to_follow; + con->out_msg_done = true; +} + +/* + * Prepare headers for the next outgoing message. + */ +static void prepare_write_message(struct ceph_connection *con) +{ + struct ceph_msg *m; + u32 crc; + + con_out_kvec_reset(con); + con->out_msg_done = false; + + /* Sneak an ack in there first? If we can get it into the same + * TCP packet that's a good thing. */ + if (con->in_seq > con->in_seq_acked) { + con->in_seq_acked = con->in_seq; + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + } + + ceph_con_get_out_msg(con); + m = con->out_msg; + + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", + m, con->out_seq, le16_to_cpu(m->hdr.type), + le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), + m->data_length); + WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); + + /* tag + hdr + front + middle */ + con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); + con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + + if (m->middle) + con_out_kvec_add(con, m->middle->vec.iov_len, + m->middle->vec.iov_base); + + /* fill in hdr crc and finalize hdr */ + crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); + con->out_msg->hdr.crc = cpu_to_le32(crc); + memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + + /* fill in front and middle crc, footer */ + crc = crc32c(0, m->front.iov_base, m->front.iov_len); + con->out_msg->footer.front_crc = cpu_to_le32(crc); + if (m->middle) { + crc = crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len); + con->out_msg->footer.middle_crc = cpu_to_le32(crc); + } else + con->out_msg->footer.middle_crc = 0; + dout("%s front_crc %u middle_crc %u\n", __func__, + le32_to_cpu(con->out_msg->footer.front_crc), + le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; + + /* is there a data payload? 
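(if so, the footer is deferred until write_partial_message_data() has streamed the pages and filled in data_crc)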
*/ + con->out_msg->footer.data_crc = 0; + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); + con->out_more = 1; /* data + footer will follow */ + } else { + /* no, queue up footer too and be done */ + prepare_write_message_footer(con); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare an ack. + */ +static void prepare_write_ack(struct ceph_connection *con) +{ + dout("prepare_write_ack %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + + con->out_more = 1; /* more will follow.. eventually.. */ + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to write keepalive byte. + */ +static void prepare_write_keepalive(struct ceph_connection *con) +{ + dout("prepare_write_keepalive %p\n", con); + con_out_kvec_reset(con); + if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { + struct timespec64 now; + + ktime_get_real_ts64(&now); + con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); + ceph_encode_timespec64(&con->out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), + &con->out_temp_keepalive2); + } else { + con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); + } + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Connection negotiation. + */ + +static int get_connect_authorizer(struct ceph_connection *con) +{ + struct ceph_auth_handshake *auth; + int auth_proto; + + if (!con->ops->get_authorizer) { + con->auth = NULL; + con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->out_connect.authorizer_len = 0; + return 0; + } + + auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + con->auth = auth; + con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); + return 0; +} + +/* + * We connected to a peer and are saying hello. 
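+ * The hello is the fixed CEPH_BANNER string followed by our encoded
+ * entity address.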
+ */ +static void prepare_write_banner(struct ceph_connection *con) +{ + con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + &con->msgr->my_enc_addr); + + con->out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static void __prepare_write_connect(struct ceph_connection *con) +{ + con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); + if (con->auth) + con_out_kvec_add(con, con->auth->authorizer_buf_len, + con->auth->authorizer_buf); + + con->out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static int prepare_write_connect(struct ceph_connection *con) +{ + unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); + int proto; + int ret; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: + proto = CEPH_MONC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_OSD: + proto = CEPH_OSDC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_MDS: + proto = CEPH_MDSC_PROTOCOL; + break; + default: + BUG(); + } + + dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, + con->connect_seq, global_seq, proto); + + con->out_connect.features = + cpu_to_le64(from_msgr(con->msgr)->supported_features); + con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); + con->out_connect.global_seq = cpu_to_le32(global_seq); + con->out_connect.protocol_version = cpu_to_le32(proto); + con->out_connect.flags = 0; + + ret = get_connect_authorizer(con); + if (ret) + return ret; + + __prepare_write_connect(con); + return 0; +} + +/* + * write as much of pending kvecs to the socket as we can. + * 1 -> done + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_kvec(struct ceph_connection *con) +{ + int ret; + + dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); + while (con->out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, + con->out_kvec_left, con->out_kvec_bytes, + con->out_more); + if (ret <= 0) + goto out; + con->out_kvec_bytes -= ret; + if (con->out_kvec_bytes == 0) + break; /* done */ + + /* account for full iov entries consumed */ + while (ret >= con->out_kvec_cur->iov_len) { + BUG_ON(!con->out_kvec_left); + ret -= con->out_kvec_cur->iov_len; + con->out_kvec_cur++; + con->out_kvec_left--; + } + /* and for a partially-consumed entry */ + if (ret) { + con->out_kvec_cur->iov_len -= ret; + con->out_kvec_cur->iov_base += ret; + } + } + con->out_kvec_left = 0; + ret = 1; +out: + dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, + con->out_kvec_bytes, con->out_kvec_left, ret); + return ret; /* done! */ +} + +/* + * Write as much message data payload as we can. If we finish, queue + * up the footer. + * 1 -> done, footer is now queued in out_kvec[]. + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + u32 crc; + + dout("%s %p msg %p\n", __func__, con, msg); + + if (!msg->num_data_items) + return -EINVAL; + + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. + * + * If we are calculating the data crc (the default), we will + * need to map the page. 
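+ * The crc is accumulated page by page with ceph_crc32c_page() and
+ * only written into the footer once we stop sending.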
If we have no pages, they have + * been revoked, so use the zero page. + */ + crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; + while (cursor->total_resid) { + struct page *page; + size_t page_offset; + size_t length; + int ret; + + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + if (length == cursor->total_resid) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, + more); + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + + return ret; + } + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); + ceph_msg_data_advance(cursor, (size_t)ret); + } + + dout("%s %p msg %p done\n", __func__, con, msg); + + /* prepare and queue up footer, too */ + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con_out_kvec_reset(con); + prepare_write_message_footer(con); + + return 1; /* must return > 0 to indicate success */ +} + +/* + * write some zeros + */ +static int write_partial_skip(struct ceph_connection *con) +{ + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + int ret; + + dout("%s %p %d left\n", __func__, con, con->out_skip); + while (con->out_skip > 0) { + size_t size = min(con->out_skip, (int) PAGE_SIZE); + + if (size == con->out_skip) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, + more); + if (ret <= 0) + goto out; + con->out_skip -= ret; + } + ret = 1; +out: + return ret; +} + +/* + * Prepare to read connection handshake, or an ack. + */ +static void prepare_read_banner(struct ceph_connection *con) +{ + dout("prepare_read_banner %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_connect(struct ceph_connection *con) +{ + dout("prepare_read_connect %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_ack(struct ceph_connection *con) +{ + dout("prepare_read_ack %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_SEQ; +} + +static void prepare_read_tag(struct ceph_connection *con) +{ + dout("prepare_read_tag %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_READY; +} + +static void prepare_read_keepalive_ack(struct ceph_connection *con) +{ + dout("prepare_read_keepalive_ack %p\n", con); + con->in_base_pos = 0; +} + +/* + * Prepare to read a message. 
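+ * (Resets the read position and the front/middle/data crc
+ * accumulators; the message itself is allocated once the header is in.)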
+ */ +static int prepare_read_message(struct ceph_connection *con) +{ + dout("prepare_read_message %p\n", con); + BUG_ON(con->in_msg != NULL); + con->in_base_pos = 0; + con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; + return 0; +} + +static int read_partial(struct ceph_connection *con, + int end, int size, void *object) +{ + while (con->in_base_pos < end) { + int left = end - con->in_base_pos; + int have = size - left; + int ret = ceph_tcp_recvmsg(con->sock, object + have, left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + } + return 1; +} + +/* + * Read all or part of the connect-side handshake on a new connection + */ +static int read_partial_banner(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_banner %p at %d\n", con, con->in_base_pos); + + /* peer's banner */ + size = strlen(CEPH_BANNER); + end = size; + ret = read_partial(con, end, size, con->in_banner); + if (ret <= 0) + goto out; + + size = sizeof (con->actual_peer_addr); + end += size; + ret = read_partial(con, end, size, &con->actual_peer_addr); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->actual_peer_addr); + + size = sizeof (con->peer_addr_for_me); + end += size; + ret = read_partial(con, end, size, &con->peer_addr_for_me); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->peer_addr_for_me); + +out: + return ret; +} + +static int read_partial_connect(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_connect %p at %d\n", con, con->in_base_pos); + + size = sizeof (con->in_reply); + end = size; + ret = read_partial(con, end, size, &con->in_reply); + if (ret <= 0) + goto out; + + if (con->auth) { + size = le32_to_cpu(con->in_reply.authorizer_len); + if (size > con->auth->authorizer_reply_buf_len) { + pr_err("authorizer reply too big: %d > %zu\n", size, + con->auth->authorizer_reply_buf_len); + ret = -EINVAL; + goto out; + } + + end += size; + ret = read_partial(con, end, size, + con->auth->authorizer_reply_buf); + if (ret <= 0) + goto out; + } + + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, (int)con->in_reply.tag, + le32_to_cpu(con->in_reply.connect_seq), + le32_to_cpu(con->in_reply.global_seq)); +out: + return ret; +} + +/* + * Verify the hello banner looks okay. + */ +static int verify_hello(struct ceph_connection *con) +{ + if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + pr_err("connect to %s got bad banner\n", + ceph_pr_addr(&con->peer_addr)); + con->error_msg = "protocol error, bad banner"; + return -1; + } + return 0; +} + +static int process_banner(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + + dout("process_banner on %p\n", con); + + if (verify_hello(con) < 0) + return -1; + + /* + * Make sure the other end is who we wanted. note that the other + * end may not yet know their ip address, so if it's 0.0.0.0, give + * them the benefit of the doubt. + */ + if (memcmp(&con->peer_addr, &con->actual_peer_addr, + sizeof(con->peer_addr)) != 0 && + !(ceph_addr_is_blank(&con->actual_peer_addr) && + con->actual_peer_addr.nonce == con->peer_addr.nonce)) { + pr_warn("wrong peer, want %s/%u, got %s/%u\n", + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&con->actual_peer_addr), + le32_to_cpu(con->actual_peer_addr.nonce)); + con->error_msg = "wrong peer at address"; + return -1; + } + + /* + * did we learn our address? 
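+ * (If our address is still blank, adopt the address the peer saw us
+ * connect from, with the port cleared.)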
+ */ + if (ceph_addr_is_blank(my_addr)) { + memcpy(&my_addr->in_addr, + &con->peer_addr_for_me.in_addr, + sizeof(con->peer_addr_for_me.in_addr)); + ceph_addr_set_port(my_addr, 0); + ceph_encode_my_addr(con->msgr); + dout("process_banner learned my addr is %s\n", + ceph_pr_addr(my_addr)); + } + + return 0; +} + +static int process_connect(struct ceph_connection *con) +{ + u64 sup_feat = from_msgr(con->msgr)->supported_features; + u64 req_feat = from_msgr(con->msgr)->required_features; + u64 server_feat = le64_to_cpu(con->in_reply.features); + int ret; + + dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + + if (con->auth) { + int len = le32_to_cpu(con->in_reply.authorizer_len); + + /* + * Any connection that defines ->get_authorizer() + * should also define ->add_authorizer_challenge() and + * ->verify_authorizer_reply(). + * + * See get_connect_authorizer(). + */ + if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { + ret = con->ops->add_authorizer_challenge( + con, con->auth->authorizer_reply_buf, len); + if (ret < 0) + return ret; + + con_out_kvec_reset(con); + __prepare_write_connect(con); + prepare_read_connect(con); + return 0; + } + + if (len) { + ret = con->ops->verify_authorizer_reply(con); + if (ret < 0) { + con->error_msg = "bad authorize reply"; + return ret; + } + } + } + + switch (con->in_reply.tag) { + case CEPH_MSGR_TAG_FEATURES: + pr_err("%s%lld %s feature set mismatch," + " my %llx < server's %llx, missing %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + sup_feat, server_feat, server_feat & ~sup_feat); + con->error_msg = "missing required protocol features"; + return -1; + + case CEPH_MSGR_TAG_BADPROTOVER: + pr_err("%s%lld %s protocol version mismatch," + " my %d != server's %d\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->out_connect.protocol_version), + le32_to_cpu(con->in_reply.protocol_version)); + con->error_msg = "protocol version mismatch"; + return -1; + + case CEPH_MSGR_TAG_BADAUTHORIZER: + con->auth_retry++; + dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, + con->auth_retry); + if (con->auth_retry == 2) { + con->error_msg = "connect authorization failure"; + return -1; + } + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RESETSESSION: + /* + * If we connected with a large connect_seq but the peer + * has no record of a session with us (no connection, or + * connect_seq == 0), they will send RESETSESION to indicate + * that they must have reset their session, and may have + * dropped messages. + */ + dout("process_connect got RESET peer seq %u\n", + le32_to_cpu(con->in_reply.connect_seq)); + pr_info("%s%lld %s session reset\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + + /* Tell ceph about it. */ + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V1_CONNECT_MSG) + return -EAGAIN; + break; + + case CEPH_MSGR_TAG_RETRY_SESSION: + /* + * If we sent a smaller connect_seq than the peer has, try + * again with a larger value. 
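+ * (Adopt the peer's connect_seq and resend our connect message.)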
+ */ + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", + le32_to_cpu(con->out_connect.connect_seq), + le32_to_cpu(con->in_reply.connect_seq)); + con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RETRY_GLOBAL: + /* + * If we sent a smaller global_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", + con->peer_global_seq, + le32_to_cpu(con->in_reply.global_seq)); + ceph_get_global_seq(con->msgr, + le32_to_cpu(con->in_reply.global_seq)); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_SEQ: + case CEPH_MSGR_TAG_READY: + if (req_feat & ~server_feat) { + pr_err("%s%lld %s protocol feature mismatch," + " my required %llx > server's %llx, need %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -1; + } + + WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); + con->state = CEPH_CON_S_OPEN; + con->auth_retry = 0; /* we authenticated; clear flag */ + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); + con->connect_seq++; + con->peer_features = server_feat; + dout("process_connect got READY gseq %d cseq %d (%d)\n", + con->peer_global_seq, + le32_to_cpu(con->in_reply.connect_seq), + con->connect_seq); + WARN_ON(con->connect_seq != + le32_to_cpu(con->in_reply.connect_seq)); + + if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + + con->delay = 0; /* reset backoff memory */ + + if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { + prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } + break; + + case CEPH_MSGR_TAG_WAIT: + /* + * If there is a connection race (we are opening + * connections to each other), one of us may just have + * to WAIT. This shouldn't happen if we are the + * client. + */ + con->error_msg = "protocol error, got WAIT as client"; + return -1; + + default: + con->error_msg = "protocol error, garbage tag during connect"; + return -1; + } + return 0; +} + +/* + * read (part of) an ack + */ +static int read_partial_ack(struct ceph_connection *con) +{ + int size = sizeof (con->in_temp_ack); + int end = size; + + return read_partial(con, end, size, &con->in_temp_ack); +} + +/* + * We can finally discard anything that's been acked. 
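+ * (The ack carries the highest seq the peer has received; everything
+ * up to and including it is dropped from the sent queue.)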
+ */ +static void process_ack(struct ceph_connection *con) +{ + u64 ack = le64_to_cpu(con->in_temp_ack); + + if (con->in_tag == CEPH_MSGR_TAG_ACK) + ceph_con_discard_sent(con, ack); + else + ceph_con_discard_requeued(con, ack); + + prepare_read_tag(con); +} + +static int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) +{ + int ret, left; + + BUG_ON(!section); + + while (section->iov_len < sec_len) { + BUG_ON(section->iov_base == NULL); + left = sec_len - section->iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + + section->iov_len, left); + if (ret <= 0) + return ret; + section->iov_len += ret; + } + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, section->iov_len); + + return 1; +} + +static int read_partial_msg_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + struct page *page; + size_t page_offset; + size_t length; + u32 crc = 0; + int ret; + + if (!msg->num_data_items) + return -EIO; + + if (do_datacrc) + crc = con->in_data_crc; + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; + + return ret; + } + + if (do_datacrc) + crc = ceph_crc32c_page(crc, page, page_offset, ret); + ceph_msg_data_advance(cursor, (size_t)ret); + } + if (do_datacrc) + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + +/* + * read (part of) a message. + */ +static int read_partial_message(struct ceph_connection *con) +{ + struct ceph_msg *m = con->in_msg; + int size; + int end; + int ret; + unsigned int front_len, middle_len, data_len; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); + u64 seq; + u32 crc; + + dout("read_partial_message con %p msg %p\n", con, m); + + /* header */ + size = sizeof (con->in_hdr); + end = size; + ret = read_partial(con, end, size, &con->in_hdr); + if (ret <= 0) + return ret; + + crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->in_hdr.crc) { + pr_err("read_partial_message bad hdr crc %u != expected %u\n", + crc, con->in_hdr.crc); + return -EBADMSG; + } + + front_len = le32_to_cpu(con->in_hdr.front_len); + if (front_len > CEPH_MSG_MAX_FRONT_LEN) + return -EIO; + middle_len = le32_to_cpu(con->in_hdr.middle_len); + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) + return -EIO; + data_len = le32_to_cpu(con->in_hdr.data_len); + if (data_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + + /* verify seq# */ + seq = le64_to_cpu(con->in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld expected %lld\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof_footer(con); + con->in_tag = CEPH_MSGR_TAG_READY; + return 1; + } else if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + /* allocate message? 
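+ * (Ask the upper layer via ceph_con_in_msg_alloc(); it may tell us to
+ * skip this message entirely.)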
*/ + if (!con->in_msg) { + int skip = 0; + + dout("got hdr type %d front %d data %d\n", con->in_hdr.type, + front_len, data_len); + ret = ceph_con_in_msg_alloc(con, &con->in_hdr, &skip); + if (ret < 0) + return ret; + + BUG_ON(!con->in_msg ^ skip); + if (skip) { + /* skip this message */ + dout("alloc_msg said skip message\n"); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof_footer(con); + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 1; + } + + BUG_ON(!con->in_msg); + BUG_ON(con->in_msg->con != con); + m = con->in_msg; + m->front.iov_len = 0; /* haven't read it yet */ + if (m->middle) + m->middle->vec.iov_len = 0; + + /* prepare for data payload, if any */ + + if (data_len) + prepare_message_data(con->in_msg, data_len); + } + + /* front */ + ret = read_partial_message_section(con, &m->front, front_len, + &con->in_front_crc); + if (ret <= 0) + return ret; + + /* middle */ + if (m->middle) { + ret = read_partial_message_section(con, &m->middle->vec, + middle_len, + &con->in_middle_crc); + if (ret <= 0) + return ret; + } + + /* (page) data */ + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; + } + + /* footer */ + size = sizeof_footer(con); + end += size; + ret = read_partial(con, end, size, &m->footer); + if (ret <= 0) + return ret; + + if (!need_sign) { + m->footer.flags = m->old_footer.flags; + m->footer.sig = 0; + } + + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", + m, front_len, m->footer.front_crc, middle_len, + m->footer.middle_crc, data_len, m->footer.data_crc); + + /* crc ok? */ + if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { + pr_err("read_partial_message %p front crc %u != exp. %u\n", + m, con->in_front_crc, m->footer.front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { + pr_err("read_partial_message %p middle crc %u != exp %u\n", + m, con->in_middle_crc, m->footer.middle_crc); + return -EBADMSG; + } + if (do_datacrc && + (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && + con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { + pr_err("read_partial_message %p data crc %u != exp. %u\n", m, + con->in_data_crc, le32_to_cpu(m->footer.data_crc)); + return -EBADMSG; + } + + if (need_sign && con->ops->check_message_signature && + con->ops->check_message_signature(m)) { + pr_err("read_partial_message %p signature check failed\n", m); + return -EBADMSG; + } + + return 1; /* done! */ +} + +static int read_keepalive_ack(struct ceph_connection *con) +{ + struct ceph_timespec ceph_ts; + size_t size = sizeof(ceph_ts); + int ret = read_partial(con, size, size, &ceph_ts); + if (ret <= 0) + return ret; + ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); + prepare_read_tag(con); + return 1; +} + +/* + * Read what we can from the socket. + */ +int ceph_con_v1_try_read(struct ceph_connection *con) +{ + int ret = -1; + +more: + dout("try_read start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + BUG_ON(!con->sock); + + dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, + con->in_base_pos); + + if (con->state == CEPH_CON_S_V1_BANNER) { + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + con->state = CEPH_CON_S_V1_CONNECT_MSG; + + /* + * Received banner is good, exchange connection info. 
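+ * (Queue our ceph_msg_connect and prepare to read the reply.)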
+ * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. + */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ + goto out; + } + + if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { + ret = read_partial_connect(con); + if (ret <= 0) + goto out; + ret = process_connect(con); + if (ret < 0) + goto out; + goto more; + } + + WARN_ON(con->state != CEPH_CON_S_OPEN); + + if (con->in_base_pos < 0) { + /* + * skipping + discarding content. + */ + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); + if (ret <= 0) + goto out; + dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); + con->in_base_pos += ret; + if (con->in_base_pos) + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_READY) { + /* + * what's next? + */ + ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); + if (ret <= 0) + goto out; + dout("try_read got tag %d\n", (int)con->in_tag); + switch (con->in_tag) { + case CEPH_MSGR_TAG_MSG: + prepare_read_message(con); + break; + case CEPH_MSGR_TAG_ACK: + prepare_read_ack(con); + break; + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + prepare_read_keepalive_ack(con); + break; + case CEPH_MSGR_TAG_CLOSE: + ceph_con_close_socket(con); + con->state = CEPH_CON_S_CLOSED; + goto out; + default: + goto bad_tag; + } + } + if (con->in_tag == CEPH_MSGR_TAG_MSG) { + ret = read_partial_message(con); + if (ret <= 0) { + switch (ret) { + case -EBADMSG: + con->error_msg = "bad crc/signature"; + fallthrough; + case -EBADE: + ret = -EIO; + break; + case -EIO: + con->error_msg = "io error"; + break; + } + goto out; + } + if (con->in_tag == CEPH_MSGR_TAG_READY) + goto more; + ceph_con_process_message(con); + if (con->state == CEPH_CON_S_OPEN) + prepare_read_tag(con); + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_ACK || + con->in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ + ret = read_partial_ack(con); + if (ret <= 0) + goto out; + process_ack(con); + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + ret = read_keepalive_ack(con); + if (ret <= 0) + goto out; + goto more; + } + +out: + dout("try_read done on %p ret %d\n", con, ret); + return ret; + +bad_tag: + pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); + con->error_msg = "protocol error, garbage tag"; + ret = -1; + goto out; +} + +/* + * Write something to the socket. Called in a worker thread when the + * socket appears to be writeable and we have something ready to send. + */ +int ceph_con_v1_try_write(struct ceph_connection *con) +{ + int ret = 1; + + dout("try_write start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_PREOPEN && + con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + BUG_ON(con->sock); + con->state = CEPH_CON_S_V1_BANNER; + + con_out_kvec_reset(con); + prepare_write_banner(con); + prepare_read_banner(con); + + BUG_ON(con->in_msg); + con->in_tag = CEPH_MSGR_TAG_READY; + dout("try_write initiating connect on %p new state %d\n", + con, con->state); + ret = ceph_tcp_connect(con); + if (ret < 0) { + con->error_msg = "connect error"; + goto out; + } + } + +more: + dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); + BUG_ON(!con->sock); + + /* kvec data queued? 
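+ * (Flush header/footer kvecs and any skip bytes before message data.)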
*/ + if (con->out_kvec_left) { + ret = write_partial_kvec(con); + if (ret <= 0) + goto out; + } + if (con->out_skip) { + ret = write_partial_skip(con); + if (ret <= 0) + goto out; + } + + /* msg pages? */ + if (con->out_msg) { + if (con->out_msg_done) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; /* we're done with this one */ + goto do_next; + } + + ret = write_partial_message_data(con); + if (ret == 1) + goto more; /* we need to send the footer, too! */ + if (ret == 0) + goto out; + if (ret < 0) { + dout("try_write write_partial_message_data err %d\n", + ret); + goto out; + } + } + +do_next: + if (con->state == CEPH_CON_S_OPEN) { + if (ceph_con_flag_test_and_clear(con, + CEPH_CON_F_KEEPALIVE_PENDING)) { + prepare_write_keepalive(con); + goto more; + } + /* is anything else pending? */ + if (!list_empty(&con->out_queue)) { + prepare_write_message(con); + goto more; + } + if (con->in_seq > con->in_seq_acked) { + prepare_write_ack(con); + goto more; + } + } + + /* Nothing to do! */ + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + dout("try_write nothing else to write.\n"); + ret = 0; +out: + dout("try_write done on %p ret %d\n", con, ret); + return ret; +} + +void ceph_con_v1_revoke(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + WARN_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + WARN_ON(!msg->data_length); + con->out_skip += sizeof_footer(con); + } + /* data, middle, front */ + if (msg->data_length) + con->out_skip += msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, + con->out_kvec_bytes, con->out_skip); +} + +void ceph_con_v1_revoke_incoming(struct ceph_connection *con) +{ + unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); + unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); + unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); + + /* skip rest of message */ + con->in_base_pos = con->in_base_pos - + sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - + sizeof(struct ceph_msg_footer); + + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + + dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos); +} + +bool ceph_con_v1_opened(struct ceph_connection *con) +{ + return con->connect_seq; +} + +void ceph_con_v1_reset_session(struct ceph_connection *con) +{ + con->connect_seq = 0; + con->peer_global_seq = 0; +} + +void ceph_con_v1_reset_protocol(struct ceph_connection *con) +{ + con->out_skip = 0; +} -- cgit From a56dd9bf47220c3206f27075af8bdfb219a2a3cf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 12 Nov 2020 16:31:41 +0100 Subject: libceph: move msgr1 protocol specific fields to its own struct A couple whitespace fixups, no functional changes. 
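A sketch for the reader (not part of the patch): msgr1 state that used to sit directly on struct ceph_connection is now reached through the embedded v1 member, with no change in behavior.

    /* before: protocol fields lived on ceph_connection itself */
    con->out_skip = 0;
    con->connect_seq++;

    /* after: the same fields live in struct ceph_connection_v1_info */
    con->v1.out_skip = 0;
    con->v1.connect_seq++;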
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 76 ++++---- net/ceph/messenger.c | 8 +- net/ceph/messenger_v1.c | 420 +++++++++++++++++++++-------------------- 3 files changed, 257 insertions(+), 247 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index b5268127c55e..54a64e8dfce6 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -264,6 +264,43 @@ struct ceph_msg { #define BASE_DELAY_INTERVAL (HZ / 4) #define MAX_DELAY_INTERVAL (15 * HZ) +struct ceph_connection_v1_info { + struct kvec out_kvec[8], /* sending header/footer data */ + *out_kvec_cur; + int out_kvec_left; /* kvec's left in out_kvec */ + int out_skip; /* skip this many bytes */ + int out_kvec_bytes; /* total bytes left */ + bool out_more; /* there is more data after the kvecs */ + bool out_msg_done; + + struct ceph_auth_handshake *auth; + int auth_retry; /* true if we need a newer authorizer */ + + /* connection negotiation temps */ + u8 in_banner[CEPH_BANNER_MAX_LEN]; + struct ceph_entity_addr actual_peer_addr; + struct ceph_entity_addr peer_addr_for_me; + struct ceph_msg_connect out_connect; + struct ceph_msg_connect_reply in_reply; + + int in_base_pos; /* bytes read */ + + /* message in temps */ + u8 in_tag; /* protocol control byte */ + struct ceph_msg_header in_hdr; + __le64 in_temp_ack; /* for reading an ack */ + + /* message out temps */ + struct ceph_msg_header out_hdr; + __le64 out_temp_ack; /* for writing an ack */ + struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 + stamp */ + + u32 connect_seq; /* identify the most recent connection + attempt for this session */ + u32 peer_global_seq; /* peer's global seq for this connection */ +}; + /* * A single connection with another host. 
* @@ -281,21 +318,13 @@ struct ceph_connection { int state; /* CEPH_CON_S_* */ atomic_t sock_state; struct socket *sock; - struct ceph_entity_addr peer_addr; /* peer address */ - struct ceph_entity_addr peer_addr_for_me; unsigned long flags; /* CEPH_CON_F_* */ const char *error_msg; /* error message, if any */ struct ceph_entity_name peer_name; /* peer name */ - + struct ceph_entity_addr peer_addr; /* peer address */ u64 peer_features; - u32 connect_seq; /* identify the most recent connection - attempt for this connection, client */ - u32 peer_global_seq; /* peer's global seq for this connection */ - - struct ceph_auth_handshake *auth; - int auth_retry; /* true if we need a newer authorizer */ struct mutex mutex; @@ -306,41 +335,18 @@ struct ceph_connection { u64 in_seq, in_seq_acked; /* last message received, acked */ - /* connection negotiation temps */ - char in_banner[CEPH_BANNER_MAX_LEN]; - struct ceph_msg_connect out_connect; - struct ceph_msg_connect_reply in_reply; - struct ceph_entity_addr actual_peer_addr; - - /* message out temps */ - struct ceph_msg_header out_hdr; + struct ceph_msg *in_msg; struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ - bool out_msg_done; - - struct kvec out_kvec[8], /* sending header/footer data */ - *out_kvec_cur; - int out_kvec_left; /* kvec's left in out_kvec */ - int out_skip; /* skip this many bytes */ - int out_kvec_bytes; /* total bytes left */ - int out_more; /* there is more data after the kvecs */ - __le64 out_temp_ack; /* for writing an ack */ - struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 - stamp */ - /* message in temps */ - struct ceph_msg_header in_hdr; - struct ceph_msg *in_msg; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ - char in_tag; /* protocol control byte */ - int in_base_pos; /* bytes read */ - __le64 in_temp_ack; /* for reading an ack */ - struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */ struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ + + struct ceph_connection_v1_info v1; }; extern struct page *ceph_zero_page; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 544cfdbe52d6..4fb3c33a7b03 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1448,11 +1448,11 @@ static void con_fault_finish(struct ceph_connection *con) * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. 
*/ - if (con->auth_retry) { - dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->v1.auth_retry) { + dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); - con->auth_retry = 0; + con->v1.auth_retry = 0; } if (con->ops->fault) @@ -1631,7 +1631,7 @@ static void clear_standby(struct ceph_connection *con) if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); con->state = CEPH_CON_S_PREOPEN; - con->connect_seq++; + con->v1.connect_seq++; WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 899038a9678e..04f653b3c897 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -110,25 +110,25 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, static void con_out_kvec_reset(struct ceph_connection *con) { - BUG_ON(con->out_skip); + BUG_ON(con->v1.out_skip); - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - con->out_kvec_cur = &con->out_kvec[0]; + con->v1.out_kvec_left = 0; + con->v1.out_kvec_bytes = 0; + con->v1.out_kvec_cur = &con->v1.out_kvec[0]; } static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { - int index = con->out_kvec_left; + int index = con->v1.out_kvec_left; - BUG_ON(con->out_skip); - BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + BUG_ON(con->v1.out_skip); + BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec)); - con->out_kvec[index].iov_len = size; - con->out_kvec[index].iov_base = data; - con->out_kvec_left++; - con->out_kvec_bytes += size; + con->v1.out_kvec[index].iov_len = size; + con->v1.out_kvec[index].iov_base = data; + con->v1.out_kvec_left++; + con->v1.out_kvec_bytes += size; } /* @@ -138,15 +138,14 @@ static void con_out_kvec_add(struct ceph_connection *con, */ static int con_out_kvec_skip(struct ceph_connection *con) { - int off = con->out_kvec_cur - con->out_kvec; int skip = 0; - if (con->out_kvec_bytes > 0) { - skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; - BUG_ON(con->out_kvec_bytes < skip); - BUG_ON(!con->out_kvec_left); - con->out_kvec_bytes -= skip; - con->out_kvec_left--; + if (con->v1.out_kvec_bytes > 0) { + skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len; + BUG_ON(con->v1.out_kvec_bytes < skip); + BUG_ON(!con->v1.out_kvec_left); + con->v1.out_kvec_bytes -= skip; + con->v1.out_kvec_left--; } return skip; @@ -186,8 +185,8 @@ static void prepare_write_message_footer(struct ceph_connection *con) } else { m->old_footer.flags = m->footer.flags; } - con->out_more = m->more_to_follow; - con->out_msg_done = true; + con->v1.out_more = m->more_to_follow; + con->v1.out_msg_done = true; } /* @@ -199,16 +198,16 @@ static void prepare_write_message(struct ceph_connection *con) u32 crc; con_out_kvec_reset(con); - con->out_msg_done = false; + con->v1.out_msg_done = false; /* Sneak an ack in there first? If we can get it into the same * TCP packet that's a good thing. 
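 * (The ack is only tag_ack plus the acked seq, so it often fits in the
 * same segment as the message header.)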
*/ if (con->in_seq > con->in_seq_acked) { con->in_seq_acked = con->in_seq; con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); } ceph_con_get_out_msg(con); @@ -223,7 +222,7 @@ static void prepare_write_message(struct ceph_connection *con) /* tag + hdr + front + middle */ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); + con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr); con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) @@ -233,7 +232,7 @@ static void prepare_write_message(struct ceph_connection *con) /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); @@ -253,7 +252,7 @@ static void prepare_write_message(struct ceph_connection *con) con->out_msg->footer.data_crc = 0; if (m->data_length) { prepare_message_data(con->out_msg, m->data_length); - con->out_more = 1; /* data + footer will follow */ + con->v1.out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ prepare_write_message_footer(con); @@ -275,11 +274,11 @@ static void prepare_write_ack(struct ceph_connection *con) con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); - con->out_more = 1; /* more will follow.. eventually.. */ + con->v1.out_more = 1; /* more will follow.. eventually.. 
*/ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } @@ -294,9 +293,9 @@ static void prepare_write_seq(struct ceph_connection *con) con_out_kvec_reset(con); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } @@ -313,9 +312,9 @@ static void prepare_write_keepalive(struct ceph_connection *con) ktime_get_real_ts64(&now); con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); - ceph_encode_timespec64(&con->out_temp_keepalive2, &now); - con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), - &con->out_temp_keepalive2); + ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2), + &con->v1.out_temp_keepalive2); } else { con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); } @@ -332,19 +331,20 @@ static int get_connect_authorizer(struct ceph_connection *con) int auth_proto; if (!con->ops->get_authorizer) { - con->auth = NULL; - con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; - con->out_connect.authorizer_len = 0; + con->v1.auth = NULL; + con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->v1.out_connect.authorizer_len = 0; return 0; } - auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); + auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry); if (IS_ERR(auth)) return PTR_ERR(auth); - con->auth = auth; - con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); - con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); + con->v1.auth = auth; + con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->v1.out_connect.authorizer_len = + cpu_to_le32(auth->authorizer_buf_len); return 0; } @@ -357,18 +357,19 @@ static void prepare_write_banner(struct ceph_connection *con) con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), &con->msgr->my_enc_addr); - con->out_more = 0; + con->v1.out_more = 0; ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } static void __prepare_write_connect(struct ceph_connection *con) { - con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); - if (con->auth) - con_out_kvec_add(con, con->auth->authorizer_buf_len, - con->auth->authorizer_buf); + con_out_kvec_add(con, sizeof(con->v1.out_connect), + &con->v1.out_connect); + if (con->v1.auth) + con_out_kvec_add(con, con->v1.auth->authorizer_buf_len, + con->v1.auth->authorizer_buf); - con->out_more = 0; + con->v1.out_more = 0; ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } @@ -393,15 +394,15 @@ static int prepare_write_connect(struct ceph_connection *con) } dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, - con->connect_seq, global_seq, proto); + con->v1.connect_seq, global_seq, proto); - con->out_connect.features = - cpu_to_le64(from_msgr(con->msgr)->supported_features); - con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = cpu_to_le32(global_seq); - con->out_connect.protocol_version = cpu_to_le32(proto); - con->out_connect.flags = 0; + con->v1.out_connect.features = + cpu_to_le64(from_msgr(con->msgr)->supported_features); + con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->v1.out_connect.connect_seq = 
cpu_to_le32(con->v1.connect_seq); + con->v1.out_connect.global_seq = cpu_to_le32(global_seq); + con->v1.out_connect.protocol_version = cpu_to_le32(proto); + con->v1.out_connect.flags = 0; ret = get_connect_authorizer(con); if (ret) @@ -421,35 +422,36 @@ static int write_partial_kvec(struct ceph_connection *con) { int ret; - dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); - while (con->out_kvec_bytes > 0) { - ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, - con->out_kvec_left, con->out_kvec_bytes, - con->out_more); + dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes); + while (con->v1.out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur, + con->v1.out_kvec_left, + con->v1.out_kvec_bytes, + con->v1.out_more); if (ret <= 0) goto out; - con->out_kvec_bytes -= ret; - if (con->out_kvec_bytes == 0) + con->v1.out_kvec_bytes -= ret; + if (!con->v1.out_kvec_bytes) break; /* done */ /* account for full iov entries consumed */ - while (ret >= con->out_kvec_cur->iov_len) { - BUG_ON(!con->out_kvec_left); - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; + while (ret >= con->v1.out_kvec_cur->iov_len) { + BUG_ON(!con->v1.out_kvec_left); + ret -= con->v1.out_kvec_cur->iov_len; + con->v1.out_kvec_cur++; + con->v1.out_kvec_left--; } /* and for a partially-consumed entry */ if (ret) { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; + con->v1.out_kvec_cur->iov_len -= ret; + con->v1.out_kvec_cur->iov_base += ret; } } - con->out_kvec_left = 0; + con->v1.out_kvec_left = 0; ret = 1; out: dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, - con->out_kvec_bytes, con->out_kvec_left, ret); + con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret); return ret; /* done! 
*/ } @@ -530,17 +532,17 @@ static int write_partial_skip(struct ceph_connection *con) int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; int ret; - dout("%s %p %d left\n", __func__, con, con->out_skip); - while (con->out_skip > 0) { - size_t size = min(con->out_skip, (int) PAGE_SIZE); + dout("%s %p %d left\n", __func__, con, con->v1.out_skip); + while (con->v1.out_skip > 0) { + size_t size = min(con->v1.out_skip, (int)PAGE_SIZE); - if (size == con->out_skip) + if (size == con->v1.out_skip) more = MSG_MORE; ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, more); if (ret <= 0) goto out; - con->out_skip -= ret; + con->v1.out_skip -= ret; } ret = 1; out: @@ -553,39 +555,39 @@ out: static void prepare_read_banner(struct ceph_connection *con) { dout("prepare_read_banner %p\n", con); - con->in_base_pos = 0; + con->v1.in_base_pos = 0; } static void prepare_read_connect(struct ceph_connection *con) { dout("prepare_read_connect %p\n", con); - con->in_base_pos = 0; + con->v1.in_base_pos = 0; } static void prepare_read_ack(struct ceph_connection *con) { dout("prepare_read_ack %p\n", con); - con->in_base_pos = 0; + con->v1.in_base_pos = 0; } static void prepare_read_seq(struct ceph_connection *con) { dout("prepare_read_seq %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_SEQ; + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_SEQ; } static void prepare_read_tag(struct ceph_connection *con) { dout("prepare_read_tag %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_READY; + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_READY; } static void prepare_read_keepalive_ack(struct ceph_connection *con) { dout("prepare_read_keepalive_ack %p\n", con); - con->in_base_pos = 0; + con->v1.in_base_pos = 0; } /* @@ -595,7 +597,7 @@ static int prepare_read_message(struct ceph_connection *con) { dout("prepare_read_message %p\n", con); BUG_ON(con->in_msg != NULL); - con->in_base_pos = 0; + con->v1.in_base_pos = 0; con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; return 0; } @@ -603,13 +605,13 @@ static int prepare_read_message(struct ceph_connection *con) static int read_partial(struct ceph_connection *con, int end, int size, void *object) { - while (con->in_base_pos < end) { - int left = end - con->in_base_pos; + while (con->v1.in_base_pos < end) { + int left = end - con->v1.in_base_pos; int have = size - left; int ret = ceph_tcp_recvmsg(con->sock, object + have, left); if (ret <= 0) return ret; - con->in_base_pos += ret; + con->v1.in_base_pos += ret; } return 1; } @@ -623,28 +625,28 @@ static int read_partial_banner(struct ceph_connection *con) int end; int ret; - dout("read_partial_banner %p at %d\n", con, con->in_base_pos); + dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos); /* peer's banner */ size = strlen(CEPH_BANNER); end = size; - ret = read_partial(con, end, size, con->in_banner); + ret = read_partial(con, end, size, con->v1.in_banner); if (ret <= 0) goto out; - size = sizeof (con->actual_peer_addr); + size = sizeof(con->v1.actual_peer_addr); end += size; - ret = read_partial(con, end, size, &con->actual_peer_addr); + ret = read_partial(con, end, size, &con->v1.actual_peer_addr); if (ret <= 0) goto out; - ceph_decode_banner_addr(&con->actual_peer_addr); + ceph_decode_banner_addr(&con->v1.actual_peer_addr); - size = sizeof (con->peer_addr_for_me); + size = sizeof(con->v1.peer_addr_for_me); end += size; - ret = read_partial(con, end, size, &con->peer_addr_for_me); + ret = read_partial(con, end, size, 
&con->v1.peer_addr_for_me); if (ret <= 0) goto out; - ceph_decode_banner_addr(&con->peer_addr_for_me); + ceph_decode_banner_addr(&con->v1.peer_addr_for_me); out: return ret; @@ -656,34 +658,34 @@ static int read_partial_connect(struct ceph_connection *con) int end; int ret; - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); + dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos); - size = sizeof (con->in_reply); + size = sizeof(con->v1.in_reply); end = size; - ret = read_partial(con, end, size, &con->in_reply); + ret = read_partial(con, end, size, &con->v1.in_reply); if (ret <= 0) goto out; - if (con->auth) { - size = le32_to_cpu(con->in_reply.authorizer_len); - if (size > con->auth->authorizer_reply_buf_len) { + if (con->v1.auth) { + size = le32_to_cpu(con->v1.in_reply.authorizer_len); + if (size > con->v1.auth->authorizer_reply_buf_len) { pr_err("authorizer reply too big: %d > %zu\n", size, - con->auth->authorizer_reply_buf_len); + con->v1.auth->authorizer_reply_buf_len); ret = -EINVAL; goto out; } end += size; ret = read_partial(con, end, size, - con->auth->authorizer_reply_buf); + con->v1.auth->authorizer_reply_buf); if (ret <= 0) goto out; } dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", - con, (int)con->in_reply.tag, - le32_to_cpu(con->in_reply.connect_seq), - le32_to_cpu(con->in_reply.global_seq)); + con, con->v1.in_reply.tag, + le32_to_cpu(con->v1.in_reply.connect_seq), + le32_to_cpu(con->v1.in_reply.global_seq)); out: return ret; } @@ -693,7 +695,7 @@ out: */ static int verify_hello(struct ceph_connection *con) { - if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { pr_err("connect to %s got bad banner\n", ceph_pr_addr(&con->peer_addr)); con->error_msg = "protocol error, bad banner"; @@ -716,15 +718,15 @@ static int process_banner(struct ceph_connection *con) * end may not yet know their ip address, so if it's 0.0.0.0, give * them the benefit of the doubt. 
*/ - if (memcmp(&con->peer_addr, &con->actual_peer_addr, + if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr, sizeof(con->peer_addr)) != 0 && - !(ceph_addr_is_blank(&con->actual_peer_addr) && - con->actual_peer_addr.nonce == con->peer_addr.nonce)) { + !(ceph_addr_is_blank(&con->v1.actual_peer_addr) && + con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warn("wrong peer, want %s/%u, got %s/%u\n", ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce), - ceph_pr_addr(&con->actual_peer_addr), - le32_to_cpu(con->actual_peer_addr.nonce)); + ceph_pr_addr(&con->v1.actual_peer_addr), + le32_to_cpu(con->v1.actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; } @@ -734,8 +736,8 @@ static int process_banner(struct ceph_connection *con) */ if (ceph_addr_is_blank(my_addr)) { memcpy(&my_addr->in_addr, - &con->peer_addr_for_me.in_addr, - sizeof(con->peer_addr_for_me.in_addr)); + &con->v1.peer_addr_for_me.in_addr, + sizeof(con->v1.peer_addr_for_me.in_addr)); ceph_addr_set_port(my_addr, 0); ceph_encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", @@ -749,13 +751,13 @@ static int process_connect(struct ceph_connection *con) { u64 sup_feat = from_msgr(con->msgr)->supported_features; u64 req_feat = from_msgr(con->msgr)->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); + u64 server_feat = le64_to_cpu(con->v1.in_reply.features); int ret; - dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + dout("process_connect on %p tag %d\n", con, con->v1.in_tag); - if (con->auth) { - int len = le32_to_cpu(con->in_reply.authorizer_len); + if (con->v1.auth) { + int len = le32_to_cpu(con->v1.in_reply.authorizer_len); /* * Any connection that defines ->get_authorizer() @@ -764,9 +766,10 @@ static int process_connect(struct ceph_connection *con) * * See get_connect_authorizer(). */ - if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { + if (con->v1.in_reply.tag == + CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { ret = con->ops->add_authorizer_challenge( - con, con->auth->authorizer_reply_buf, len); + con, con->v1.auth->authorizer_reply_buf, len); if (ret < 0) return ret; @@ -785,7 +788,7 @@ static int process_connect(struct ceph_connection *con) } } - switch (con->in_reply.tag) { + switch (con->v1.in_reply.tag) { case CEPH_MSGR_TAG_FEATURES: pr_err("%s%lld %s feature set mismatch," " my %llx < server's %llx, missing %llx\n", @@ -800,16 +803,16 @@ static int process_connect(struct ceph_connection *con) " my %d != server's %d\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->out_connect.protocol_version), - le32_to_cpu(con->in_reply.protocol_version)); + le32_to_cpu(con->v1.out_connect.protocol_version), + le32_to_cpu(con->v1.in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; return -1; case CEPH_MSGR_TAG_BADAUTHORIZER: - con->auth_retry++; + con->v1.auth_retry++; dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, - con->auth_retry); - if (con->auth_retry == 2) { + con->v1.auth_retry); + if (con->v1.auth_retry == 2) { con->error_msg = "connect authorization failure"; return -1; } @@ -829,7 +832,7 @@ static int process_connect(struct ceph_connection *con) * dropped messages. 
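 * (We reset our session state and notify the upper layer through
 * ops->peer_reset().)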
*/ dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_reply.connect_seq)); + le32_to_cpu(con->v1.in_reply.connect_seq)); pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr)); @@ -855,9 +858,9 @@ static int process_connect(struct ceph_connection *con) * again with a larger value. */ dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", - le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_reply.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); + le32_to_cpu(con->v1.out_connect.connect_seq), + le32_to_cpu(con->v1.in_reply.connect_seq)); + con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq); con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -871,10 +874,10 @@ static int process_connect(struct ceph_connection *con) * again with a larger value. */ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.global_seq)); + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.global_seq)); ceph_get_global_seq(con->msgr, - le32_to_cpu(con->in_reply.global_seq)); + le32_to_cpu(con->v1.in_reply.global_seq)); con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -896,23 +899,24 @@ static int process_connect(struct ceph_connection *con) WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); con->state = CEPH_CON_S_OPEN; - con->auth_retry = 0; /* we authenticated; clear flag */ - con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); - con->connect_seq++; + con->v1.auth_retry = 0; /* we authenticated; clear flag */ + con->v1.peer_global_seq = + le32_to_cpu(con->v1.in_reply.global_seq); + con->v1.connect_seq++; con->peer_features = server_feat; dout("process_connect got READY gseq %d cseq %d (%d)\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.connect_seq), - con->connect_seq); - WARN_ON(con->connect_seq != - le32_to_cpu(con->in_reply.connect_seq)); + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.connect_seq), + con->v1.connect_seq); + WARN_ON(con->v1.connect_seq != + le32_to_cpu(con->v1.in_reply.connect_seq)); - if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY) ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); con->delay = 0; /* reset backoff memory */ - if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { + if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) { prepare_write_seq(con); prepare_read_seq(con); } else { @@ -942,10 +946,10 @@ static int process_connect(struct ceph_connection *con) */ static int read_partial_ack(struct ceph_connection *con) { - int size = sizeof (con->in_temp_ack); + int size = sizeof(con->v1.in_temp_ack); int end = size; - return read_partial(con, end, size, &con->in_temp_ack); + return read_partial(con, end, size, &con->v1.in_temp_ack); } /* @@ -953,9 +957,9 @@ static int read_partial_ack(struct ceph_connection *con) */ static void process_ack(struct ceph_connection *con) { - u64 ack = le64_to_cpu(con->in_temp_ack); + u64 ack = le64_to_cpu(con->v1.in_temp_ack); - if (con->in_tag == CEPH_MSGR_TAG_ACK) + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK) ceph_con_discard_sent(con, ack); else ceph_con_discard_requeued(con, ack); @@ -1045,39 +1049,39 @@ static int read_partial_message(struct ceph_connection *con) dout("read_partial_message con %p msg %p\n", con, m); /* header */ - size = sizeof (con->in_hdr); + size = sizeof(con->v1.in_hdr); end = size; - ret = read_partial(con, end, size, &con->in_hdr); + 
ret = read_partial(con, end, size, &con->v1.in_hdr); if (ret <= 0) return ret; - crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); - if (cpu_to_le32(crc) != con->in_hdr.crc) { + crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->v1.in_hdr.crc) { pr_err("read_partial_message bad hdr crc %u != expected %u\n", - crc, con->in_hdr.crc); + crc, con->v1.in_hdr.crc); return -EBADMSG; } - front_len = le32_to_cpu(con->in_hdr.front_len); + front_len = le32_to_cpu(con->v1.in_hdr.front_len); if (front_len > CEPH_MSG_MAX_FRONT_LEN) return -EIO; - middle_len = le32_to_cpu(con->in_hdr.middle_len); + middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) return -EIO; - data_len = le32_to_cpu(con->in_hdr.data_len); + data_len = le32_to_cpu(con->v1.in_hdr.data_len); if (data_len > CEPH_MSG_MAX_DATA_LEN) return -EIO; /* verify seq# */ - seq = le64_to_cpu(con->in_hdr.seq); + seq = le64_to_cpu(con->v1.in_hdr.seq); if ((s64)seq - (s64)con->in_seq < 1) { pr_info("skipping %s%lld %s seq %lld expected %lld\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), seq, con->in_seq + 1); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; + con->v1.in_base_pos = -front_len - middle_len - data_len - + sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; return 1; } else if ((s64)seq - (s64)con->in_seq > 1) { pr_err("read_partial_message bad seq %lld expected %lld\n", @@ -1090,9 +1094,9 @@ static int read_partial_message(struct ceph_connection *con) if (!con->in_msg) { int skip = 0; - dout("got hdr type %d front %d data %d\n", con->in_hdr.type, + dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type, front_len, data_len); - ret = ceph_con_in_msg_alloc(con, &con->in_hdr, &skip); + ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip); if (ret < 0) return ret; @@ -1100,9 +1104,9 @@ static int read_partial_message(struct ceph_connection *con) if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; + con->v1.in_base_pos = -front_len - middle_len - + data_len - sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; return 1; } @@ -1214,8 +1218,8 @@ more: BUG_ON(!con->sock); - dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, - con->in_base_pos); + dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag, + con->v1.in_base_pos); if (con->state == CEPH_CON_S_V1_BANNER) { ret = read_partial_banner(con); @@ -1253,27 +1257,27 @@ more: WARN_ON(con->state != CEPH_CON_S_OPEN); - if (con->in_base_pos < 0) { + if (con->v1.in_base_pos < 0) { /* * skipping + discarding content. */ - ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos); if (ret <= 0) goto out; - dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); - con->in_base_pos += ret; - if (con->in_base_pos) + dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos); + con->v1.in_base_pos += ret; + if (con->v1.in_base_pos) goto more; } - if (con->in_tag == CEPH_MSGR_TAG_READY) { + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) { /* * what's next? 
*/ - ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); + ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1); if (ret <= 0) goto out; - dout("try_read got tag %d\n", (int)con->in_tag); - switch (con->in_tag) { + dout("try_read got tag %d\n", con->v1.in_tag); + switch (con->v1.in_tag) { case CEPH_MSGR_TAG_MSG: prepare_read_message(con); break; @@ -1291,7 +1295,7 @@ more: goto bad_tag; } } - if (con->in_tag == CEPH_MSGR_TAG_MSG) { + if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) { ret = read_partial_message(con); if (ret <= 0) { switch (ret) { @@ -1307,15 +1311,15 @@ more: } goto out; } - if (con->in_tag == CEPH_MSGR_TAG_READY) + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) goto more; ceph_con_process_message(con); if (con->state == CEPH_CON_S_OPEN) prepare_read_tag(con); goto more; } - if (con->in_tag == CEPH_MSGR_TAG_ACK || - con->in_tag == CEPH_MSGR_TAG_SEQ) { + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK || + con->v1.in_tag == CEPH_MSGR_TAG_SEQ) { /* * the final handshake seq exchange is semantically * equivalent to an ACK @@ -1326,7 +1330,7 @@ more: process_ack(con); goto more; } - if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { ret = read_keepalive_ack(con); if (ret <= 0) goto out; @@ -1338,7 +1342,7 @@ out: return ret; bad_tag: - pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); + pr_err("try_read bad tag %d\n", con->v1.in_tag); con->error_msg = "protocol error, garbage tag"; ret = -1; goto out; @@ -1369,7 +1373,7 @@ int ceph_con_v1_try_write(struct ceph_connection *con) prepare_read_banner(con); BUG_ON(con->in_msg); - con->in_tag = CEPH_MSGR_TAG_READY; + con->v1.in_tag = CEPH_MSGR_TAG_READY; dout("try_write initiating connect on %p new state %d\n", con, con->state); ret = ceph_tcp_connect(con); @@ -1380,16 +1384,16 @@ int ceph_con_v1_try_write(struct ceph_connection *con) } more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); + dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes); BUG_ON(!con->sock); /* kvec data queued? */ - if (con->out_kvec_left) { + if (con->v1.out_kvec_left) { ret = write_partial_kvec(con); if (ret <= 0) goto out; } - if (con->out_skip) { + if (con->v1.out_skip) { ret = write_partial_skip(con); if (ret <= 0) goto out; @@ -1397,7 +1401,7 @@ more: /* msg pages? 
 	 */
 	if (con->out_msg) {
-		if (con->out_msg_done) {
+		if (con->v1.out_msg_done) {
 			ceph_msg_put(con->out_msg);
 			con->out_msg = NULL;	/* we're done with this one */
 			goto do_next;
@@ -1446,57 +1450,57 @@ void ceph_con_v1_revoke(struct ceph_connection *con)
 {
 	struct ceph_msg *msg = con->out_msg;
 
-	WARN_ON(con->out_skip);
+	WARN_ON(con->v1.out_skip);
 
 	/* footer */
-	if (con->out_msg_done) {
-		con->out_skip += con_out_kvec_skip(con);
+	if (con->v1.out_msg_done) {
+		con->v1.out_skip += con_out_kvec_skip(con);
 	} else {
 		WARN_ON(!msg->data_length);
-		con->out_skip += sizeof_footer(con);
+		con->v1.out_skip += sizeof_footer(con);
 	}
 
 	/* data, middle, front */
 	if (msg->data_length)
-		con->out_skip += msg->cursor.total_resid;
+		con->v1.out_skip += msg->cursor.total_resid;
 	if (msg->middle)
-		con->out_skip += con_out_kvec_skip(con);
-	con->out_skip += con_out_kvec_skip(con);
+		con->v1.out_skip += con_out_kvec_skip(con);
+	con->v1.out_skip += con_out_kvec_skip(con);
 
 	dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con,
-	     con->out_kvec_bytes, con->out_skip);
+	     con->v1.out_kvec_bytes, con->v1.out_skip);
 }
 
 void ceph_con_v1_revoke_incoming(struct ceph_connection *con)
 {
-	unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
-	unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
-	unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
+	unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len);
+	unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
+	unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len);
 
 	/* skip rest of message */
-	con->in_base_pos = con->in_base_pos -
+	con->v1.in_base_pos = con->v1.in_base_pos -
 			sizeof(struct ceph_msg_header) -
 			front_len -
 			middle_len -
 			data_len -
 			sizeof(struct ceph_msg_footer);
-	con->in_tag = CEPH_MSGR_TAG_READY;
+	con->v1.in_tag = CEPH_MSGR_TAG_READY;
 	con->in_seq++;
 
-	dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos);
+	dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos);
 }
 
 bool ceph_con_v1_opened(struct ceph_connection *con)
 {
-	return con->connect_seq;
+	return con->v1.connect_seq;
 }
 
 void ceph_con_v1_reset_session(struct ceph_connection *con)
 {
-	con->connect_seq = 0;
-	con->peer_global_seq = 0;
+	con->v1.connect_seq = 0;
+	con->v1.peer_global_seq = 0;
 }
 
 void ceph_con_v1_reset_protocol(struct ceph_connection *con)
 {
-	con->out_skip = 0;
+	con->v1.out_skip = 0;
 }
-- cgit

From 285ea34fc876aa0a2c5e65d310c4a41269e2e5f2 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Mon, 26 Oct 2020 16:47:20 +0100
Subject: libceph, ceph: incorporate nautilus cephx changes

- request service tickets together with auth ticket.

Currently we get auth ticket via CEPHX_GET_AUTH_SESSION_KEY op and then
request service tickets via CEPHX_GET_PRINCIPAL_SESSION_KEY op in a
separate message.  Since nautilus, desired service tickets are shared
together with auth ticket in CEPHX_GET_AUTH_SESSION_KEY reply.

- propagate session key and connection secret, if any.

In preparation for msgr2, update handle_reply() and
verify_authorizer_reply() auth ops to propagate session key and
connection secret.  Since nautilus, if secure mode is negotiated,
connection secret is shared either in CEPHX_GET_AUTH_SESSION_KEY reply
(for mons) or in a final authorizer reply (for osds and mdses).
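For orientation, the nautilus CEPHX_GET_AUTH_SESSION_KEY reply is three
back-to-back sections instead of just the AUTH ticket.  A rough pseudo-C
sketch of the decode order that handle_auth_session_key() below follows;
the sketch_* helpers are hypothetical stand-ins for the ceph_decode_*
macros and ceph_x_decrypt(), not kernel APIs:

	/* illustrative sketch only, assuming a well-formed reply */
	static int sketch_auth_session_key_reply(void **p, void *end)
	{
		u32 len;

		/* 1. AUTH ticket, same encoding as pre-nautilus */
		sketch_decode_auth_ticket(p, end);
		if (*p == end)
			return 0;  /* pre-nautilus server, or no extra
				      tickets were requested */

		/* 2. connection secret blob, encrypted with the AUTH
		 * session key; non-empty only if secure mode was
		 * negotiated (mon case) */
		sketch_decode_u32(p, end, &len);
		if (len > 0)
			sketch_decrypt_con_secret(p, *p + len);

		/* 3. service tickets blob, also protected by the AUTH
		 * session key */
		sketch_decode_u32(p, end, &len);
		if (len > 0)
			sketch_decode_tickets(p, *p + len);

		return 0;
	}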
Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 5 +- include/linux/ceph/auth.h | 16 +++- net/ceph/auth.c | 12 ++- net/ceph/auth_none.c | 4 +- net/ceph/auth_x.c | 215 ++++++++++++++++++++++++++++++++++++--------- net/ceph/auth_x_protocol.h | 3 +- net/ceph/crypto.h | 3 + net/ceph/osd_client.c | 5 +- 8 files changed, 210 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a256d95ec99a..278fe67e2617 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5178,8 +5178,11 @@ static int verify_authorizer_reply(struct ceph_connection *con) struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; - return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } static int invalidate_authorizer(struct ceph_connection *con) diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 6728c2ee0205..d9e7d0bcdaf1 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -53,7 +53,9 @@ struct ceph_auth_client_ops { */ int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); int (*handle_reply)(struct ceph_auth_client *ac, int result, - void *buf, void *end); + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len); /* * Create authorizer for connecting to a service, and verify @@ -69,7 +71,10 @@ struct ceph_auth_client_ops { void *challenge_buf, int challenge_buf_len); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, - struct ceph_authorizer *a); + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); void (*invalidate_authorizer)(struct ceph_auth_client *ac, int peer_type); @@ -126,8 +131,11 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, int challenge_buf_len); -extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a); +int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type); diff --git a/net/ceph/auth.c b/net/ceph/auth.c index fbeee068ea14..40d3d95344d9 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -240,7 +240,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->negotiating = false; } - ret = ac->ops->handle_reply(ac, result, payload, payload_end); + ret = ac->ops->handle_reply(ac, result, payload, payload_end, + NULL, NULL, NULL, NULL); if (ret == -EAGAIN) { ret = ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { @@ -332,13 +333,18 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge); int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { int ret = 0; mutex_lock(&ac->mutex); if (ac->ops && 
ac->ops->verify_authorizer_reply) - ret = ac->ops->verify_authorizer_reply(ac, a); + ret = ac->ops->verify_authorizer_reply(ac, a, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); mutex_unlock(&ac->mutex); return ret; } diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index edb7042479ed..af8ae507e861 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -70,7 +70,9 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end) * authenticate state, so nothing happens here. */ static int handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len) { struct ceph_auth_none_info *xi = ac->private; diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 425508d4dafd..a265792642dc 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -269,22 +269,21 @@ out: static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, struct ceph_crypto_key *secret, - void *buf, void *end) + void **p, void *end) { - void *p = buf; u8 reply_struct_v; u32 num; int ret; - ceph_decode_8_safe(&p, end, reply_struct_v, bad); + ceph_decode_8_safe(p, end, reply_struct_v, bad); if (reply_struct_v != 1) return -EINVAL; - ceph_decode_32_safe(&p, end, num, bad); + ceph_decode_32_safe(p, end, num, bad); dout("%d tickets\n", num); while (num--) { - ret = process_one_ticket(ac, secret, &p, end); + ret = process_one_ticket(ac, secret, p, end); if (ret) return ret; } @@ -527,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; - auth->struct_v = 1; + auth->struct_v = 2; /* nautilus+ */ auth->key = 0; for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++) auth->key ^= *(__le64 *)u; @@ -540,6 +539,10 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; + /* nautilus+: request service tickets at the same time */ + need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH; + WARN_ON(!need); + ceph_encode_32_safe(&p, end, need, e_range); return p - buf; } @@ -566,8 +569,82 @@ e_range: return -ERANGE; } +static int handle_auth_session_key(struct ceph_auth_client *ac, + void **p, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_x_info *xi = ac->private; + struct ceph_x_ticket_handler *th; + void *dp, *dend; + int len; + int ret; + + /* AUTH ticket */ + ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end); + if (ret) + return ret; + + if (*p == end) { + /* pre-nautilus (or didn't request service tickets!) 
*/ + WARN_ON(session_key || con_secret); + return 0; + } + + th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + + if (session_key) { + memcpy(session_key, th->session_key.key, th->session_key.len); + *session_key_len = th->session_key.len; + } + + /* connection secret */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s connection secret blob len %d\n", __func__, len); + if (len > 0) { + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(&th->session_key, p, *p + len); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ceph_decode_32_safe(&dp, dend, len, e_inval); + if (len > CEPH_MAX_CON_SECRET_LEN) { + pr_err("connection secret too big %d\n", len); + return -EINVAL; + } + + dout("%s connection secret len %d\n", __func__, len); + if (con_secret) { + memcpy(con_secret, dp, len); + *con_secret_len = len; + } + } + + /* service tickets */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s service tickets blob len %d\n", __func__, len); + if (len > 0) { + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, + p, *p + len); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_info *xi = ac->private; struct ceph_x_ticket_handler *th; @@ -599,8 +676,10 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, dout("handle_reply op %d result %d\n", op, result); switch (op) { case CEPHX_GET_AUTH_SESSION_KEY: - /* verify auth key */ - ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end); + /* AUTH ticket + [connection secret] + service tickets */ + ret = handle_auth_session_key(ac, &p, end, session_key, + session_key_len, con_secret, + con_secret_len); break; case CEPHX_GET_PRINCIPAL_SESSION_KEY: @@ -608,7 +687,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, if (IS_ERR(th)) return PTR_ERR(th); - ret = ceph_x_proc_ticket_reply(ac, &th->session_key, p, end); + /* service tickets */ + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end); break; default: @@ -687,40 +767,44 @@ static int ceph_x_update_authorizer( return 0; } -static int decrypt_authorize_challenge(struct ceph_x_authorizer *au, - void *challenge_buf, - int challenge_buf_len, - u64 *server_challenge) +/* + * CephXAuthorizeChallenge + */ +static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret, + void *challenge, int challenge_len, + u64 *server_challenge) { - struct ceph_x_authorize_challenge *ch = - challenge_buf + sizeof(struct ceph_x_encrypt_header); + void *dp, *dend; int ret; /* no leading len */ - ret = __ceph_x_decrypt(&au->session_key, challenge_buf, - challenge_buf_len); + ret = __ceph_x_decrypt(secret, challenge, challenge_len); if (ret < 0) return ret; - if (ret < sizeof(*ch)) { - pr_err("bad size %d for ceph_x_authorize_challenge\n", ret); - return -EINVAL; - } - *server_challenge = le64_to_cpu(ch->server_challenge); + dout("%s decrypted %d bytes\n", __func__, ret); + dp = challenge + sizeof(struct ceph_x_encrypt_header); + dend = dp + ret; + + ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */ + ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval); + dout("%s server_challenge %llu\n", __func__, *server_challenge); return 0; + +e_inval: + return -EINVAL; } static int 
ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, - void *challenge_buf, - int challenge_buf_len) + void *challenge, int challenge_len) { struct ceph_x_authorizer *au = (void *)a; u64 server_challenge; int ret; - ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len, - &server_challenge); + ret = decrypt_authorizer_challenge(&au->session_key, challenge, + challenge_len, &server_challenge); if (ret) { pr_err("failed to decrypt authorize challenge: %d", ret); return ret; @@ -735,29 +819,76 @@ static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, return 0; } +/* + * CephXAuthorizeReply + */ +static int decrypt_authorizer_reply(struct ceph_crypto_key *secret, + void **p, void *end, u64 *nonce_plus_one, + u8 *con_secret, int *con_secret_len) +{ + void *dp, *dend; + u8 struct_v; + int len; + int ret; + + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(secret, p, end); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ceph_decode_8_safe(&dp, dend, struct_v, e_inval); + ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval); + dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one); + if (struct_v >= 2) { + ceph_decode_32_safe(&dp, dend, len, e_inval); + if (len > CEPH_MAX_CON_SECRET_LEN) { + pr_err("connection secret too big %d\n", len); + return -EINVAL; + } + + dout("%s connection secret len %d\n", __func__, len); + if (con_secret) { + memcpy(con_secret, dp, len); + *con_secret_len = len; + } + } + + return 0; + +e_inval: + return -EINVAL; +} + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_authorizer *au = (void *)a; - void *p = au->enc_buf; - struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset(); + u64 nonce_plus_one; int ret; - ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN); - if (ret < 0) + if (session_key) { + memcpy(session_key, au->session_key.key, au->session_key.len); + *session_key_len = au->session_key.len; + } + + ret = decrypt_authorizer_reply(&au->session_key, &reply, + reply + reply_len, &nonce_plus_one, + con_secret, con_secret_len); + if (ret) return ret; - if (ret < sizeof(*reply)) { - pr_err("bad size %d for ceph_x_authorize_reply\n", ret); - return -EINVAL; + + if (nonce_plus_one != au->nonce + 1) { + pr_err("failed to authenticate server\n"); + return -EPERM; } - if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one)) - ret = -EPERM; - else - ret = 0; - dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", - au->nonce, le64_to_cpu(reply->nonce_plus_one), ret); - return ret; + return 0; } static void ceph_x_reset(struct ceph_auth_client *ac) diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 24b0b74564d0..792fcb974dc3 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -38,7 +38,8 @@ struct ceph_x_authenticate { __u8 struct_v; __le64 client_challenge; __le64 key; - /* ticket blob */ + /* old_ticket blob */ + /* nautilus+: other_keys */ } __attribute__ ((packed)); struct ceph_x_service_ticket_request { diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 96ef4d860bc9..13bd526349fa 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -5,6 +5,9 @@ #include #include +#define CEPH_KEY_LEN 16 +#define CEPH_MAX_CON_SECRET_LEN 64 + /* * 
cryptographic secret */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7901ab6c79fd..8966eae543d3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5623,8 +5623,11 @@ static int verify_authorizer_reply(struct ceph_connection *con) struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; - return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } static int invalidate_authorizer(struct ceph_connection *con) -- cgit From 59711f9ec219bf5245a8e95989803fb503adc52d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 26 Oct 2020 17:01:53 +0100 Subject: libceph: amend cephx init_protocol() and build_request() In msgr2, initial authentication happens with an exchange of msgr2 control frames -- MAuth message and struct ceph_mon_request_header aren't used. Make that optional. Stop reporting cephx protocol as "x". Use "cephx" instead. Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 1 + net/ceph/auth.c | 63 ++++++++++++++++++++++++-------------------- net/ceph/ceph_strings.c | 14 ++++++++++ 3 files changed, 50 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index d44d98033d58..6d986e52000b 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -95,6 +95,7 @@ struct ceph_dir_layout { #define CEPH_AUTH_UID_DEFAULT ((__u64) -1) +const char *ceph_auth_proto_name(int proto); /********************************************* * message layer diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 40d3d95344d9..deaf267f8942 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -21,15 +21,18 @@ static u32 supported_protocols[] = { CEPH_AUTH_CEPHX }; -static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +static int init_protocol(struct ceph_auth_client *ac, int proto) { - switch (protocol) { + dout("%s proto %d\n", __func__, proto); + + switch (proto) { case CEPH_AUTH_NONE: return ceph_auth_none_init(ac); case CEPH_AUTH_CEPHX: return ceph_x_init(ac); default: - return -ENOENT; + pr_err("bad auth protocol %d\n", proto); + return -EINVAL; } } @@ -145,31 +148,35 @@ bad: goto out; } -static int ceph_build_auth_request(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len) +static int build_request(struct ceph_auth_client *ac, bool add_header, + void *buf, int buf_len) { - struct ceph_mon_request_header *monhdr = msg_buf; - void *p = monhdr + 1; - void *end = msg_buf + msg_len; + void *end = buf + buf_len; + void *p; int ret; - monhdr->have_version = 0; - monhdr->session_mon = cpu_to_le16(-1); - monhdr->session_mon_tid = 0; - - ceph_encode_32(&p, ac->protocol); + p = buf; + if (add_header) { + /* struct ceph_mon_request_header + protocol */ + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_16_safe(&p, end, -1, e_range); + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + } + ceph_encode_need(&p, end, sizeof(u32), e_range); ret = ac->ops->build_request(ac, p + sizeof(u32), end); if (ret < 0) { - pr_err("error %d building auth method %s request\n", ret, - ac->ops->name); - goto out; + pr_err("auth protocol '%s' building request failed: %d\n", + ceph_auth_proto_name(ac->protocol), ret); + 
return ret; } dout(" built request %d bytes\n", ret); ceph_encode_32(&p, ret); - ret = p + ret - msg_buf; -out: - return ret; + return p + ret - buf; + +e_range: + return -ERANGE; } /* @@ -229,10 +236,10 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->ops = NULL; } if (ac->protocol != protocol) { - ret = ceph_auth_init_protocol(ac, protocol); + ret = init_protocol(ac, protocol); if (ret) { - pr_err("error %d on auth protocol %d init\n", - ret, protocol); + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(protocol), ret); goto out; } } @@ -242,11 +249,11 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ret = ac->ops->handle_reply(ac, result, payload, payload_end, NULL, NULL, NULL, NULL); - if (ret == -EAGAIN) { - ret = ceph_build_auth_request(ac, reply_buf, reply_len); - } else if (ret) { - pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - } + if (ret == -EAGAIN) + ret = build_request(ac, true, reply_buf, reply_len); + else if (ret) + pr_err("auth protocol '%s' mauth authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); out: mutex_unlock(&ac->mutex); @@ -265,7 +272,7 @@ int ceph_build_auth(struct ceph_auth_client *ac, mutex_lock(&ac->mutex); if (ac->ops->should_authenticate(ac)) - ret = ceph_build_auth_request(ac, msg_buf, msg_len); + ret = build_request(ac, true, msg_buf, msg_len); mutex_unlock(&ac->mutex); return ret; } diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 10e01494993c..69cd391e02a6 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -18,6 +18,20 @@ const char *ceph_entity_type_name(int type) } EXPORT_SYMBOL(ceph_entity_type_name); +const char *ceph_auth_proto_name(int proto) +{ + switch (proto) { + case CEPH_AUTH_UNKNOWN: + return "unknown"; + case CEPH_AUTH_NONE: + return "none"; + case CEPH_AUTH_CEPHX: + return "cephx"; + default: + return "???"; + } +} + const char *ceph_osd_op_name(int op) { switch (op) { -- cgit From c1c0ce78f479cf4d7dfe72c4c1cabbf0bc0730c9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 26 Oct 2020 17:05:44 +0100 Subject: libceph: drop ac->ops->name field Signed-off-by: Ilya Dryomov --- include/linux/ceph/auth.h | 2 -- net/ceph/auth_none.c | 1 - net/ceph/auth_x.c | 1 - 3 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index d9e7d0bcdaf1..5f64f66309fa 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -32,8 +32,6 @@ struct ceph_auth_handshake { }; struct ceph_auth_client_ops { - const char *name; - /* * true if we are authenticated and can connect to * services. 
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index af8ae507e861..70e86e462250 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -118,7 +118,6 @@ static int ceph_auth_none_create_authorizer( } static const struct ceph_auth_client_ops ceph_auth_none_ops = { - .name = "none", .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index a265792642dc..9815cfe42af0 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -1058,7 +1058,6 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, } static const struct ceph_auth_client_ops ceph_x_ops = { - .name = "x", .is_authenticated = ceph_x_is_authenticated, .should_authenticate = ceph_x_should_authenticate, .build_request = ceph_x_build_request, -- cgit From a5cbd5fc22d5043a8a76e15d75d031fe24d1f69c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 30 Oct 2020 13:30:51 +0100 Subject: libceph, ceph: get and handle cluster maps with addrvecs In preparation for msgr2, make the cluster send us maps with addrvecs including both LEGACY and MSGR2 addrs instead of a single LEGACY addr. This means advertising support for SERVER_NAUTILUS and also some older features: SERVER_MIMIC, MONENC and MONNAMES. MONNAMES and MONENC are actually pre-argonaut, we just never updated ceph_monmap_decode() for them. Decoding is unconditional, see commit 23c625ce3065 ("libceph: assume argonaut on the server side"). SERVER_MIMIC doesn't bear any meaning for the kernel client. Since ceph_decode_entity_addrvec() is guarded by encoding version checks (and in msgr2 case it is guarded implicitly by the fact that server is speaking msgr2), we assume MSG_ADDR2 for it. Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 2 +- fs/ceph/mdsmap.c | 21 +++--- include/linux/ceph/ceph_features.h | 11 ++- include/linux/ceph/decode.h | 4 + include/linux/ceph/mdsmap.h | 2 +- include/linux/ceph/osdmap.h | 4 +- net/ceph/decode.c | 56 ++++++++++++++ net/ceph/mon_client.c | 145 +++++++++++++++++++++++++++---------- net/ceph/osd_client.c | 4 +- net/ceph/osdmap.c | 45 ++++++++---- 10 files changed, 222 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 278fe67e2617..afd22815fbda 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5014,7 +5014,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) return; } - newmap = ceph_mdsmap_decode(&p, end); + newmap = ceph_mdsmap_decode(&p, end, false); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad_unlock; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 1096d1d3a84c..abd9af7727ad 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -114,7 +114,7 @@ bad: * Ignore any fields we don't care about (there are quite a few of * them). 
*/ -struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) { struct ceph_mdsmap *m; const void *start = *p; @@ -201,18 +201,19 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; - ceph_decode_need(p, end, - 4*sizeof(u32) + sizeof(u64) + - sizeof(addr) + sizeof(struct ceph_timespec), - bad); - mds = ceph_decode_32(p); - inc = ceph_decode_32(p); - state = ceph_decode_32(p); + ceph_decode_32_safe(p, end, mds, bad); + ceph_decode_32_safe(p, end, inc, bad); + ceph_decode_32_safe(p, end, state, bad); *p += sizeof(u64); /* state_seq */ - err = ceph_decode_entity_addr(p, end, &addr); + if (info_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + err = ceph_decode_entity_addr(p, end, &addr); if (err) goto corrupt; - ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + + ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since), + bad); laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 999636d53cf2..3a47acd9cc14 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -8,7 +8,8 @@ * feature. Base case is 1 (first use). */ #define CEPH_FEATURE_INCARNATION_1 (0ull) -#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC #define DEFINE_CEPH_FEATURE(bit, incarnation, name) \ static const uint64_t __maybe_unused CEPH_FEATURE_##name = (1ULL< #include @@ -82,3 +83,58 @@ bad: } EXPORT_SYMBOL(ceph_decode_entity_addr); +/* + * Return addr of desired type (MSGR2 or LEGACY) or error. + * Make sure there is only one match. + * + * Assume encoding with MSG_ADDR2. + */ +int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + __le32 my_type = msgr2 ? 
CEPH_ENTITY_ADDR_TYPE_MSGR2 : + CEPH_ENTITY_ADDR_TYPE_LEGACY; + struct ceph_entity_addr tmp_addr; + int addr_cnt; + bool found; + u8 marker; + int ret; + int i; + + ceph_decode_8_safe(p, end, marker, e_inval); + if (marker != 2) { + pr_err("bad addrvec marker %d\n", marker); + return -EINVAL; + } + + ceph_decode_32_safe(p, end, addr_cnt, e_inval); + + found = false; + for (i = 0; i < addr_cnt; i++) { + ret = ceph_decode_entity_addr(p, end, &tmp_addr); + if (ret) + return ret; + + if (tmp_addr.type == my_type) { + if (found) { + pr_err("another match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -EINVAL; + } + + memcpy(addr, &tmp_addr, sizeof(*addr)); + found = true; + } + } + if (!found && addr_cnt != 0) { + pr_err("no match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -ENOENT; + } + + return 0; + +e_inval: + return -EINVAL; +} +EXPORT_SYMBOL(ceph_decode_entity_addrvec); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index ebfecf8d0918..a9754a7fa78c 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops; static int __validate_auth(struct ceph_mon_client *monc); +static int decode_mon_info(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + void *mon_info_end; + u32 struct_len; + u8 struct_v; + int ret; + + ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v, + &struct_len); + if (ret) + return ret; + + mon_info_end = *p + struct_len; + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + ret = ceph_decode_entity_addrvec(p, end, msgr2, addr); + if (ret) + return ret; + + *p = mon_info_end; + return 0; + +e_inval: + return -EINVAL; +} + /* * Decode a monmap blob (e.g., during mount). + * + * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC). 
*/ -static struct ceph_monmap *ceph_monmap_decode(void *p, void *end) +static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2) { - struct ceph_monmap *m = NULL; - int i, err = -EINVAL; + struct ceph_monmap *monmap = NULL; struct ceph_fsid fsid; - u32 epoch, num_mon; - u32 len; + u32 struct_len; + int blob_len; + int num_mon; + u8 struct_v; + u32 epoch; + int ret; + int i; + + ceph_decode_32_safe(p, end, blob_len, e_inval); + ceph_decode_need(p, end, blob_len, e_inval); + + ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len); + if (ret) + goto fail; + + dout("%s struct_v %d\n", __func__, struct_v); + ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval); + ceph_decode_32_safe(p, end, epoch, e_inval); + if (struct_v >= 6) { + u32 feat_struct_len; + u8 feat_struct_v; - ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_need(&p, end, len, bad); + *p += sizeof(struct ceph_timespec); /* skip last_changed */ + *p += sizeof(struct ceph_timespec); /* skip created */ - dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p)); - p += sizeof(u16); /* skip version */ + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; - ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); - ceph_decode_copy(&p, &fsid, sizeof(fsid)); - epoch = ceph_decode_32(&p); + *p += feat_struct_len; /* skip persistent_features */ - num_mon = ceph_decode_32(&p); + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; + *p += feat_struct_len; /* skip optional_features */ + } + ceph_decode_32_safe(p, end, num_mon, e_inval); + + dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch, + num_mon); if (num_mon > CEPH_MAX_MON) - goto bad; - m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS); - if (m == NULL) - return ERR_PTR(-ENOMEM); - m->fsid = fsid; - m->epoch = epoch; - m->num_mon = num_mon; - for (i = 0; i < num_mon; ++i) { - struct ceph_entity_inst *inst = &m->mon_inst[i]; - - /* copy name portion */ - ceph_decode_copy_safe(&p, end, &inst->name, - sizeof(inst->name), bad); - err = ceph_decode_entity_addr(&p, end, &inst->addr); - if (err) - goto bad; + goto e_inval; + + monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO); + if (!monmap) { + ret = -ENOMEM; + goto fail; } - dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, - m->num_mon); - for (i = 0; i < m->num_mon; i++) - dout("monmap_decode mon%d is %s\n", i, - ceph_pr_addr(&m->mon_inst[i].addr)); - return m; -bad: - dout("monmap_decode failed with %d\n", err); - kfree(m); - return ERR_PTR(err); + monmap->fsid = fsid; + monmap->epoch = epoch; + monmap->num_mon = num_mon; + + /* legacy_mon_addr map or mon_info map */ + for (i = 0; i < num_mon; i++) { + struct ceph_entity_inst *inst = &monmap->mon_inst[i]; + + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); + + if (struct_v >= 6) + ret = decode_mon_info(p, end, msgr2, &inst->addr); + else + ret = ceph_decode_entity_addr(p, end, &inst->addr); + if (ret) + goto fail; + + dout("%s mon%d addr %s\n", __func__, i, + ceph_pr_addr(&inst->addr)); + } + + return monmap; + +e_inval: + ret = -EINVAL; +fail: + kfree(monmap); + return ERR_PTR(ret); } /* @@ -476,7 +541,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, p = msg->front.iov_base; end = p + msg->front.iov_len; - monmap = ceph_monmap_decode(p, end); + 
monmap = ceph_monmap_decode(&p, end, false); if (IS_ERR(monmap)) { pr_err("problem decoding monmap, %d\n", (int)PTR_ERR(monmap)); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8966eae543d3..51be5a7482fc 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3918,9 +3918,9 @@ static int handle_one_map(struct ceph_osd_client *osdc, set_pool_was_full(osdc); if (incremental) - newmap = osdmap_apply_incremental(&p, end, osdc->osdmap); + newmap = osdmap_apply_incremental(&p, end, false, osdc->osdmap); else - newmap = ceph_osdmap_decode(&p, end); + newmap = ceph_osdmap_decode(&p, end, false); if (IS_ERR(newmap)) return PTR_ERR(newmap); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index fa08c15be0c0..2b1dd252f231 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1647,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end, /* * decode a full map. */ -static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) +static int osdmap_decode(void **p, void *end, bool msgr2, + struct ceph_osdmap *map) { u8 struct_v; u32 epoch = 0; @@ -1718,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto e_inval; for (i = 0; i < map->max_osd; i++) { - err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]); + struct ceph_entity_addr *addr = &map->osd_addr[i]; + + if (struct_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, addr); + else + err = ceph_decode_entity_addr(p, end, addr); if (err) goto bad; + + dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr)); } /* pg_temp */ @@ -1790,7 +1798,7 @@ bad: /* * Allocate and decode a full map. */ -struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2) { struct ceph_osdmap *map; int ret; @@ -1799,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) if (!map) return ERR_PTR(-ENOMEM); - ret = osdmap_decode(p, end, map); + ret = osdmap_decode(p, end, msgr2, map); if (ret) { ceph_osdmap_destroy(map); return ERR_PTR(ret); @@ -1817,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, - struct ceph_osdmap *map) + bool msgr2, struct ceph_osdmap *map) { void *new_up_client; void *new_state; void *new_weight_end; u32 len; + int ret; int i; new_up_client = *p; @@ -1831,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_entity_addr addr; ceph_decode_skip_32(p, end, e_inval); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; } new_state = *p; @@ -1874,7 +1887,6 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, while (len--) { s32 osd; u32 xorstate; - int ret; osd = ceph_decode_32(p); if (struct_v >= 5) @@ -1910,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, osd = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; + + dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); + pr_info("osd%d up\n", osd); map->osd_state[osd] 
|= CEPH_OSD_EXISTS | CEPH_OSD_UP; map->osd_addr[osd] = addr; @@ -1927,7 +1946,7 @@ e_inval: /* * decode and apply an incremental map update. */ -struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { struct ceph_fsid fsid; @@ -1962,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return ceph_osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end), msgr2); } /* new crush? */ @@ -2014,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_up_client, new_state, new_weight */ - err = decode_new_up_state_weight(p, end, struct_v, map); + err = decode_new_up_state_weight(p, end, struct_v, msgr2, map); if (err) goto bad; -- cgit From 313771e80fd253d4b5472e61a2d12b03c5293aa9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 25 Nov 2020 14:41:59 +0100 Subject: libceph, rbd: ignore addr->type while comparing in some cases For libceph, this ensures that libceph instance sharing (share option) continues to work. For rbd, this avoids blocklisting alive lock owners (locker addr is always LEGACY, while watcher addr is ANY in nautilus). Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 8 ++++++-- include/linux/ceph/msgr.h | 9 ++++++++- net/ceph/mon_client.c | 6 ++++-- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f84128abade3..bec85c054522 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3957,8 +3957,12 @@ static int find_watcher(struct rbd_device *rbd_dev, sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); for (i = 0; i < num_watchers; i++) { - if (!memcmp(&watchers[i].addr, &locker->info.addr, - sizeof(locker->info.addr)) && + /* + * Ignore addr->type while comparing. This mimics + * entity_addr_t::get_legacy_str() + strcmp(). + */ + if (ceph_addr_equal_no_type(&watchers[i].addr, + &locker->info.addr) && watchers[i].cookie == cookie) { struct rbd_client_id cid = { .gid = le64_to_cpu(watchers[i].name.num), diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 46939485f2c3..9a897a60f20b 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -52,11 +52,18 @@ extern const char *ceph_entity_type_name(int type); * entity_addr -- network address */ struct ceph_entity_addr { - __le32 type; + __le32 type; /* CEPH_ENTITY_ADDR_TYPE_* */ __le32 nonce; /* unique id for process (e.g. 
pid) */ struct sockaddr_storage in_addr; } __attribute__ ((packed)); +static inline bool ceph_addr_equal_no_type(const struct ceph_entity_addr *lhs, + const struct ceph_entity_addr *rhs) +{ + return !memcmp(&lhs->in_addr, &rhs->in_addr, sizeof(lhs->in_addr)) && + lhs->nonce == rhs->nonce; +} + struct ceph_entity_inst { struct ceph_entity_name name; struct ceph_entity_addr addr; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index a9754a7fa78c..f5f090b4e409 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -161,9 +161,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) { int i; - for (i = 0; i < m->num_mon; i++) - if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) + for (i = 0; i < m->num_mon; i++) { + if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr)) return 1; + } + return 0; } -- cgit From 00498b994113a871a556f7ff24a4cf8a00611700 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 19 Nov 2020 16:04:58 +0100 Subject: libceph: introduce connection modes and ms_mode option msgr2 supports two connection modes: crc (plain) and secure (on-wire encryption). Connection mode is picked by server based on input from client. Introduce ms_mode option: ms_mode=legacy - msgr1 (default) ms_mode=crc - crc mode, if denied fail ms_mode=secure - secure mode, if denied fail ms_mode=prefer-crc - crc mode, if denied agree to secure mode ms_mode=prefer-secure - secure mode, if denied agree to crc mode ms_mode affects all connections, we don't separate connections to mons like it's done in userspace with ms_client_mode vs ms_mon_client_mode. For now the default is legacy, to be flipped to prefer-crc after some time. Signed-off-by: Ilya Dryomov --- include/linux/ceph/auth.h | 8 ++++-- include/linux/ceph/ceph_fs.h | 6 +++++ include/linux/ceph/libceph.h | 1 + net/ceph/auth.c | 12 ++++++--- net/ceph/ceph_common.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ net/ceph/ceph_strings.c | 14 ++++++++++ net/ceph/mon_client.c | 4 +-- 7 files changed, 100 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 5f64f66309fa..6fc058fe9efa 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -98,11 +98,15 @@ struct ceph_auth_client { const struct ceph_crypto_key *key; /* our secret key */ unsigned want_keys; /* which services we want */ + int preferred_mode; /* CEPH_CON_MODE_* */ + int fallback_mode; /* ditto */ + struct mutex mutex; }; -extern struct ceph_auth_client *ceph_auth_init(const char *name, - const struct ceph_crypto_key *key); +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes); extern void ceph_auth_destroy(struct ceph_auth_client *ac); extern void ceph_auth_reset(struct ceph_auth_client *ac); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 6d986e52000b..ce22d5469670 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -93,9 +93,15 @@ struct ceph_dir_layout { #define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_CEPHX 0x2 +/* msgr2 protocol modes */ +#define CEPH_CON_MODE_UNKNOWN 0x0 +#define CEPH_CON_MODE_CRC 0x1 +#define CEPH_CON_MODE_SECURE 0x2 + #define CEPH_AUTH_UID_DEFAULT ((__u64) -1) const char *ceph_auth_proto_name(int proto); +const char *ceph_con_mode_name(int mode); /********************************************* * message layer diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 
eb5a7ca13f9c..8765a5ad267a 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -53,6 +53,7 @@ struct ceph_options { unsigned long osd_keepalive_timeout; /* jiffies */ unsigned long osd_request_timeout; /* jiffies */ u32 read_from_replica; /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */ + int con_modes[2]; /* CEPH_CON_MODE_* */ /* * any type that can't be simply compared or doesn't need diff --git a/net/ceph/auth.c b/net/ceph/auth.c index deaf267f8942..4a0f32b32cc6 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -39,13 +39,13 @@ static int init_protocol(struct ceph_auth_client *ac, int proto) /* * setup, teardown. */ -struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key) +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes) { struct ceph_auth_client *ac; int ret; - dout("auth_init name '%s'\n", name); - ret = -ENOMEM; ac = kzalloc(sizeof(*ac), GFP_NOFS); if (!ac) @@ -57,8 +57,12 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp ac->name = name; else ac->name = CEPH_AUTH_NAME_DEFAULT; - dout("auth_init name %s\n", ac->name); ac->key = key; + ac->preferred_mode = con_modes[0]; + ac->fallback_mode = con_modes[1]; + + dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__, + ac->name, ac->preferred_mode, ac->fallback_mode); return ac; out: diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4e7edd707a14..271287c5ec12 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -265,6 +265,7 @@ enum { Opt_ip, Opt_crush_location, Opt_read_from_replica, + Opt_ms_mode, /* string args above */ Opt_share, Opt_crc, @@ -287,6 +288,23 @@ static const struct constant_table ceph_param_read_from_replica[] = { {} }; +enum ceph_ms_mode { + Opt_ms_mode_legacy, + Opt_ms_mode_crc, + Opt_ms_mode_secure, + Opt_ms_mode_prefer_crc, + Opt_ms_mode_prefer_secure +}; + +static const struct constant_table ceph_param_ms_mode[] = { + {"legacy", Opt_ms_mode_legacy}, + {"crc", Opt_ms_mode_crc}, + {"secure", Opt_ms_mode_secure}, + {"prefer-crc", Opt_ms_mode_prefer_crc}, + {"prefer-secure", Opt_ms_mode_prefer_secure}, + {} +}; + static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag ("abort_on_full", Opt_abort_on_full), fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), @@ -305,6 +323,8 @@ static const struct fs_parameter_spec ceph_parameters[] = { fs_param_deprecated, NULL), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), + fsparam_enum ("ms_mode", Opt_ms_mode, + ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), fsparam_flag_no ("share", Opt_share), fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), @@ -333,6 +353,8 @@ struct ceph_options *ceph_alloc_options(void) opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT; + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; return opt; } EXPORT_SYMBOL(ceph_alloc_options); @@ -503,6 +525,32 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, BUG(); } break; + case Opt_ms_mode: + switch (result.uint_32) { + case Opt_ms_mode_legacy: + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = 
CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_prefer_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = CEPH_CON_MODE_SECURE; + break; + case Opt_ms_mode_prefer_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + opt->con_modes[1] = CEPH_CON_MODE_CRC; + break; + default: + BUG(); + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -616,6 +664,21 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) { seq_puts(m, "read_from_replica=localize,"); } + if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) { + if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=secure,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_SECURE) { + seq_puts(m, "ms_mode=prefer-crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_CRC) { + seq_puts(m, "ms_mode=prefer-secure,"); + } + } if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 69cd391e02a6..355fea272120 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -32,6 +32,20 @@ const char *ceph_auth_proto_name(int proto) } } +const char *ceph_con_mode_name(int mode) +{ + switch (mode) { + case CEPH_CON_MODE_UNKNOWN: + return "unknown"; + case CEPH_CON_MODE_CRC: + return "crc"; + case CEPH_CON_MODE_SECURE: + return "secure"; + default: + return "???"; + } +} + const char *ceph_osd_op_name(int op) { switch (op) { diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index f5f090b4e409..792a8c4164d7 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1156,8 +1156,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) /* connection */ /* authentication */ - monc->auth = ceph_auth_init(cl->options->name, - cl->options->key); + monc->auth = ceph_auth_init(cl->options->name, cl->options->key, + cl->options->con_modes); if (IS_ERR(monc->auth)) { err = PTR_ERR(monc->auth); goto out_monmap; -- cgit From cd1a677cad994021b19665ed476aea63f5d54f31 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 19 Nov 2020 16:59:08 +0100 Subject: libceph, ceph: implement msgr2.1 protocol (crc and secure modes) Implement msgr2.1 wire protocol, available since nautilus 14.2.11 and octopus 15.2.5. msgr2.0 wire protocol is not implemented -- it has several security, integrity and robustness issues and therefore considered deprecated. 
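To see how the crc and secure modes tie back to the ms_mode option from
the previous patch: ms_mode yields a (preferred, fallback) pair, and the
msgr2 AUTH exchange retries with the fallback only when the server
rejects the preferred mode.  A rough sketch of that selection logic;
pick_fallback_con_mode() is a hypothetical helper, not a kernel
function (the real handling is spread across messenger_v2.c and the
handle_auth_bad_method hooks):

	/* illustrative only: server replied "bad method", telling us
	 * which connection modes it would accept; the first attempt
	 * always advertised ac->preferred_mode */
	static int pick_fallback_con_mode(struct ceph_auth_client *ac,
					  const int *allowed_modes,
					  int mode_cnt)
	{
		int i;

		if (ac->fallback_mode != CEPH_CON_MODE_UNKNOWN) {
			for (i = 0; i < mode_cnt; i++) {
				if (allowed_modes[i] == ac->fallback_mode)
					return ac->fallback_mode; /* retry */
			}
		}

		return -EACCES;	/* e.g. ms_mode=crc against a
				   secure-only peer */
	}

So ms_mode=prefer-crc, i.e. con_modes = { CRC, SECURE }, first proposes
crc and agrees to secure if that is all the server will take, while
plain ms_mode=crc fails outright in the same situation.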
Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 80 +- include/linux/ceph/auth.h | 36 +- include/linux/ceph/ceph_fs.h | 4 + include/linux/ceph/decode.h | 4 + include/linux/ceph/libceph.h | 9 +- include/linux/ceph/messenger.h | 136 +- include/linux/ceph/msgr.h | 48 + net/ceph/Kconfig | 3 + net/ceph/Makefile | 2 +- net/ceph/auth.c | 309 ++++ net/ceph/decode.c | 45 + net/ceph/messenger.c | 68 +- net/ceph/messenger_v2.c | 3443 ++++++++++++++++++++++++++++++++++++++++ net/ceph/mon_client.c | 115 +- net/ceph/osd_client.c | 85 +- 15 files changed, 4356 insertions(+), 31 deletions(-) create mode 100644 net/ceph/messenger_v2.c (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index afd22815fbda..740d63d0fc50 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5014,7 +5014,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) return; } - newmap = ceph_mdsmap_decode(&p, end, false); + newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client)); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad_unlock; @@ -5196,6 +5196,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } +static int mds_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int mds_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int mds_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int mds_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { @@ -5245,6 +5319,10 @@ static const struct ceph_connection_operations 
mds_con_ops = { .alloc_msg = mds_alloc_msg, .sign_message = mds_sign_message, .check_message_signature = mds_check_message_signature, + .get_auth_request = mds_get_auth_request, + .handle_auth_reply_more = mds_handle_auth_reply_more, + .handle_auth_done = mds_handle_auth_done, + .handle_auth_bad_method = mds_handle_auth_bad_method, }; /* eof */ diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 6fc058fe9efa..3fbe72ebd779 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -120,8 +120,12 @@ int ceph_auth_entity_name_encode(const char *name, void **p, void *end); extern int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len); - extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); + +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode); extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth); @@ -157,4 +161,34 @@ int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth, return auth->check_message_signature(auth, msg); return 0; } + +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len); +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len); +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len); +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len); +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + #endif diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index ce22d5469670..e41a811026f6 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -93,6 +93,10 @@ struct ceph_dir_layout { #define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_CEPHX 0x2 +#define CEPH_AUTH_MODE_NONE 0 +#define CEPH_AUTH_MODE_AUTHORIZER 1 +#define CEPH_AUTH_MODE_MON 10 + /* msgr2 protocol modes */ #define CEPH_CON_MODE_UNKNOWN 0x0 #define CEPH_CON_MODE_CRC 0x1 diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 9a934e04f841..04f3ace5787b 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -221,6 +221,7 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv, #define CEPH_ENTITY_ADDR_TYPE_NONE 0 #define CEPH_ENTITY_ADDR_TYPE_LEGACY __cpu_to_le32(1) #define CEPH_ENTITY_ADDR_TYPE_MSGR2 __cpu_to_le32(2) +#define CEPH_ENTITY_ADDR_TYPE_ANY __cpu_to_le32(3) static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a) { @@ -243,6 +244,9 @@ extern int 
ceph_decode_entity_addr(void **p, void *end, int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, struct ceph_entity_addr *addr); +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr); +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr); + /* * encoders */ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 8765a5ad267a..eb9008bb3992 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -31,10 +31,10 @@ #define CEPH_OPT_FSID (1<<0) #define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ -#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ +#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes (msgr1) */ #define CEPH_OPT_NOMSGAUTH (1<<4) /* don't require msg signing feat */ #define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */ -#define CEPH_OPT_NOMSGSIGN (1<<6) /* don't sign msgs */ +#define CEPH_OPT_NOMSGSIGN (1<<6) /* don't sign msgs (msgr1) */ #define CEPH_OPT_ABORT_ON_FULL (1<<7) /* abort w/ ENOSPC when full */ #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) @@ -84,6 +84,7 @@ struct ceph_options { #define CEPH_MONC_HUNT_BACKOFF 2 #define CEPH_MONC_HUNT_MAX_MULT 10 +#define CEPH_MSG_MAX_CONTROL_LEN (16*1024*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) @@ -152,6 +153,10 @@ struct ceph_client { #define from_msgr(ms) container_of(ms, struct ceph_client, msgr) +static inline bool ceph_msgr2(struct ceph_client *client) +{ + return client->options->con_modes[0] != CEPH_CON_MODE_UNKNOWN; +} /* * snapshots diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 54a64e8dfce6..0e6e9ad3c3bf 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -3,6 +3,7 @@ #define __FS_CEPH_MESSENGER_H #include +#include #include #include #include @@ -52,6 +53,23 @@ struct ceph_connection_operations { int (*sign_message) (struct ceph_msg *msg); int (*check_message_signature) (struct ceph_msg *msg); + + /* msgr2 authentication exchange */ + int (*get_auth_request)(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len); + int (*handle_auth_reply_more)(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len); + int (*handle_auth_done)(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); + int (*handle_auth_bad_method)(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); }; /* use format string %s%lld */ @@ -246,8 +264,15 @@ struct ceph_msg { #define CEPH_CON_S_PREOPEN 2 #define CEPH_CON_S_V1_BANNER 3 #define CEPH_CON_S_V1_CONNECT_MSG 4 -#define CEPH_CON_S_OPEN 5 -#define CEPH_CON_S_STANDBY 6 +#define CEPH_CON_S_V2_BANNER_PREFIX 5 +#define CEPH_CON_S_V2_BANNER_PAYLOAD 6 +#define CEPH_CON_S_V2_HELLO 7 +#define CEPH_CON_S_V2_AUTH 8 +#define CEPH_CON_S_V2_AUTH_SIGNATURE 9 +#define CEPH_CON_S_V2_SESSION_CONNECT 10 +#define CEPH_CON_S_V2_SESSION_RECONNECT 11 +#define CEPH_CON_S_OPEN 12 +#define CEPH_CON_S_STANDBY 13 /* * ceph_connection flag bits @@ -301,6 +326,99 @@ struct ceph_connection_v1_info { u32 peer_global_seq; /* peer's global seq for this connection */ }; +#define CEPH_CRC_LEN 4 +#define CEPH_GCM_KEY_LEN 16 
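/*
 * A 16-byte key selects AES-128 for the gcm(aes) transform used in
 * secure mode; the IV is the 12-byte struct ceph_gcm_nonce defined
 * further down, and every encrypted chunk carries a 16-byte auth tag.
 * The secure preamble defined below therefore works out to
 * 32 (preamble) + 48 (inline buffer) + 16 (tag) = 96 bytes on the wire.
 */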
+#define CEPH_GCM_IV_LEN sizeof(struct ceph_gcm_nonce) +#define CEPH_GCM_BLOCK_LEN 16 +#define CEPH_GCM_TAG_LEN 16 + +#define CEPH_PREAMBLE_LEN 32 +#define CEPH_PREAMBLE_INLINE_LEN 48 +#define CEPH_PREAMBLE_PLAIN_LEN CEPH_PREAMBLE_LEN +#define CEPH_PREAMBLE_SECURE_LEN (CEPH_PREAMBLE_LEN + \ + CEPH_PREAMBLE_INLINE_LEN + \ + CEPH_GCM_TAG_LEN) +#define CEPH_EPILOGUE_PLAIN_LEN (1 + 3 * CEPH_CRC_LEN) +#define CEPH_EPILOGUE_SECURE_LEN (CEPH_GCM_BLOCK_LEN + CEPH_GCM_TAG_LEN) + +#define CEPH_FRAME_MAX_SEGMENT_COUNT 4 + +struct ceph_frame_desc { + int fd_tag; /* FRAME_TAG_* */ + int fd_seg_cnt; + int fd_lens[CEPH_FRAME_MAX_SEGMENT_COUNT]; /* logical */ + int fd_aligns[CEPH_FRAME_MAX_SEGMENT_COUNT]; +}; + +struct ceph_gcm_nonce { + __le32 fixed; + __le64 counter __packed; +}; + +struct ceph_connection_v2_info { + struct iov_iter in_iter; + struct kvec in_kvecs[5]; /* recvmsg */ + struct bio_vec in_bvec; /* recvmsg (in_cursor) */ + int in_kvec_cnt; + int in_state; /* IN_S_* */ + + struct iov_iter out_iter; + struct kvec out_kvecs[8]; /* sendmsg */ + struct bio_vec out_bvec; /* sendpage (out_cursor, out_zero), + sendmsg (out_enc_pages) */ + int out_kvec_cnt; + int out_state; /* OUT_S_* */ + + int out_zero; /* # of zero bytes to send */ + bool out_iter_sendpage; /* use sendpage if possible */ + + struct ceph_frame_desc in_desc; + struct ceph_msg_data_cursor in_cursor; + struct ceph_msg_data_cursor out_cursor; + + struct crypto_shash *hmac_tfm; /* post-auth signature */ + struct crypto_aead *gcm_tfm; /* on-wire encryption */ + struct aead_request *gcm_req; + struct crypto_wait gcm_wait; + struct ceph_gcm_nonce in_gcm_nonce; + struct ceph_gcm_nonce out_gcm_nonce; + + struct page **out_enc_pages; + int out_enc_page_cnt; + int out_enc_resid; + int out_enc_i; + + int con_mode; /* CEPH_CON_MODE_* */ + + void *conn_bufs[16]; + int conn_buf_cnt; + + struct kvec in_sign_kvecs[8]; + struct kvec out_sign_kvecs[8]; + int in_sign_kvec_cnt; + int out_sign_kvec_cnt; + + u64 client_cookie; + u64 server_cookie; + u64 global_seq; + u64 connect_seq; + u64 peer_global_seq; + + u8 in_buf[CEPH_PREAMBLE_SECURE_LEN]; + u8 out_buf[CEPH_PREAMBLE_SECURE_LEN]; + struct { + u8 late_status; /* FRAME_LATE_STATUS_* */ + union { + struct { + u32 front_crc; + u32 middle_crc; + u32 data_crc; + } __packed; + u8 pad[CEPH_GCM_BLOCK_LEN - 1]; + }; + } out_epil; +}; + /* * A single connection with another host. 
* @@ -346,7 +464,10 @@ struct ceph_connection { struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ - struct ceph_connection_v1_info v1; + union { + struct ceph_connection_v1_info v1; + struct ceph_connection_v2_info v2; + }; }; extern struct page *ceph_zero_page; @@ -397,6 +518,15 @@ bool ceph_con_v1_opened(struct ceph_connection *con); void ceph_con_v1_reset_session(struct ceph_connection *con); void ceph_con_v1_reset_protocol(struct ceph_connection *con); +/* messenger_v2.c */ +int ceph_con_v2_try_read(struct ceph_connection *con); +int ceph_con_v2_try_write(struct ceph_connection *con); +void ceph_con_v2_revoke(struct ceph_connection *con); +void ceph_con_v2_revoke_incoming(struct ceph_connection *con); +bool ceph_con_v2_opened(struct ceph_connection *con); +void ceph_con_v2_reset_session(struct ceph_connection *con); +void ceph_con_v2_reset_protocol(struct ceph_connection *con); + extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 9a897a60f20b..f5e02f6c0655 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -14,9 +14,39 @@ * constant. */ #define CEPH_BANNER "ceph v027" +#define CEPH_BANNER_LEN 9 #define CEPH_BANNER_MAX_LEN 30 +/* + * messenger V2 connection banner prefix. + * The full banner string should have the form: "ceph v2\n<le16>" + * the 2 bytes are the length of the remaining banner. + */ +#define CEPH_BANNER_V2 "ceph v2\n" +#define CEPH_BANNER_V2_LEN 8 +#define CEPH_BANNER_V2_PREFIX_LEN (CEPH_BANNER_V2_LEN + sizeof(__le16)) + +/* + * messenger V2 features + */ +#define CEPH_MSGR2_INCARNATION_1 (0ull) + +#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \ + static const uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \ + static const uint64_t CEPH_MSGR2_FEATUREMASK_##name = \ + (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation); + +#define HAVE_MSGR2_FEATURE(x, name) \ + (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name)) + +DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1 + +#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + +#define CEPH_MSGR2_REQUIRED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + + /* * Rollover-safe type and comparator for 32-bit sequence numbers. * Comparator returns -1, 0, or 1. @@ -158,6 +188,24 @@ struct ceph_msg_header { __le32 crc; /* header crc32c */ } __attribute__ ((packed)); +struct ceph_msg_header2 { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 data_pre_padding_len; + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + __le64 ack_seq; + __u8 flags; + /* oldest code we think can decode this. unknown if zero.
*/ + __le16 compat_version; + __le16 reserved; +} __attribute__ ((packed)); + #define CEPH_MSG_PRIO_LOW 64 #define CEPH_MSG_PRIO_DEFAULT 127 #define CEPH_MSG_PRIO_HIGH 196 diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index f36f9a3a4e20..c5c4eef3a9ff 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -5,6 +5,9 @@ config CEPH_LIB select LIBCRC32C select CRYPTO_AES select CRYPTO_CBC + select CRYPTO_GCM + select CRYPTO_HMAC + select CRYPTO_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/Makefile b/net/ceph/Makefile index df02bd8d6c7b..8802a0c0155d 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -15,4 +15,4 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ auth_x.o \ ceph_strings.o ceph_hash.o \ pagevec.o snapshot.o string_table.o \ - messenger_v1.o + messenger_v1.o messenger_v2.o diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 4a0f32b32cc6..6b315c8212b1 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -293,6 +293,39 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac) } EXPORT_SYMBOL(ceph_auth_is_authenticated); +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode) +{ + int ret; + + mutex_lock(&ac->mutex); + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) + ret = ac->ops->create_authorizer(ac, peer_type, auth); + else if (ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, auth); + else + ret = 0; + if (ret) + goto out; + + *proto = ac->protocol; + if (pref_mode && fallb_mode) { + *pref_mode = ac->preferred_mode; + *fallb_mode = ac->fallback_mode; + } + +out: + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(__ceph_auth_get_authorizer); + int ceph_auth_create_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth) @@ -369,3 +402,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) mutex_unlock(&ac->mutex); } EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); + +/* + * msgr2 authentication + */ + +static bool contains(const int *arr, int cnt, int val) +{ + int i; + + for (i = 0; i < cnt; i++) { + if (arr[i] == val) + return true; + } + + return false; +} + +static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode) +{ + WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN); + if (fallb_mode != CEPH_CON_MODE_UNKNOWN) { + ceph_encode_32_safe(p, end, 2, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + ceph_encode_32_safe(p, end, fallb_mode, e_range); + } else { + ceph_encode_32_safe(p, end, 1, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + } + + return 0; + +e_range: + return -ERANGE; +} + +/* + * Similar to ceph_auth_build_hello(). + */ +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len) +{ + int proto = ac->key ? 
CEPH_AUTH_CEPHX : CEPH_AUTH_NONE; + void *end = buf + buf_len; + void *lenp; + void *p; + int ret; + + mutex_lock(&ac->mutex); + if (ac->protocol == CEPH_AUTH_UNKNOWN) { + ret = init_protocol(ac, proto); + if (ret) { + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(proto), ret); + goto out; + } + } else { + WARN_ON(ac->protocol != proto); + ac->ops->reset(ac); + } + + p = buf; + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode); + if (ret) + goto out; + + lenp = p; + p += 4; /* space for len */ + + ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range); + ret = ceph_auth_entity_name_encode(ac->name, &p, end); + if (ret) + goto out; + + ceph_encode_64_safe(&p, end, ac->global_id, e_range); + ceph_encode_32(&lenp, p - lenp - 4); + ret = p - buf; + +out: + mutex_unlock(&ac->mutex); + return ret; + +e_range: + ret = -ERANGE; + goto out; +} + +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len) +{ + int ret; + + mutex_lock(&ac->mutex); + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, false, buf, buf_len); + else + WARN_ON(ret >= 0); + mutex_unlock(&ac->mutex); + return ret; +} + +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + int ret; + + mutex_lock(&ac->mutex); + if (global_id && ac->global_id != global_id) { + dout("%s global_id %llu -> %llu\n", __func__, ac->global_id, + global_id); + ac->global_id = global_id; + } + + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + mutex_unlock(&ac->mutex); + return ret; +} + +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed\n", + ceph_auth_proto_name(ac->protocol)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed\n", + ceph_con_mode_name(ac->preferred_mode)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed\n", + ceph_con_mode_name(ac->fallback_mode)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' msgr authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + int pref_mode, fallb_mode; + int proto; + void *p; + int ret; + + ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto, + &pref_mode, &fallb_mode); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, proto, e_range); + ret = encode_con_modes(&p, end, 
pref_mode, fallb_mode); + if (ret) + return ret; + + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_get_authorizer); + +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + void *p; + int ret; + + ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer, + reply, reply_len); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more); + +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done); + +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed by %s\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->preferred_mode), + ceph_entity_type_name(peer_type)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->fallback_mode), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' authorization to %s failed: %d\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type), result); + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} +EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer); diff --git a/net/ceph/decode.c b/net/ceph/decode.c index 6429b6713507..b44f7651be04 100644 --- a/net/ceph/decode.c +++ b/net/ceph/decode.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include + #include static int @@ -138,3 +140,46 @@ e_inval: return -EINVAL; } EXPORT_SYMBOL(ceph_decode_entity_addrvec); + +static int get_sockaddr_encoding_len(sa_family_t family) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } u; + + switch (family) { + case AF_INET: + return sizeof(u.sin); + case AF_INET6: + return sizeof(u.sin6); + default: + return sizeof(u); + } +} + +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len; 
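	/*
	 * The sum above mirrors ceph_encode_entity_addr() below: a 1-byte
	 * marker, the versioned-encoding header written by
	 * ceph_start_encoding() (CEPH_ENCODING_START_BLK_LEN bytes), a
	 * 4-byte type, a 4-byte nonce and a 4-byte sockaddr length,
	 * followed by addr_len bytes of sockaddr (2-byte family plus
	 * address data).
	 */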
+} + +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + ceph_encode_8(p, 1); /* marker */ + ceph_start_encoding(p, 1, 1, sizeof(addr->type) + + sizeof(addr->nonce) + + sizeof(u32) + addr_len); + ceph_encode_copy(p, &addr->type, sizeof(addr->type)); + ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce)); + + ceph_encode_32(p, addr_len); + ceph_encode_16(p, family); + ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family)); +} diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4fb3c33a7b03..57d043b382ed 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -195,8 +195,11 @@ EXPORT_SYMBOL(ceph_pr_addr); void ceph_encode_my_addr(struct ceph_messenger *msgr) { - memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); - ceph_encode_banner_addr(&msgr->my_enc_addr); + if (!ceph_msgr2(from_msgr(msgr))) { + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, + sizeof(msgr->my_enc_addr)); + ceph_encode_banner_addr(&msgr->my_enc_addr); + } } /* @@ -513,7 +516,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) con->out_msg = NULL; } - ceph_con_v1_reset_protocol(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_protocol(con); + else + ceph_con_v1_reset_protocol(con); } /* @@ -526,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg) ceph_msg_put(msg); } + static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { @@ -547,7 +554,10 @@ void ceph_con_reset_session(struct ceph_connection *con) con->in_seq = 0; con->in_seq_acked = 0; - ceph_con_v1_reset_session(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_session(con); + else + ceph_con_v1_reset_session(con); } /* @@ -600,6 +610,9 @@ EXPORT_SYMBOL(ceph_con_open); */ bool ceph_con_opened(struct ceph_connection *con) { + if (ceph_msgr2(from_msgr(con->msgr))) + return ceph_con_v2_opened(con); + return ceph_con_v1_opened(con); } @@ -1302,7 +1315,16 @@ int ceph_parse_ips(const char *c, const char *end, } ceph_addr_set_port(&addr[i], port); + /* + * We want the type to be set according to ms_mode + * option, but options are normally parsed after mon + * addresses. Rather than complicating parsing, set + * to LEGACY and override in build_initial_monmap() + * for mon addresses and ceph_messenger_init() for + * ip option. 
+ */ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; + addr[i].nonce = 0; dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); @@ -1410,6 +1432,13 @@ static bool con_sock_closed(struct ceph_connection *con) CASE(PREOPEN); CASE(V1_BANNER); CASE(V1_CONNECT_MSG); + CASE(V2_BANNER_PREFIX); + CASE(V2_BANNER_PAYLOAD); + CASE(V2_HELLO); + CASE(V2_AUTH); + CASE(V2_AUTH_SIGNATURE); + CASE(V2_SESSION_CONNECT); + CASE(V2_SESSION_RECONNECT); CASE(OPEN); CASE(STANDBY); default: @@ -1494,7 +1523,10 @@ static void ceph_con_workfn(struct work_struct *work) BUG_ON(con->sock); } - ret = ceph_con_v1_try_read(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_read(con); + else + ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -1504,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work) break; } - ret = ceph_con_v1_try_write(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_write(con); + else + ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -1538,9 +1573,8 @@ static void con_fault(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; - WARN_ON(con->state != CEPH_CON_S_V1_BANNER && - con->state != CEPH_CON_S_V1_CONNECT_MSG && - con->state != CEPH_CON_S_OPEN); + WARN_ON(con->state == CEPH_CON_S_STANDBY || + con->state == CEPH_CON_S_CLOSED); ceph_con_reset_protocol(con); @@ -1596,7 +1630,11 @@ void ceph_messenger_init(struct ceph_messenger *msgr, ceph_addr_set_port(&msgr->inst.addr, 0); } - msgr->inst.addr.type = 0; + /* + * Since nautilus, clients are identified using type ANY. + * For msgr1, ceph_encode_banner_addr() munges it to NONE. + */ + msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; /* generate a random non-zero nonce */ do { @@ -1706,7 +1744,10 @@ void ceph_msg_revoke(struct ceph_msg *msg) if (con->out_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was sending\n", __func__, con, msg); - ceph_con_v1_revoke(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke(con); + else + ceph_con_v1_revoke(con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } else { @@ -1732,7 +1773,10 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) if (con->in_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was recving\n", __func__, con, msg); - ceph_con_v1_revoke_incoming(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke_incoming(con); + else + ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } else { diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c new file mode 100644 index 000000000000..5e38c847317b --- /dev/null +++ b/net/ceph/messenger_v2.c @@ -0,0 +1,3443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ceph msgr2 protocol implementation + * + * Copyright (C) 2020 Ilya Dryomov + */ + +#include + +#include +#include /* for crypto_memneq() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */ + +#define FRAME_TAG_HELLO 1 +#define FRAME_TAG_AUTH_REQUEST 2 +#define FRAME_TAG_AUTH_BAD_METHOD 3 +#define FRAME_TAG_AUTH_REPLY_MORE 4 +#define FRAME_TAG_AUTH_REQUEST_MORE 5 +#define FRAME_TAG_AUTH_DONE 6 +#define FRAME_TAG_AUTH_SIGNATURE 7 +#define FRAME_TAG_CLIENT_IDENT 8 +#define FRAME_TAG_SERVER_IDENT 9 +#define FRAME_TAG_IDENT_MISSING_FEATURES 10 +#define 
FRAME_TAG_SESSION_RECONNECT 11 +#define FRAME_TAG_SESSION_RESET 12 +#define FRAME_TAG_SESSION_RETRY 13 +#define FRAME_TAG_SESSION_RETRY_GLOBAL 14 +#define FRAME_TAG_SESSION_RECONNECT_OK 15 +#define FRAME_TAG_WAIT 16 +#define FRAME_TAG_MESSAGE 17 +#define FRAME_TAG_KEEPALIVE2 18 +#define FRAME_TAG_KEEPALIVE2_ACK 19 +#define FRAME_TAG_ACK 20 + +#define FRAME_LATE_STATUS_ABORTED 0x1 +#define FRAME_LATE_STATUS_COMPLETE 0xe +#define FRAME_LATE_STATUS_ABORTED_MASK 0xf + +#define IN_S_HANDLE_PREAMBLE 1 +#define IN_S_HANDLE_CONTROL 2 +#define IN_S_HANDLE_CONTROL_REMAINDER 3 +#define IN_S_PREPARE_READ_DATA 4 +#define IN_S_PREPARE_READ_DATA_CONT 5 +#define IN_S_HANDLE_EPILOGUE 6 +#define IN_S_FINISH_SKIP 7 + +#define OUT_S_QUEUE_DATA 1 +#define OUT_S_QUEUE_DATA_CONT 2 +#define OUT_S_QUEUE_ENC_PAGE 3 +#define OUT_S_QUEUE_ZEROS 4 +#define OUT_S_FINISH_MESSAGE 5 +#define OUT_S_GET_NEXT 6 + +#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN) +#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN) +#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN) +#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN) + +#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static int do_recvmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_recvmsg(sock, &msg, msg.msg_flags); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +/* + * Read as much as possible. + * + * Return: + * 1 - done, nothing (else) to read + * 0 - socket is empty, need to wait + * <0 - error + */ +static int ceph_tcp_recv(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p %s %zu\n", __func__, con, + iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need", + iov_iter_count(&con->v2.in_iter)); + ret = do_recvmsg(con->sock, &con->v2.in_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.in_iter)); + return ret; +} + +static int do_sendmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_sendmsg(sock, &msg); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +static int do_try_sendpage(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + struct bio_vec bv; + int ret; + + if (WARN_ON(!iov_iter_is_bvec(it))) + return -EINVAL; + + while (iov_iter_count(it)) { + /* iov_iter_iovec() for ITER_BVEC */ + bv.bv_page = it->bvec->bv_page; + bv.bv_offset = it->bvec->bv_offset + it->iov_offset; + bv.bv_len = min(iov_iter_count(it), + it->bvec->bv_len - it->iov_offset); + + /* + * sendpage cannot properly handle pages with + * page_count == 0, we need to fall back to sendmsg if + * that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag + * which triggers one of hardened usercopy checks. 
+ */ + if (sendpage_ok(bv.bv_page)) { + ret = sock->ops->sendpage(sock, bv.bv_page, + bv.bv_offset, bv.bv_len, + CEPH_MSG_FLAGS); + } else { + iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len); + ret = sock_sendmsg(sock, &msg); + } + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + return 1; +} + +/* + * Write as much as possible. The socket is expected to be corked, + * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here. + * + * Return: + * 1 - done, nothing (else) to write + * 0 - socket is full, need to wait + * <0 - error + */ +static int ceph_tcp_send(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p have %zu try_sendpage %d\n", __func__, con, + iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage); + if (con->v2.out_iter_sendpage) + ret = do_try_sendpage(con->sock, &con->v2.out_iter); + else + ret = do_sendmsg(con->sock, &con->v2.out_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.out_iter)); + return ret; +} + +static void add_in_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf; + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len; + con->v2.in_kvec_cnt++; + + con->v2.in_iter.nr_segs++; + con->v2.in_iter.count += len; +} + +static void reset_in_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_kvec_cnt = 0; + iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0); +} + +static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_bvec = *bv; + iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len); +} + +static void set_in_skip(struct ceph_connection *con, int len) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + dout("%s con %p len %d\n", __func__, con, len); + iov_iter_discard(&con->v2.in_iter, READ, len); +} + +static void add_out_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf; + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len; + con->v2.out_kvec_cnt++; + + con->v2.out_iter.nr_segs++; + con->v2.out_iter.count += len; +} + +static void reset_out_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvec_cnt = 0; + + iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0); + con->v2.out_iter_sendpage = false; +} + +static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv, + bool zerocopy) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_bvec = *bv; + con->v2.out_iter_sendpage = zerocopy; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void set_out_bvec_zero(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(!con->v2.out_zero); + + con->v2.out_bvec.bv_page = ceph_zero_page; + con->v2.out_bvec.bv_offset = 0; + con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE); + con->v2.out_iter_sendpage = true; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, 
+ con->v2.out_bvec.bv_len); +} + +static void out_zero_add(struct ceph_connection *con, int len) +{ + dout("%s con %p len %d\n", __func__, con, len); + con->v2.out_zero += len; +} + +static void *alloc_conn_buf(struct ceph_connection *con, int len) +{ + void *buf; + + dout("%s con %p len %d\n", __func__, con, len); + + if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) + return NULL; + + buf = ceph_kvmalloc(len, GFP_NOIO); + if (!buf) + return NULL; + + con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf; + return buf; +} + +static void free_conn_bufs(struct ceph_connection *con) +{ + while (con->v2.conn_buf_cnt) + kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]); +} + +static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs)); + + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf; + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len; + con->v2.in_sign_kvec_cnt++; +} + +static void clear_in_sign_kvecs(struct ceph_connection *con) +{ + con->v2.in_sign_kvec_cnt = 0; +} + +static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs)); + + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf; + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len; + con->v2.out_sign_kvec_cnt++; +} + +static void clear_out_sign_kvecs(struct ceph_connection *con) +{ + con->v2.out_sign_kvec_cnt = 0; +} + +static bool con_secure(struct ceph_connection *con) +{ + return con->v2.con_mode == CEPH_CON_MODE_SECURE; +} + +static int front_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.front_len); +} + +static int middle_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.middle_len); +} + +static int data_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.data_len); +} + +static bool need_padding(int len) +{ + return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN); +} + +static int padded_len(int len) +{ + return ALIGN(len, CEPH_GCM_BLOCK_LEN); +} + +static int padding_len(int len) +{ + return padded_len(len) - len; +} + +/* preamble + control segment */ +static int head_onwire_len(int ctrl_len, bool secure) +{ + int head_len; + int rem_len; + + if (secure) { + head_len = CEPH_PREAMBLE_SECURE_LEN; + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN; + } + } else { + head_len = CEPH_PREAMBLE_PLAIN_LEN; + if (ctrl_len) + head_len += ctrl_len + CEPH_CRC_LEN; + } + return head_len; +} + +/* front, middle and data segments + epilogue */ +static int __tail_onwire_len(int front_len, int middle_len, int data_len, + bool secure) +{ + if (!front_len && !middle_len && !data_len) + return 0; + + if (!secure) + return front_len + middle_len + data_len + + CEPH_EPILOGUE_PLAIN_LEN; + + return padded_len(front_len) + padded_len(middle_len) + + padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN; +} + +static int tail_onwire_len(const struct ceph_msg *msg, bool secure) +{ + return __tail_onwire_len(front_len(msg), middle_len(msg), + data_len(msg), secure); +} + +/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */ +#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \ + sizeof(struct ceph_msg_header2) + \ + CEPH_CRC_LEN) + +static const int frame_aligns[] = { + sizeof(void *), + sizeof(void *), + sizeof(void *), + PAGE_SIZE +}; + +/* + * Discards 
trailing empty segments, unless there is just one segment. + * A frame always has at least one (possibly empty) segment. + */ +static int calc_segment_count(const int *lens, int len_cnt) +{ + int i; + + for (i = len_cnt - 1; i >= 0; i--) { + if (lens[i]) + return i + 1; + } + + return 1; +} + +static void init_frame_desc(struct ceph_frame_desc *desc, int tag, + const int *lens, int len_cnt) +{ + int i; + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = tag; + desc->fd_seg_cnt = calc_segment_count(lens, len_cnt); + BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT); + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = lens[i]; + desc->fd_aligns[i] = frame_aligns[i]; + } +} + +/* + * Preamble crc covers everything up to itself (28 bytes) and + * is calculated and verified irrespective of the connection mode + * (i.e. even if the frame is encrypted). + */ +static void encode_preamble(const struct ceph_frame_desc *desc, void *p) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + void *start = p; + int i; + + memset(p, 0, CEPH_PREAMBLE_LEN); + + ceph_encode_8(&p, desc->fd_tag); + ceph_encode_8(&p, desc->fd_seg_cnt); + for (i = 0; i < desc->fd_seg_cnt; i++) { + ceph_encode_32(&p, desc->fd_lens[i]); + ceph_encode_16(&p, desc->fd_aligns[i]); + } + + put_unaligned_le32(crc32c(0, start, crcp - start), crcp); +} + +static int decode_preamble(void *p, struct ceph_frame_desc *desc) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + u32 crc, expected_crc; + int i; + + crc = crc32c(0, p, crcp - p); + expected_crc = get_unaligned_le32(crcp); + if (crc != expected_crc) { + pr_err("bad preamble crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = ceph_decode_8(&p); + desc->fd_seg_cnt = ceph_decode_8(&p); + if (desc->fd_seg_cnt < 1 || + desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) { + pr_err("bad segment count %d\n", desc->fd_seg_cnt); + return -EINVAL; + } + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = ceph_decode_32(&p); + desc->fd_aligns[i] = ceph_decode_16(&p); + } + + /* + * This would fire for FRAME_TAG_WAIT (it has one empty + * segment), but we should never get it as client. + */ + if (!desc->fd_lens[desc->fd_seg_cnt - 1]) { + pr_err("last segment empty\n"); + return -EINVAL; + } + + if (desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) { + pr_err("control segment too big %d\n", desc->fd_lens[0]); + return -EINVAL; + } + if (desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) { + pr_err("front segment too big %d\n", desc->fd_lens[1]); + return -EINVAL; + } + if (desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) { + pr_err("middle segment too big %d\n", desc->fd_lens[2]); + return -EINVAL; + } + if (desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) { + pr_err("data segment too big %d\n", desc->fd_lens[3]); + return -EINVAL; + } + + return 0; +} + +static void encode_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; + cpu_to_le32s(&con->v2.out_epil.front_crc); + cpu_to_le32s(&con->v2.out_epil.middle_crc); + cpu_to_le32s(&con->v2.out_epil.data_crc); +} + +static void encode_epilogue_secure(struct ceph_connection *con, bool aborted) +{ + memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil)); + con->v2.out_epil.late_status = aborted ? 
FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; +} + +static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc, + u32 *data_crc) +{ + u8 late_status; + + late_status = ceph_decode_8(&p); + if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) != + FRAME_LATE_STATUS_COMPLETE) { + /* we should never get an aborted message as client */ + pr_err("bad late_status 0x%x\n", late_status); + return -EINVAL; + } + + if (front_crc && middle_crc && data_crc) { + *front_crc = ceph_decode_32(&p); + *middle_crc = ceph_decode_32(&p); + *data_crc = ceph_decode_32(&p); + } + + return 0; +} + +static void fill_header(struct ceph_msg_header *hdr, + const struct ceph_msg_header2 *hdr2, + int front_len, int middle_len, int data_len, + const struct ceph_entity_name *peer_name) +{ + hdr->seq = hdr2->seq; + hdr->tid = hdr2->tid; + hdr->type = hdr2->type; + hdr->priority = hdr2->priority; + hdr->version = hdr2->version; + hdr->front_len = cpu_to_le32(front_len); + hdr->middle_len = cpu_to_le32(middle_len); + hdr->data_len = cpu_to_le32(data_len); + hdr->data_off = hdr2->data_off; + hdr->src = *peer_name; + hdr->compat_version = hdr2->compat_version; + hdr->reserved = 0; + hdr->crc = 0; +} + +static void fill_header2(struct ceph_msg_header2 *hdr2, + const struct ceph_msg_header *hdr, u64 ack_seq) +{ + hdr2->seq = hdr->seq; + hdr2->tid = hdr->tid; + hdr2->type = hdr->type; + hdr2->priority = hdr->priority; + hdr2->version = hdr->version; + hdr2->data_pre_padding_len = 0; + hdr2->data_off = hdr->data_off; + hdr2->ack_seq = cpu_to_le64(ack_seq); + hdr2->flags = 0; + hdr2->compat_version = hdr->compat_version; + hdr2->reserved = 0; +} + +static int verify_control_crc(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + u32 crc, expected_crc; + + WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN); + + crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len); + expected_crc = get_unaligned_le32(con->v2.in_kvecs[1].iov_base); + if (crc != expected_crc) { + pr_err("bad control crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + return 0; +} + +static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc, + u32 middle_crc, u32 data_crc) +{ + if (front_len(con->in_msg)) { + con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base, + front_len(con->in_msg)); + } else { + WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg)); + con->in_front_crc = -1; + } + + if (middle_len(con->in_msg)) + con->in_middle_crc = crc32c(-1, + con->in_msg->middle->vec.iov_base, + middle_len(con->in_msg)); + else if (data_len(con->in_msg)) + con->in_middle_crc = -1; + else + con->in_middle_crc = 0; + + if (!data_len(con->in_msg)) + con->in_data_crc = 0; + + dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg, + con->in_front_crc, con->in_middle_crc, con->in_data_crc); + + if (con->in_front_crc != front_crc) { + pr_err("bad front crc, calculated %u, expected %u\n", + con->in_front_crc, front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != middle_crc) { + pr_err("bad middle crc, calculated %u, expected %u\n", + con->in_middle_crc, middle_crc); + return -EBADMSG; + } + if (con->in_data_crc != data_crc) { + pr_err("bad data crc, calculated %u, expected %u\n", + con->in_data_crc, data_crc); + return -EBADMSG; + } + + return 0; +} + +static int setup_crypto(struct ceph_connection *con, + u8 *session_key, int session_key_len, + u8 *con_secret, int con_secret_len) +{ + unsigned int 
noio_flag; + void *p; + int ret; + + dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n", + __func__, con, con->v2.con_mode, session_key_len, con_secret_len); + WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req); + + if (con->v2.con_mode != CEPH_CON_MODE_CRC && + con->v2.con_mode != CEPH_CON_MODE_SECURE) { + pr_err("bad con_mode %d\n", con->v2.con_mode); + return -EINVAL; + } + + if (!session_key_len) { + WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC); + WARN_ON(con_secret_len); + return 0; /* auth_none */ + } + + noio_flag = memalloc_noio_save(); + con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.hmac_tfm)) { + ret = PTR_ERR(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + pr_err("failed to allocate hmac tfm context: %d\n", ret); + return ret; + } + + WARN_ON((unsigned long)session_key & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key, + session_key_len); + if (ret) { + pr_err("failed to set hmac key: %d\n", ret); + return ret; + } + + if (con->v2.con_mode == CEPH_CON_MODE_CRC) { + WARN_ON(con_secret_len); + return 0; /* auth_x, plain mode */ + } + + if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) { + pr_err("con_secret too small %d\n", con_secret_len); + return -EINVAL; + } + + noio_flag = memalloc_noio_save(); + con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.gcm_tfm)) { + ret = PTR_ERR(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + pr_err("failed to allocate gcm tfm context: %d\n", ret); + return ret; + } + + p = con_secret; + WARN_ON((unsigned long)p & crypto_aead_alignmask(con->v2.gcm_tfm)); + ret = crypto_aead_setkey(con->v2.gcm_tfm, p, CEPH_GCM_KEY_LEN); + if (ret) { + pr_err("failed to set gcm key: %d\n", ret); + return ret; + } + + p += CEPH_GCM_KEY_LEN; + WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN); + ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN); + if (ret) { + pr_err("failed to set gcm tag size: %d\n", ret); + return ret; + } + + con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO); + if (!con->v2.gcm_req) { + pr_err("failed to allocate gcm request\n"); + return -ENOMEM; + } + + crypto_init_wait(&con->v2.gcm_wait); + aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &con->v2.gcm_wait); + + memcpy(&con->v2.in_gcm_nonce, p, CEPH_GCM_IV_LEN); + memcpy(&con->v2.out_gcm_nonce, p + CEPH_GCM_IV_LEN, CEPH_GCM_IV_LEN); + return 0; /* auth_x, secure mode */ +} + +static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs, + int kvec_cnt, u8 *hmac) +{ + SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ + int ret; + int i; + + dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con, + con->v2.hmac_tfm, kvec_cnt); + + if (!con->v2.hmac_tfm) { + memset(hmac, 0, SHA256_DIGEST_SIZE); + return 0; /* auth_none */ + } + + desc->tfm = con->v2.hmac_tfm; + ret = crypto_shash_init(desc); + if (ret) + return ret; + + for (i = 0; i < kvec_cnt; i++) { + WARN_ON((unsigned long)kvecs[i].iov_base & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_update(desc, kvecs[i].iov_base, + kvecs[i].iov_len); + if (ret) + return ret; + } + + ret = crypto_shash_final(desc, hmac); + if (ret) + return ret; + + shash_desc_zero(desc); + return 0; /* auth_x, both plain and secure modes */ +} + +static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce) +{ + 
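	/*
	 * Only the little-endian 64-bit counter half of the 12-byte nonce
	 * moves; the 32-bit fixed part stays as handed out in the
	 * connection secret. gcm_crypt() keeps separate in/out nonces and
	 * bumps the relevant one after each successful AEAD op, so a
	 * (key, nonce) pair is never reused within a session.
	 */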
u64 counter; + + counter = le64_to_cpu(nonce->counter); + nonce->counter = cpu_to_le64(counter + 1); +} + +static int gcm_crypt(struct ceph_connection *con, bool encrypt, + struct scatterlist *src, struct scatterlist *dst, + int src_len) +{ + struct ceph_gcm_nonce *nonce; + int ret; + + nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce; + + aead_request_set_ad(con->v2.gcm_req, 0); /* no AAD */ + aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce); + ret = crypto_wait_req(encrypt ? crypto_aead_encrypt(con->v2.gcm_req) : + crypto_aead_decrypt(con->v2.gcm_req), + &con->v2.gcm_wait); + if (ret) + return ret; + + gcm_inc_nonce(nonce); + return 0; +} + +static void get_bvec_at(struct ceph_msg_data_cursor *cursor, + struct bio_vec *bv) +{ + struct page *page; + size_t off, len; + + WARN_ON(!cursor->total_resid); + + /* skip zero-length data items */ + while (!cursor->resid) + ceph_msg_data_advance(cursor, 0); + + /* get a piece of data, cursor isn't advanced */ + page = ceph_msg_data_next(cursor, &off, &len, NULL); + + bv->bv_page = page; + bv->bv_offset = off; + bv->bv_len = len; +} + +static int calc_sg_cnt(void *buf, int buf_len) +{ + int sg_cnt; + + if (!buf_len) + return 0; + + sg_cnt = need_padding(buf_len) ? 1 : 0; + if (is_vmalloc_addr(buf)) { + WARN_ON(offset_in_page(buf)); + sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT; + } else { + sg_cnt++; + } + + return sg_cnt; +} + +static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + int sg_cnt; + + if (!data_len) + return 0; + + sg_cnt = need_padding(data_len) ? 1 : 0; + do { + get_bvec_at(cursor, &bv); + sg_cnt++; + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + return sg_cnt; +} + +static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad) +{ + void *end = buf + buf_len; + struct page *page; + int len; + void *p; + + if (!buf_len) + return; + + if (is_vmalloc_addr(buf)) { + p = buf; + do { + page = vmalloc_to_page(p); + len = min_t(int, end - p, PAGE_SIZE); + WARN_ON(!page || !len || offset_in_page(p)); + sg_set_page(*sg, page, len, 0); + *sg = sg_next(*sg); + p += len; + } while (p != end); + } else { + sg_set_buf(*sg, buf, buf_len); + *sg = sg_next(*sg); + } + + if (need_padding(buf_len)) { + sg_set_buf(*sg, pad, padding_len(buf_len)); + *sg = sg_next(*sg); + } +} + +static void init_sgs_cursor(struct scatterlist **sg, + struct ceph_msg_data_cursor *cursor, u8 *pad) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + + if (!data_len) + return; + + do { + get_bvec_at(cursor, &bv); + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + *sg = sg_next(*sg); + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + if (need_padding(data_len)) { + sg_set_buf(*sg, pad, padding_len(data_len)); + *sg = sg_next(*sg); + } +} + +static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, + u8 *front_pad, u8 *middle_pad, u8 *data_pad, + void *epilogue, bool add_tag) +{ + struct ceph_msg_data_cursor cursor; + struct scatterlist *cur_sg; + int sg_cnt; + int ret; + + if (!front_len(msg) && !middle_len(msg) && !data_len(msg)) + return 0; + + sg_cnt = 1; /* epilogue + [auth tag] */ + if (front_len(msg)) + sg_cnt += calc_sg_cnt(msg->front.iov_base, + front_len(msg)); + if (middle_len(msg)) + sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base, + middle_len(msg)); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + 
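		/*
		 * First pass: walk a scratch cursor just to count the sg
		 * entries the data items will need; the cursor is
		 * re-initialized below for the second pass that actually
		 * populates the table.
		 */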
sg_cnt += calc_sg_cnt_cursor(&cursor); + } + + ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO); + if (ret) + return ret; + + cur_sg = sgt->sgl; + if (front_len(msg)) + init_sgs(&cur_sg, msg->front.iov_base, front_len(msg), + front_pad); + if (middle_len(msg)) + init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg), + middle_pad); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + init_sgs_cursor(&cur_sg, &cursor, data_pad); + } + + WARN_ON(!sg_is_last(cur_sg)); + sg_set_buf(cur_sg, epilogue, + CEPH_GCM_BLOCK_LEN + (add_tag ? CEPH_GCM_TAG_LEN : 0)); + return 0; +} + +static int decrypt_preamble(struct ceph_connection *con) +{ + struct scatterlist sg; + + sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN); + return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN); +} + +static int decrypt_control_remainder(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + + WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len); + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len); + sg_set_buf(&sgs[1], con->v2.in_buf, pt_len); + + return gcm_crypt(con, false, sgs, sgs, + padded_len(rem_len) + CEPH_GCM_TAG_LEN); +} + +static int decrypt_message(struct ceph_connection *con) +{ + struct sg_table sgt = {}; + int ret; + + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), + MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), + con->v2.in_buf, true); + if (ret) + goto out; + + ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl, + tail_onwire_len(con->in_msg, true)); + +out: + sg_free_table(&sgt); + return ret; +} + +static int prepare_banner(struct ceph_connection *con) +{ + int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8; + void *buf, *p; + + buf = alloc_conn_buf(con, buf_len); + if (!buf) + return -ENOMEM; + + p = buf; + ceph_encode_copy(&p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN); + ceph_encode_16(&p, sizeof(u64) + sizeof(u64)); + ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES); + ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES); + WARN_ON(p != buf + buf_len); + + add_out_kvec(con, buf, buf_len); + add_out_sign_kvec(con, buf, buf_len); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for control crc + * + * extdata (optional): + * control body (extdata_len bytes) + * + * Compute control crc and gather base and extdata into: + * + * preamble + * control body (ctrl_len + extdata_len bytes) + * control crc + * + * Preamble should already be encoded at the start of base. 
+ */ +static void prepare_head_plain(struct ceph_connection *con, void *base, + int ctrl_len, void *extdata, int extdata_len, + bool to_be_signed) +{ + int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN; + void *crcp = base + base_len - CEPH_CRC_LEN; + u32 crc; + + crc = crc32c(-1, CTRL_BODY(base), ctrl_len); + if (extdata_len) + crc = crc32c(crc, extdata, extdata_len); + put_unaligned_le32(crc, crcp); + + if (!extdata_len) { + add_out_kvec(con, base, base_len); + if (to_be_signed) + add_out_sign_kvec(con, base, base_len); + return; + } + + add_out_kvec(con, base, crcp - base); + add_out_kvec(con, extdata, extdata_len); + add_out_kvec(con, crcp, CEPH_CRC_LEN); + if (to_be_signed) { + add_out_sign_kvec(con, base, crcp - base); + add_out_sign_kvec(con, extdata, extdata_len); + add_out_sign_kvec(con, crcp, CEPH_CRC_LEN); + } +} + +static int prepare_head_secure_small(struct ceph_connection *con, + void *base, int ctrl_len) +{ + struct scatterlist sg; + int ret; + + /* inline buffer padding? */ + if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN) + memset(CTRL_BODY(base) + ctrl_len, 0, + CEPH_PREAMBLE_INLINE_LEN - ctrl_len); + + sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN); + ret = gcm_crypt(con, true, &sg, &sg, + CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN); + if (ret) + return ret; + + add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for padding, if needed + * space for control remainder auth tag + * space for preamble auth tag + * + * Encrypt preamble and the inline portion, then encrypt the remainder + * and gather into: + * + * preamble + * control body (48 bytes) + * preamble auth tag + * control body (ctrl_len - 48 bytes) + * zero padding, if needed + * control remainder auth tag + * + * Preamble should already be encoded at the start of base. + */ +static int prepare_head_secure_big(struct ceph_connection *con, + void *base, int ctrl_len) +{ + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN; + void *rem_tag = rem + padded_len(rem_len); + void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + int ret; + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], base, rem - base); + sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN); + ret = gcm_crypt(con, true, sgs, sgs, rem - base); + if (ret) + return ret; + + /* control remainder padding? 
*/ + if (need_padding(rem_len)) + memset(rem + rem_len, 0, padding_len(rem_len)); + + sg_init_one(&sgs[0], rem, pmbl_tag - rem); + ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem); + if (ret) + return ret; + + add_out_kvec(con, base, rem - base); + add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN); + add_out_kvec(con, rem, pmbl_tag - rem); + return 0; +} + +static int __prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len, void *extdata, + int extdata_len, bool to_be_signed) +{ + int total_len = ctrl_len + extdata_len; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag, + total_len, ctrl_len, extdata_len); + + /* extdata may be vmalloc'ed but not base */ + if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len)) + return -EINVAL; + + init_frame_desc(&desc, tag, &total_len, 1); + encode_preamble(&desc, base); + + if (con_secure(con)) { + if (WARN_ON(extdata_len || to_be_signed)) + return -EINVAL; + + if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN) + /* fully inlined, inline buffer may need padding */ + ret = prepare_head_secure_small(con, base, ctrl_len); + else + /* partially inlined, inline buffer is full */ + ret = prepare_head_secure_big(con, base, ctrl_len); + if (ret) + return ret; + } else { + prepare_head_plain(con, base, ctrl_len, extdata, extdata_len, + to_be_signed); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +static int prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len) +{ + return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false); +} + +static int prepare_hello(struct ceph_connection *con) +{ + void *buf, *p; + int ctrl_len; + + ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr); + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT); + ceph_encode_entity_addr(&p, &con->peer_addr); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len, + NULL, 0, true); +} + +/* so that head_onwire_len(AUTH_BUF_LEN, false) is 512 */ +#define AUTH_BUF_LEN (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN) + +static int prepare_auth_request(struct ceph_connection *con) +{ + void *authorizer, *authorizer_copy; + int ctrl_len, authorizer_len; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + mutex_unlock(&con->mutex); + ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_HELLO) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p get_auth_request ret %d\n", __func__, con, ret); + if (ret) + return ret; + + authorizer_copy = alloc_conn_buf(con, authorizer_len); + if (!authorizer_copy) + return -ENOMEM; + + memcpy(authorizer_copy, authorizer, authorizer_len); + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len, + authorizer_copy, authorizer_len, true); +} + +static int prepare_auth_request_more(struct ceph_connection *con, + void *reply, int reply_len) +{ + int ctrl_len, authorizer_len; + void *authorizer; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + mutex_unlock(&con->mutex); + ret = 
con->ops->handle_auth_reply_more(con, reply, reply_len, + CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret); + if (ret) + return ret; + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf, + ctrl_len, authorizer, authorizer_len, true); +} + +static int prepare_auth_signature(struct ceph_connection *con) +{ + void *buf; + int ret; + + buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, false)); + if (!buf) + return -ENOMEM; + + ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); + if (ret) + return ret; + + return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, + SHA256_DIGEST_SIZE); +} + +static int prepare_client_ident(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_client *client = from_msgr(con->msgr); + u64 global_id = ceph_client_gid(client); + void *buf, *p; + int ctrl_len; + + WARN_ON(con->v2.server_cookie); + WARN_ON(con->v2.connect_seq); + WARN_ON(con->v2.peer_global_seq); + + if (!con->v2.client_cookie) { + do { + get_random_bytes(&con->v2.client_cookie, + sizeof(con->v2.client_cookie)); + } while (!con->v2.client_cookie); + dout("%s con %p generated cookie 0x%llx\n", __func__, con, + con->v2.client_cookie); + } else { + dout("%s con %p cookie already set 0x%llx\n", __func__, con, + con->v2.client_cookie); + } + + dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce), + global_id, con->v2.global_seq, client->supported_features, + client->required_features, con->v2.client_cookie); + + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + + ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* addrvec marker */ + ceph_encode_32(&p, 1); /* addr_cnt */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_entity_addr(&p, &con->peer_addr); + ceph_encode_64(&p, global_id); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, client->supported_features); + ceph_encode_64(&p, client->required_features); + ceph_encode_64(&p, 0); /* flags */ + ceph_encode_64(&p, con->v2.client_cookie); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len); +} + +static int prepare_session_reconnect(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + void *buf, *p; + int ctrl_len; + + WARN_ON(!con->v2.client_cookie); + WARN_ON(!con->v2.server_cookie); + WARN_ON(!con->v2.connect_seq); + WARN_ON(!con->v2.peer_global_seq); + + dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq, + con->v2.connect_seq, con->in_seq); + + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if 
(!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* entity_addrvec_t marker */ + ceph_encode_32(&p, 1); /* my_addrs len */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_64(&p, con->v2.client_cookie); + ceph_encode_64(&p, con->v2.server_cookie); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, con->v2.connect_seq); + ceph_encode_64(&p, con->in_seq); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len); +} + +static int prepare_keepalive2(struct ceph_connection *con) +{ + struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf); + struct timespec64 now; + + ktime_get_real_ts64(&now); + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec, + now.tv_nsec); + + ceph_encode_timespec64(ts, &now); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_KEEPALIVE2, con->v2.out_buf, + sizeof(struct ceph_timespec)); +} + +static int prepare_ack(struct ceph_connection *con) +{ + void *p; + + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + p = CTRL_BODY(con->v2.out_buf); + ceph_encode_64(&p, con->in_seq_acked); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8); +} + +static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con, + con->out_msg, aborted, con->v2.out_epil.front_crc, + con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc); + + encode_epilogue_plain(con, aborted); + add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN); +} + +/* + * For "used" empty segments, crc is -1. For unused (trailing) + * segments, crc is 0. + */ +static void prepare_message_plain(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + prepare_head_plain(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2), NULL, 0, false); + + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. + */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return; + } + + con->v2.out_epil.front_crc = -1; + con->v2.out_epil.middle_crc = -1; + con->v2.out_state = OUT_S_QUEUE_DATA; + return; + } + + if (front_len(msg)) { + con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base, + front_len(msg)); + add_out_kvec(con, msg->front.iov_base, front_len(msg)); + } else { + /* middle (at least) is there, checked above */ + con->v2.out_epil.front_crc = -1; + } + + if (middle_len(msg)) { + con->v2.out_epil.middle_crc = + crc32c(-1, msg->middle->vec.iov_base, middle_len(msg)); + add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + } else { + con->v2.out_epil.middle_crc = data_len(msg) ? -1 : 0; + } + + if (data_len(msg)) { + con->v2.out_state = OUT_S_QUEUE_DATA; + } else { + con->v2.out_epil.data_crc = 0; + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; + } +} + +/* + * Unfortunately the kernel crypto API doesn't support streaming + * (piecewise) operation for AEAD algorithms, so we can't get away + * with a fixed size buffer and a couple sgs. Instead, we have to + * allocate pages for the entire tail of the message (currently up + * to ~32M) and two sgs arrays (up to ~256K each)... 
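A rough sketch of the sizing behind that comment, reusing the assumed SK_* constants from the earlier sketch: the entire encrypted tail (padded front + padded middle + padded data + secure epilogue) is produced in a single AEAD pass, so pages for all of it have to be allocated up front. The "~256K each" figure is plausible on those assumptions: a ~32M tail at 4 KiB per page is 8192 pages, and at roughly 32 bytes per scatterlist entry that is about 256 KiB per sg array.

/* secure epilogue: one GCM block of status/padding plus the auth tag */
#define SK_EPILOGUE_SECURE_LEN	(SK_GCM_BLOCK_LEN + SK_GCM_TAG_LEN)

static int sk_secure_tail_len(int front_len, int middle_len, int data_len)
{
	if (!front_len && !middle_len && !data_len)
		return 0;	/* empty message: no tail, no epilogue */

	return sk_padded_len(front_len) + sk_padded_len(middle_len) +
	       sk_padded_len(data_len) + SK_EPILOGUE_SECURE_LEN;
}

static int sk_enc_page_cnt(int tail_len, int page_size)
{
	return (tail_len + page_size - 1) / page_size;	/* round up */
}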
+ */ +static int prepare_message_secure(struct ceph_connection *con) +{ + void *zerop = page_address(ceph_zero_page); + struct sg_table enc_sgt = {}; + struct sg_table sgt = {}; + struct page **enc_pages; + int enc_page_cnt; + int tail_len; + int ret; + + ret = prepare_head_secure_small(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2)); + if (ret) + return ret; + + tail_len = tail_onwire_len(con->out_msg, true); + if (!tail_len) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. + */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return 0; + } + + encode_epilogue_secure(con, false); + ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, + &con->v2.out_epil, false); + if (ret) + goto out; + + enc_page_cnt = calc_pages_for(0, tail_len); + enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO); + if (IS_ERR(enc_pages)) { + ret = PTR_ERR(enc_pages); + goto out; + } + + WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = enc_pages; + con->v2.out_enc_page_cnt = enc_page_cnt; + con->v2.out_enc_resid = tail_len; + con->v2.out_enc_i = 0; + + ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt, + 0, tail_len, GFP_NOIO); + if (ret) + goto out; + + ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl, + tail_len - CEPH_GCM_TAG_LEN); + if (ret) + goto out; + + dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con, + con->out_msg, sgt.orig_nents, enc_page_cnt); + con->v2.out_state = OUT_S_QUEUE_ENC_PAGE; + +out: + sg_free_table(&sgt); + sg_free_table(&enc_sgt); + return ret; +} + +static int prepare_message(struct ceph_connection *con) +{ + int lens[] = { + sizeof(struct ceph_msg_header2), + front_len(con->out_msg), + middle_len(con->out_msg), + data_len(con->out_msg) + }; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con, + con->out_msg, lens[0], lens[1], lens[2], lens[3]); + + if (con->in_seq > con->in_seq_acked) { + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + } + + reset_out_kvecs(con); + init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4); + encode_preamble(&desc, con->v2.out_buf); + fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr, + con->in_seq_acked); + + if (con_secure(con)) { + ret = prepare_message_secure(con); + if (ret) + return ret; + } else { + prepare_message_plain(con); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +static int prepare_read_banner_prefix(struct ceph_connection *con) +{ + void *buf; + + buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + con->state = CEPH_CON_S_V2_BANNER_PREFIX; + return 0; +} + +static int prepare_read_banner_payload(struct ceph_connection *con, + int payload_len) +{ + void *buf; + + buf = alloc_conn_buf(con, payload_len); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, payload_len); + add_in_sign_kvec(con, buf, payload_len); + con->state = CEPH_CON_S_V2_BANNER_PAYLOAD; + return 0; +} + +static void prepare_read_preamble(struct ceph_connection *con) +{ + reset_in_kvecs(con); + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? 
CEPH_PREAMBLE_SECURE_LEN : + CEPH_PREAMBLE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_PREAMBLE; +} + +static int prepare_read_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int head_len; + void *buf; + + reset_in_kvecs(con); + if (con->state == CEPH_CON_S_V2_HELLO || + con->state == CEPH_CON_S_V2_AUTH) { + head_len = head_onwire_len(ctrl_len, false); + buf = alloc_conn_buf(con, head_len); + if (!buf) + return -ENOMEM; + + /* preserve preamble */ + memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN); + + add_in_kvec(con, CTRL_BODY(buf), ctrl_len); + add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN); + add_in_sign_kvec(con, buf, head_len); + } else { + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + add_in_kvec(con, buf, ctrl_len); + } else { + add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len); + } + add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN); + } + con->v2.in_state = IN_S_HANDLE_CONTROL; + return 0; +} + +static int prepare_read_control_remainder(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *buf; + + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN); + + reset_in_kvecs(con); + add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len); + add_in_kvec(con, con->v2.in_buf, + padding_len(rem_len) + CEPH_GCM_TAG_LEN); + con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER; + return 0; +} + +static void prepare_read_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg, + data_len(con->in_msg)); + + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; +} + +static void prepare_read_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); + if (con->v2.in_cursor.total_resid) { + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); + return; + } + + /* + * We've read all data. Prepare to read data padding (if any) + * and epilogue. 
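For orientation, the two epilogue layouts being queued for reading here look roughly as follows. The sizes are consistent with how this file uses CEPH_EPILOGUE_PLAIN_LEN (one status byte plus three crcs, 13 bytes) and CEPH_EPILOGUE_SECURE_LEN (one GCM block plus tag, 32 bytes); the struct and field names are illustrative, not taken from the patch.

#include <stdint.h>

/* plain mode: integrity comes from per-segment crc32c values */
struct sk_epilogue_plain {
	uint8_t  late_status;	/* complete vs. aborted (revoked) */
	uint32_t front_crc;	/* little-endian crc32c */
	uint32_t middle_crc;
	uint32_t data_crc;
} __attribute__((packed));

/* secure mode: late_status sits inside one encrypted GCM block (the
 * rest of the block is zero padding) followed by the 16-byte auth
 * tag -- no per-segment crcs, integrity comes from GCM itself */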
+ */ + reset_in_kvecs(con); + if (con_secure(con)) { + if (need_padding(data_len(con->in_msg))) + add_in_kvec(con, DATA_PAD(con->v2.in_buf), + padding_len(data_len(con->in_msg))); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN); + } else { + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + } + con->v2.in_state = IN_S_HANDLE_EPILOGUE; +} + +static void __finish_skip(struct ceph_connection *con) +{ + con->in_seq++; + prepare_read_preamble(con); +} + +static void prepare_skip_message(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int tail_len; + + dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1], + desc->fd_lens[2], desc->fd_lens[3]); + + tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], con_secure(con)); + if (!tail_len) { + __finish_skip(con); + } else { + set_in_skip(con, tail_len); + con->v2.in_state = IN_S_FINISH_SKIP; + } +} + +static int process_banner_prefix(struct ceph_connection *con) +{ + int payload_len; + void *p; + + WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN); + + p = con->v2.in_kvecs[0].iov_base; + if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) { + if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN)) + con->error_msg = "server is speaking msgr1 protocol"; + else + con->error_msg = "protocol error, bad banner"; + return -EINVAL; + } + + p += CEPH_BANNER_V2_LEN; + payload_len = ceph_decode_16(&p); + dout("%s con %p payload_len %d\n", __func__, con, payload_len); + + return prepare_read_banner_payload(con, payload_len); +} + +static int process_banner_payload(struct ceph_connection *con) +{ + void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len; + u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES; + u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES; + u64 server_feat, server_req_feat; + void *p; + int ret; + + p = con->v2.in_kvecs[0].iov_base; + ceph_decode_64_safe(&p, end, server_feat, bad); + ceph_decode_64_safe(&p, end, server_req_feat, bad); + + dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n", + __func__, con, server_feat, server_req_feat); + + if (req_feat & ~server_feat) { + pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + if (server_req_feat & ~feat) { + pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + feat, server_req_feat & ~feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* no reset_out_kvecs() as our banner may still be pending */ + ret = prepare_hello(con); + if (ret) { + pr_err("prepare_hello failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_HELLO; + prepare_read_preamble(con); + return 0; + +bad: + pr_err("failed to decode banner payload\n"); + return -EINVAL; +} + +static int process_hello(struct ceph_connection *con, void *p, void *end) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_entity_addr addr_for_me; + u8 entity_type; + int ret; + + if (con->state != CEPH_CON_S_V2_HELLO) { + con->error_msg = "protocol error, unexpected hello"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, entity_type, bad); + ret = ceph_decode_entity_addr(&p, end, &addr_for_me); + if (ret) { + pr_err("failed to decode addr_for_me: %d\n", ret); + return ret; + } + + dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con, + 
entity_type, ceph_pr_addr(&addr_for_me)); + + if (entity_type != con->peer_name.type) { + pr_err("bad peer type, want %d, got %d\n", + con->peer_name.type, entity_type); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + /* + * Set our address to the address our first peer (i.e. monitor) + * sees that we are connecting from. If we are behind some sort + * of NAT and want to be identified by some private (not NATed) + * address, ip option should be used. + */ + if (ceph_addr_is_blank(my_addr)) { + memcpy(&my_addr->in_addr, &addr_for_me.in_addr, + sizeof(my_addr->in_addr)); + ceph_addr_set_port(my_addr, 0); + dout("%s con %p set my addr %s, as seen by peer %s\n", + __func__, con, ceph_pr_addr(my_addr), + ceph_pr_addr(&con->peer_addr)); + } else { + dout("%s con %p my addr already set %s\n", + __func__, con, ceph_pr_addr(my_addr)); + } + + WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr)); + WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY); + WARN_ON(!my_addr->nonce); + + /* no reset_out_kvecs() as our hello may still be pending */ + ret = prepare_auth_request(con); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_AUTH; + return 0; + +bad: + pr_err("failed to decode hello\n"); + return -EINVAL; +} + +static int process_auth_bad_method(struct ceph_connection *con, + void *p, void *end) +{ + int allowed_protos[8], allowed_modes[8]; + int allowed_proto_cnt, allowed_mode_cnt; + int used_proto, result; + int ret; + int i; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_bad_method"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, used_proto, bad); + ceph_decode_32_safe(&p, end, result, bad); + dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto, + result); + + ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad); + if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) { + pr_err("allowed_protos too big %d\n", allowed_proto_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_proto_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_protos[i], bad); + dout("%s con %p allowed_protos[%d] %d\n", __func__, con, + i, allowed_protos[i]); + } + + ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad); + if (allowed_mode_cnt > ARRAY_SIZE(allowed_modes)) { + pr_err("allowed_modes too big %d\n", allowed_mode_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_mode_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_modes[i], bad); + dout("%s con %p allowed_modes[%d] %d\n", __func__, con, + i, allowed_modes[i]); + } + + mutex_unlock(&con->mutex); + ret = con->ops->handle_auth_bad_method(con, used_proto, result, + allowed_protos, + allowed_proto_cnt, + allowed_modes, + allowed_mode_cnt); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret); + return ret; + +bad: + pr_err("failed to decode auth_bad_method\n"); + return -EINVAL; +} + +static int process_auth_reply_more(struct ceph_connection *con, + void *p, void *end) +{ + int payload_len; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_reply_more"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, payload_len, bad); + ceph_decode_need(&p, end, payload_len, bad); + + dout("%s con %p payload_len %d\n", __func__, con, payload_len); 
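Taken together, the process_*() handlers in this hunk walk the client through the msgr2 handshake. A condensed sketch of the transitions (illustrative table; the real states are the CEPH_CON_S_V2_* values used throughout):

enum sk_state { SK_BANNER, SK_HELLO, SK_AUTH, SK_AUTH_SIGNATURE,
		SK_SESSION_CONNECT, SK_SESSION_RECONNECT, SK_OPEN };

static const struct {
	enum sk_state from, to;
	const char *on;			/* what drives the transition */
} sk_flow[] = {
	{ SK_BANNER,		SK_HELLO,	"banner payload accepted" },
	{ SK_HELLO,		SK_AUTH,	"hello + auth_request sent" },
	{ SK_AUTH,		SK_AUTH,	"auth_reply_more round trip" },
	{ SK_AUTH,		SK_AUTH_SIGNATURE, "auth_done, crypto set up" },
	{ SK_AUTH_SIGNATURE,	SK_SESSION_CONNECT, "no cookie: client_ident" },
	{ SK_AUTH_SIGNATURE,	SK_SESSION_RECONNECT, "cookie: session_reconnect" },
	{ SK_SESSION_CONNECT,	SK_OPEN,	"server_ident accepted" },
	{ SK_SESSION_RECONNECT,	SK_OPEN,	"session_reconnect_ok" },
};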
+ + reset_out_kvecs(con); + ret = prepare_auth_request_more(con, p, payload_len); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request_more failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode auth_reply_more\n"); + return -EINVAL; +} + +static int process_auth_done(struct ceph_connection *con, void *p, void *end) +{ + u8 session_key[CEPH_KEY_LEN]; + u8 con_secret[CEPH_MAX_CON_SECRET_LEN]; + int session_key_len, con_secret_len; + int payload_len; + u64 global_id; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_done"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_32_safe(&p, end, con->v2.con_mode, bad); + ceph_decode_32_safe(&p, end, payload_len, bad); + + dout("%s con %p global_id %llu con_mode %d payload_len %d\n", + __func__, con, global_id, con->v2.con_mode, payload_len); + + mutex_unlock(&con->mutex); + session_key_len = 0; + con_secret_len = 0; + ret = con->ops->handle_auth_done(con, global_id, p, payload_len, + session_key, &session_key_len, + con_secret, &con_secret_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret); + if (ret) + return ret; + + ret = setup_crypto(con, session_key, session_key_len, con_secret, + con_secret_len); + if (ret) + return ret; + + reset_out_kvecs(con); + ret = prepare_auth_signature(con); + if (ret) { + pr_err("prepare_auth_signature failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_AUTH_SIGNATURE; + return 0; + +bad: + pr_err("failed to decode auth_done\n"); + return -EINVAL; +} + +static int process_auth_signature(struct ceph_connection *con, + void *p, void *end) +{ + u8 hmac[SHA256_DIGEST_SIZE]; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) { + con->error_msg = "protocol error, unexpected auth_signature"; + return -EINVAL; + } + + ret = hmac_sha256(con, con->v2.out_sign_kvecs, + con->v2.out_sign_kvec_cnt, hmac); + if (ret) + return ret; + + ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); + if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { + con->error_msg = "integrity error, bad auth signature"; + return -EBADMSG; + } + + dout("%s con %p auth signature ok\n", __func__, con); + + /* no reset_out_kvecs() as our auth_signature may still be pending */ + if (!con->v2.server_cookie) { + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + } else { + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_RECONNECT; + } + + return 0; + +bad: + pr_err("failed to decode auth_signature\n"); + return -EINVAL; +} + +static int process_server_ident(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 features, required_features; + struct ceph_entity_addr addr; + u64 global_seq; + u64 global_id; + u64 cookie; + u64 flags; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected server_ident"; + return -EINVAL; + } + + ret = ceph_decode_entity_addrvec(&p, end, true, &addr); + if (ret) { + pr_err("failed to decode server addrs: %d\n", ret); + return ret; + } + + 
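The signature check above deliberately uses crypto_memneq() instead of memcmp(): a comparison that returns at the first differing byte leaks, through timing, how much of an attacker-supplied HMAC was correct. A minimal constant-time comparison in the same spirit (sketch only; the real kernel helper additionally defends against compiler optimizations):

#include <stddef.h>
#include <stdint.h>

static int sk_memneq(const void *a, const void *b, size_t len)
{
	const uint8_t *pa = a, *pb = b;
	uint8_t diff = 0;

	while (len--)
		diff |= *pa++ ^ *pb++;	/* never exits early */

	return diff != 0;		/* nonzero iff the buffers differ */
}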
ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_64_safe(&p, end, global_seq, bad); + ceph_decode_64_safe(&p, end, features, bad); + ceph_decode_64_safe(&p, end, required_features, bad); + ceph_decode_64_safe(&p, end, flags, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + + dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce), + global_id, global_seq, features, required_features, flags, cookie); + + /* is this who we intended to talk to? */ + if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) { + pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n", + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&addr), le32_to_cpu(addr.nonce)); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + if (client->required_features & ~features) { + pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + features, client->required_features & ~features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* + * Both name->type and name->num are set in ceph_con_open() but + * name->num may be bogus in the initial monmap. name->type is + * verified in handle_hello(). + */ + WARN_ON(!con->peer_name.type); + con->peer_name.num = cpu_to_le64(global_id); + con->v2.peer_global_seq = global_seq; + con->peer_features = features; + WARN_ON(required_features & ~client->supported_features); + con->v2.server_cookie = cookie; + + if (flags & CEPH_MSG_CONNECT_LOSSY) { + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + WARN_ON(con->v2.server_cookie); + } else { + WARN_ON(!con->v2.server_cookie); + } + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode server_ident\n"); + return -EINVAL; +} + +static int process_ident_missing_features(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 missing_features; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected ident_missing_features"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, missing_features, bad); + pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + client->supported_features, missing_features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + +bad: + pr_err("failed to decode ident_missing_features\n"); + return -EINVAL; +} + +static int process_session_reconnect_ok(struct ceph_connection *con, + void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reconnect_ok"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_requeued(con, seq); + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode session_reconnect_ok\n"); + return -EINVAL; +} + +static int process_session_retry(struct ceph_connection *con, + void *p, void *end) +{ + u64 connect_seq; + int ret; + 
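The RADOS feature checks above follow the same mask pattern as the msgr2 banner exchange earlier: each side advertises a supported mask and a required mask, and the connection is refused if either side requires a bit the other does not support. A worked example with hypothetical values:

#include <stdint.h>

static uint64_t sk_missing_bits(uint64_t my_required, uint64_t peer_supported)
{
	return my_required & ~peer_supported;
}

/* e.g. required 0b101, supported 0b011 -> sk_missing_bits() == 0b100,
 * so the connection is refused and the missing bits are reported */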
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, connect_seq, bad); + + dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq); + WARN_ON(connect_seq <= con->v2.connect_seq); + con->v2.connect_seq = connect_seq + 1; + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode session_retry\n"); + return -EINVAL; +} + +static int process_session_retry_global(struct ceph_connection *con, + void *p, void *end) +{ + u64 global_seq; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry_global"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_seq, bad); + + dout("%s con %p global_seq %llu\n", __func__, con, global_seq); + WARN_ON(global_seq <= con->v2.global_seq); + con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq); + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode session_retry_global\n"); + return -EINVAL; +} + +static int process_session_reset(struct ceph_connection *con, + void *p, void *end) +{ + bool full; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reset"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, full, bad); + if (!full) { + con->error_msg = "protocol error, bad session_reset"; + return -EINVAL; + } + + pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident (rst) failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + return 0; + +bad: + pr_err("failed to decode session_reset\n"); + return -EINVAL; +} + +static int process_keepalive2_ack(struct ceph_connection *con, + void *p, void *end) +{ + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected keepalive2_ack"; + return -EINVAL; + } + + ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad); + ceph_decode_timespec64(&con->last_keepalive_ack, p); + + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, + con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec); + + return 0; + +bad: + pr_err("failed to decode keepalive2_ack\n"); + return -EINVAL; +} + +static int process_ack(struct ceph_connection *con, void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected ack"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_sent(con, seq); + return 0; + +bad: + pr_err("failed to decode ack\n"); + return -EINVAL; +} + +static int process_control(struct ceph_connection *con, void *p, 
void *end) +{ + int tag = con->v2.in_desc.fd_tag; + int ret; + + dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p)); + + switch (tag) { + case FRAME_TAG_HELLO: + ret = process_hello(con, p, end); + break; + case FRAME_TAG_AUTH_BAD_METHOD: + ret = process_auth_bad_method(con, p, end); + break; + case FRAME_TAG_AUTH_REPLY_MORE: + ret = process_auth_reply_more(con, p, end); + break; + case FRAME_TAG_AUTH_DONE: + ret = process_auth_done(con, p, end); + break; + case FRAME_TAG_AUTH_SIGNATURE: + ret = process_auth_signature(con, p, end); + break; + case FRAME_TAG_SERVER_IDENT: + ret = process_server_ident(con, p, end); + break; + case FRAME_TAG_IDENT_MISSING_FEATURES: + ret = process_ident_missing_features(con, p, end); + break; + case FRAME_TAG_SESSION_RECONNECT_OK: + ret = process_session_reconnect_ok(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY: + ret = process_session_retry(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY_GLOBAL: + ret = process_session_retry_global(con, p, end); + break; + case FRAME_TAG_SESSION_RESET: + ret = process_session_reset(con, p, end); + break; + case FRAME_TAG_KEEPALIVE2_ACK: + ret = process_keepalive2_ack(con, p, end); + break; + case FRAME_TAG_ACK: + ret = process_ack(con, p, end); + break; + default: + pr_err("bad tag %d\n", tag); + con->error_msg = "protocol error, bad tag"; + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + prepare_read_preamble(con); + return 0; +} + +/* + * Return: + * 1 - con->in_msg set, read message + * 0 - skip message + * <0 - error + */ +static int process_message_header(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + struct ceph_msg_header2 *hdr2 = p; + struct ceph_msg_header hdr; + int skip; + int ret; + u64 seq; + + /* verify seq# */ + seq = le64_to_cpu(hdr2->seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + return 0; + } + if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq)); + + fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], &con->peer_name); + ret = ceph_con_in_msg_alloc(con, &hdr, &skip); + if (ret) + return ret; + + WARN_ON(!con->in_msg ^ skip); + if (skip) + return 0; + + WARN_ON(!con->in_msg); + WARN_ON(con->in_msg->con != con); + return 1; +} + +static int process_message(struct ceph_connection *con) +{ + ceph_con_process_message(con); + + /* + * We could have been closed by ceph_con_close() because + * ceph_con_process_message() temporarily drops con->mutex. 
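This drop-the-mutex / revalidate-the-state dance recurs throughout the file (around get_auth_request, handle_auth_done, peer_reset, and here around ceph_con_process_message()). A generic userspace sketch of the pattern, with hypothetical names:

#include <errno.h>
#include <pthread.h>

struct sk_conn {
	pthread_mutex_t mutex;
	int state;
};

static int sk_call_unlocked(struct sk_conn *con, int expected_state,
			    int (*cb)(struct sk_conn *))
{
	int ret;

	pthread_mutex_unlock(&con->mutex);
	ret = cb(con);		/* may sleep; con may be closed meanwhile */
	pthread_mutex_lock(&con->mutex);

	if (con->state != expected_state)
		return -EAGAIN;	/* caller unwinds instead of continuing */
	return ret;
}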
+ */ + if (con->state != CEPH_CON_S_OPEN) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + prepare_read_preamble(con); + return 0; +} + +static int __handle_control(struct ceph_connection *con, void *p) +{ + void *end = p + con->v2.in_desc.fd_lens[0]; + struct ceph_msg *msg; + int ret; + + if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE) + return process_control(con, p, end); + + ret = process_message_header(con, p, end); + if (ret < 0) + return ret; + if (ret == 0) { + prepare_skip_message(con); + return 0; + } + + msg = con->in_msg; /* set in process_message_header() */ + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) + return process_message(con); + + prepare_read_data(con); + return 0; + } + + reset_in_kvecs(con); + if (front_len(msg)) { + WARN_ON(front_len(msg) > msg->front_alloc_len); + add_in_kvec(con, msg->front.iov_base, front_len(msg)); + msg->front.iov_len = front_len(msg); + + if (con_secure(con) && need_padding(front_len(msg))) + add_in_kvec(con, FRONT_PAD(con->v2.in_buf), + padding_len(front_len(msg))); + } else { + msg->front.iov_len = 0; + } + if (middle_len(msg)) { + WARN_ON(middle_len(msg) > msg->middle->alloc_len); + add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + msg->middle->vec.iov_len = middle_len(msg); + + if (con_secure(con) && need_padding(middle_len(msg))) + add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf), + padding_len(middle_len(msg))); + } else if (msg->middle) { + msg->middle->vec.iov_len = 0; + } + + if (data_len(msg)) { + con->v2.in_state = IN_S_PREPARE_READ_DATA; + } else { + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? CEPH_EPILOGUE_SECURE_LEN : + CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; + } + return 0; +} + +static int handle_preamble(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int ret; + + if (con_secure(con)) { + ret = decrypt_preamble(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad preamble auth tag"; + return ret; + } + } + + ret = decode_preamble(con->v2.in_buf, desc); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad crc"; + else + con->error_msg = "protocol error, bad preamble"; + return ret; + } + + dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__, + con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0], + desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]); + + if (!con_secure(con)) + return prepare_read_control(con); + + if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN) + return prepare_read_control_remainder(con); + + return __handle_control(con, CTRL_BODY(con->v2.in_buf)); +} + +static int handle_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + void *buf; + int ret; + + WARN_ON(con_secure(con)); + + ret = verify_control_crc(con); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + + if (con->state == CEPH_CON_S_V2_AUTH) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len); + return __handle_control(con, buf); + } + + return __handle_control(con, con->v2.in_kvecs[0].iov_base); +} + +static int handle_control_remainder(struct ceph_connection *con) +{ + int ret; + + WARN_ON(!con_secure(con)); + + ret = decrypt_control_remainder(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad control remainder auth tag"; + return ret; + } + + return 
__handle_control(con, con->v2.in_kvecs[0].iov_base - + CEPH_PREAMBLE_INLINE_LEN); +} + +static int handle_epilogue(struct ceph_connection *con) +{ + u32 front_crc, middle_crc, data_crc; + int ret; + + if (con_secure(con)) { + ret = decrypt_message(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad epilogue auth tag"; + return ret; + } + + /* just late_status */ + ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + } else { + ret = decode_epilogue(con->v2.in_buf, &front_crc, + &middle_crc, &data_crc); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + + ret = verify_epilogue_crcs(con, front_crc, middle_crc, + data_crc); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + } + + return process_message(con); +} + +static void finish_skip(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + + if (con_secure(con)) + gcm_inc_nonce(&con->v2.in_gcm_nonce); + + __finish_skip(con); +} + +static int populate_in_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d in_state %d\n", __func__, con, con->state, + con->v2.in_state); + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) { + ret = process_banner_prefix(con); + } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) { + ret = process_banner_payload(con); + } else if ((con->state >= CEPH_CON_S_V2_HELLO && + con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) || + con->state == CEPH_CON_S_OPEN) { + switch (con->v2.in_state) { + case IN_S_HANDLE_PREAMBLE: + ret = handle_preamble(con); + break; + case IN_S_HANDLE_CONTROL: + ret = handle_control(con); + break; + case IN_S_HANDLE_CONTROL_REMAINDER: + ret = handle_control_remainder(con); + break; + case IN_S_PREPARE_READ_DATA: + prepare_read_data(con); + ret = 0; + break; + case IN_S_PREPARE_READ_DATA_CONT: + prepare_read_data_cont(con); + ret = 0; + break; + case IN_S_HANDLE_EPILOGUE: + ret = handle_epilogue(con); + break; + case IN_S_FINISH_SKIP: + finish_skip(con); + ret = 0; + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + return -EINVAL; + } + } else { + WARN(1, "bad state %d", con->state); + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.in_iter)); + return 1; +} + +int ceph_con_v2_try_read(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d need %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_PREOPEN) + return 0; + + /* + * We should always have something pending here. If not, + * avoid calling populate_in_iter() as if we read something + * (ceph_tcp_recv() would immediately return 1). 
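Note that finish_skip() above still bumps the rx nonce: in secure mode every frame consumes one nonce value on each side, wanted or not, otherwise decryption of the next frame would fall out of sync. A sketch of such a nonce; the split into a fixed part and a per-frame counter, like the names, is an assumption for illustration:

#include <stdint.h>

struct sk_gcm_nonce {
	uint32_t fixed;		/* constant part, from the con_secret */
	uint64_t counter;	/* advances once per frame, per direction */
} __attribute__((packed));

static void sk_inc_nonce(struct sk_gcm_nonce *nonce)
{
	/* kept little-endian on the wire; a big-endian host would
	 * convert around the increment */
	nonce->counter++;
}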
+ */ + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + + for (;;) { + ret = ceph_tcp_recv(con); + if (ret <= 0) + return ret; + + ret = populate_in_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "read processing error"; + return ret; + } + } +} + +static void queue_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg, + data_len(con->out_msg)); + + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + con->v2.out_state = OUT_S_QUEUE_DATA_CONT; +} + +static void queue_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len); + if (con->v2.out_cursor.total_resid) { + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + WARN_ON(con->v2.out_state != OUT_S_QUEUE_DATA_CONT); + return; + } + + /* + * We've written all data. Queue epilogue. Once it's written, + * we are done. + */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void queue_enc_page(struct ceph_connection *con) +{ + struct bio_vec bv; + + dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i, + con->v2.out_enc_resid); + WARN_ON(!con->v2.out_enc_resid); + + bv.bv_page = con->v2.out_enc_pages[con->v2.out_enc_i]; + bv.bv_offset = 0; + bv.bv_len = min(con->v2.out_enc_resid, (int)PAGE_SIZE); + + set_out_bvec(con, &bv, false); + con->v2.out_enc_i++; + con->v2.out_enc_resid -= bv.bv_len; + + if (con->v2.out_enc_resid) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE); + return; + } + + /* + * We've queued the last piece of ciphertext (ending with + * epilogue) + auth tag. Once it's written, we are done. + */ + WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void queue_zeros(struct ceph_connection *con) +{ + dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero); + + if (con->v2.out_zero) { + set_out_bvec_zero(con); + con->v2.out_zero -= con->v2.out_bvec.bv_len; + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + /* + * We've zero-filled everything up to epilogue. Queue epilogue + * with late_status set to ABORTED and crcs adjusted for zeros. + * Once it's written, we are done patching up for the revoke. 
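The revoke path relies on crc32c being resumable: the crc of the bytes already sent can be extended with the crc of the zero fill, which is what prepare_zero_front() and prepare_zero_middle() below do via crc32c_zeros(). A dependency-free bitwise sketch demonstrating the property (slow, illustration only; seeded with -1 and without a final xor, matching how this file calls crc32c()):

#include <stddef.h>
#include <stdint.h>

static uint32_t sk_crc32c(uint32_t crc, const uint8_t *p, size_t len)
{
	int bit;

	while (len--) {
		crc ^= *p++;
		for (bit = 0; bit < 8; bit++)	/* reflected Castagnoli poly */
			crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
	}
	return crc;
}

/* streaming == one-shot:
 *   sk_crc32c(-1, buf, 8) ==
 *   sk_crc32c(sk_crc32c(-1, buf, 3), buf + 3, 5) */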
+ */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, true); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void finish_message(struct ceph_connection *con) +{ + dout("%s con %p msg %p\n", __func__, con, con->out_msg); + + /* we end up here both plain and secure modes */ + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + /* message may have been revoked */ + if (con->out_msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + + con->v2.out_state = OUT_S_GET_NEXT; +} + +static int populate_out_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d out_state %d\n", __func__, con, con->state, + con->v2.out_state); + WARN_ON(iov_iter_count(&con->v2.out_iter)); + + if (con->state != CEPH_CON_S_OPEN) { + WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX || + con->state > CEPH_CON_S_V2_SESSION_RECONNECT); + goto nothing_pending; + } + + switch (con->v2.out_state) { + case OUT_S_QUEUE_DATA: + WARN_ON(!con->out_msg); + queue_data(con); + goto populated; + case OUT_S_QUEUE_DATA_CONT: + WARN_ON(!con->out_msg); + queue_data_cont(con); + goto populated; + case OUT_S_QUEUE_ENC_PAGE: + queue_enc_page(con); + goto populated; + case OUT_S_QUEUE_ZEROS: + WARN_ON(con->out_msg); /* revoked */ + queue_zeros(con); + goto populated; + case OUT_S_FINISH_MESSAGE: + finish_message(con); + break; + case OUT_S_GET_NEXT: + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + return -EINVAL; + } + + WARN_ON(con->v2.out_state != OUT_S_GET_NEXT); + if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) { + ret = prepare_keepalive2(con); + if (ret) { + pr_err("prepare_keepalive2 failed: %d\n", ret); + return ret; + } + } else if (!list_empty(&con->out_queue)) { + ceph_con_get_out_msg(con); + ret = prepare_message(con); + if (ret) { + pr_err("prepare_message failed: %d\n", ret); + return ret; + } + } else if (con->in_seq > con->in_seq_acked) { + ret = prepare_ack(con); + if (ret) { + pr_err("prepare_ack failed: %d\n", ret); + return ret; + } + } else { + goto nothing_pending; + } + +populated: + if (WARN_ON(!iov_iter_count(&con->v2.out_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.out_iter)); + return 1; + +nothing_pending: + WARN_ON(iov_iter_count(&con->v2.out_iter)); + dout("%s con %p nothing pending\n", __func__, con); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +int ceph_con_v2_try_write(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d have %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.out_iter)); + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2); + + /* + * Always bump global_seq. Bump connect_seq only if + * there is a session (i.e. we are reconnecting and will + * send session_reconnect instead of client_ident). 
+ */ + con->v2.global_seq = ceph_get_global_seq(con->msgr, 0); + if (con->v2.server_cookie) + con->v2.connect_seq++; + + ret = prepare_read_banner_prefix(con); + if (ret) { + pr_err("prepare_read_banner_prefix failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + reset_out_kvecs(con); + ret = prepare_banner(con); + if (ret) { + pr_err("prepare_banner failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + ret = ceph_tcp_connect(con); + if (ret) { + pr_err("ceph_tcp_connect failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + } + + if (!iov_iter_count(&con->v2.out_iter)) { + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + return ret; + } + } + + tcp_sock_set_cork(con->sock->sk, true); + for (;;) { + ret = ceph_tcp_send(con); + if (ret <= 0) + break; + + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + break; + } + } + + tcp_sock_set_cork(con->sock->sk, false); + return ret; +} + +static u32 crc32c_zeros(u32 crc, int zero_len) +{ + int len; + + while (zero_len) { + len = min(zero_len, (int)PAGE_SIZE); + crc = crc32c(crc, page_address(ceph_zero_page), len); + zero_len -= len; + } + + return crc; +} + +static void prepare_zero_front(struct ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > front_len(con->out_msg)); + sent = front_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.front_crc = + crc32c(-1, con->out_msg->front.iov_base, sent); + con->v2.out_epil.front_crc = + crc32c_zeros(con->v2.out_epil.front_crc, resid); + } else { + con->v2.out_epil.front_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} + +static void prepare_zero_middle(struct ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > middle_len(con->out_msg)); + sent = middle_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.middle_crc = + crc32c(-1, con->out_msg->middle->vec.iov_base, sent); + con->v2.out_epil.middle_crc = + crc32c_zeros(con->v2.out_epil.middle_crc, resid); + } else { + con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} + +static void prepare_zero_data(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg)); + out_zero_add(con, data_len(con->out_msg)); +} + +static void revoke_at_queue_data(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + boundary = front_len(con->out_msg) + middle_len(con->out_msg); + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg); + if (resid 
> boundary) { + resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + prepare_zero_data(con); + queue_zeros(con); +} + +static void revoke_at_queue_data_cont(struct ceph_connection *con) +{ + int sent, resid; /* current piece of data */ + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + WARN_ON(!resid || resid > con->v2.out_bvec.bv_len); + sent = con->v2.out_bvec.bv_len - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, sent); + ceph_msg_data_advance(&con->v2.out_cursor, sent); + } + WARN_ON(resid > con->v2.out_cursor.total_resid); + con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc, + con->v2.out_cursor.total_resid); + + con->v2.out_iter.count -= resid; + out_zero_add(con, con->v2.out_cursor.total_resid); + queue_zeros(con); +} + +static void revoke_at_finish_message(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + if (!front_len(con->out_msg) && !middle_len(con->out_msg) && + !data_len(con->out_msg)) { + WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head (empty message) - noop\n", + __func__, con); + return; + } + + boundary = front_len(con->out_msg) + middle_len(con->out_msg) + + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + boundary = CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending epilogue - noop\n", __func__, con); +} + +void ceph_con_v2_revoke(struct ceph_connection *con) +{ + WARN_ON(con->v2.out_zero); + + if (con_secure(con)) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE && + con->v2.out_state != OUT_S_FINISH_MESSAGE); + dout("%s con %p secure - noop\n", __func__, con); + return; + } + + switch (con->v2.out_state) { + case OUT_S_QUEUE_DATA: + revoke_at_queue_data(con); + break; + case OUT_S_QUEUE_DATA_CONT: + revoke_at_queue_data_cont(con); + break; + case OUT_S_FINISH_MESSAGE: + 
revoke_at_finish_message(con); + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + break; + } +} + +static void revoke_at_prepare_read_data(struct ceph_connection *con) +{ + int remaining; /* data + [data padding] + epilogue */ + int resid; + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + if (con_secure(con)) + remaining = padded_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p resid %d remaining %d\n", __func__, con, resid, + remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) +{ + int recved, resid; /* current piece of data */ + int remaining; /* [data padding] + epilogue */ + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid || resid > con->v2.in_bvec.bv_len); + recved = con->v2.in_bvec.bv_len - resid; + dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid); + + if (recved) + ceph_msg_data_advance(&con->v2.in_cursor, recved); + WARN_ON(resid > con->v2.in_cursor.total_resid); + + if (con_secure(con)) + remaining = padding_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p total_resid %zu remaining %d\n", __func__, con, + con->v2.in_cursor.total_resid, remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, con->v2.in_cursor.total_resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_handle_epilogue(struct ceph_connection *con) +{ + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + dout("%s con %p resid %d\n", __func__, con, resid); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +void ceph_con_v2_revoke_incoming(struct ceph_connection *con) +{ + switch (con->v2.in_state) { + case IN_S_PREPARE_READ_DATA: + revoke_at_prepare_read_data(con); + break; + case IN_S_PREPARE_READ_DATA_CONT: + revoke_at_prepare_read_data_cont(con); + break; + case IN_S_HANDLE_EPILOGUE: + revoke_at_handle_epilogue(con); + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + break; + } +} + +bool ceph_con_v2_opened(struct ceph_connection *con) +{ + return con->v2.peer_global_seq; +} + +void ceph_con_v2_reset_session(struct ceph_connection *con) +{ + con->v2.client_cookie = 0; + con->v2.server_cookie = 0; + con->v2.global_seq = 0; + con->v2.connect_seq = 0; + con->v2.peer_global_seq = 0; +} + +void ceph_con_v2_reset_protocol(struct ceph_connection *con) +{ + iov_iter_truncate(&con->v2.in_iter, 0); + iov_iter_truncate(&con->v2.out_iter, 0); + con->v2.out_zero = 0; + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + + con->v2.con_mode = CEPH_CON_MODE_UNKNOWN; + + if (con->v2.hmac_tfm) { + crypto_free_shash(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + } + if (con->v2.gcm_req) { + aead_request_free(con->v2.gcm_req); + con->v2.gcm_req = 
NULL; + } + if (con->v2.gcm_tfm) { + crypto_free_aead(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + } +} diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 792a8c4164d7..b9d54ed9f338 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -257,10 +257,16 @@ static void __open_session(struct ceph_mon_client *monc) &monc->monmap->mon_inst[monc->cur_mon].addr); /* - * send an initial keepalive to ensure our timestamp is valid - * by the time we are in an OPENED state + * Queue a keepalive to ensure that in case of an early fault + * the messenger doesn't put us into STANDBY state and instead + * retries. This also ensures that our timestamp is valid by + * the time we finish hunting and delayed_work() checks it. */ ceph_con_keepalive(&monc->con); + if (ceph_msgr2(monc->client)) { + monc->pending_auth = 1; + return; + } /* initiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, @@ -543,7 +549,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, p = msg->front.iov_base; end = p + msg->front.iov_len; - monmap = ceph_monmap_decode(&p, end, false); + monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client)); if (IS_ERR(monmap)) { pr_err("problem decoding monmap, %d\n", (int)PTR_ERR(monmap)); @@ -1119,8 +1125,9 @@ static void delayed_work(struct work_struct *work) */ static int build_initial_monmap(struct ceph_mon_client *monc) { + __le32 my_type = ceph_msgr2(monc->client) ? + CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY; struct ceph_options *opt = monc->client->options; - struct ceph_entity_addr *mon_addr = opt->mon_addr; int num_mon = opt->num_mon; int i; @@ -1129,12 +1136,16 @@ static int build_initial_monmap(struct ceph_mon_client *monc) GFP_KERNEL); if (!monc->monmap) return -ENOMEM; + for (i = 0; i < num_mon; i++) { - monc->monmap->mon_inst[i].addr = mon_addr[i]; - monc->monmap->mon_inst[i].addr.nonce = 0; - monc->monmap->mon_inst[i].name.type = - CEPH_ENTITY_TYPE_MON; - monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); + struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i]; + + memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr, + sizeof(inst->addr.in_addr)); + inst->addr.type = my_type; + inst->addr.nonce = 0; + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); } monc->monmap->num_mon = num_mon; return 0; @@ -1337,6 +1348,88 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_validate_auth); +static int mon_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_get_request(monc->auth, buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len, + buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int 
*con_secret_len) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + int ret; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ret = ceph_auth_handle_reply_done(monc->auth, global_id, + reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + finish_auth(monc, ret, was_authed); + if (!ret) + finish_hunting(monc); + mutex_unlock(&monc->mutex); + return 0; +} + +static int mon_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ceph_auth_handle_bad_method(monc->auth, used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt); + finish_auth(monc, -EACCES, was_authed); + mutex_unlock(&monc->mutex); + return 0; +} + /* * handle incoming message */ @@ -1487,4 +1580,8 @@ static const struct ceph_connection_operations mon_con_ops = { .dispatch = dispatch, .fault = mon_fault, .alloc_msg = mon_alloc_msg, + .get_auth_request = mon_get_auth_request, + .handle_auth_reply_more = mon_handle_auth_reply_more, + .handle_auth_done = mon_handle_auth_done, + .handle_auth_bad_method = mon_handle_auth_bad_method, }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 51be5a7482fc..662b52e52651 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3918,9 +3918,11 @@ static int handle_one_map(struct ceph_osd_client *osdc, set_pool_was_full(osdc); if (incremental) - newmap = osdmap_apply_incremental(&p, end, false, osdc->osdmap); + newmap = osdmap_apply_incremental(&p, end, + ceph_msgr2(osdc->client), + osdc->osdmap); else - newmap = ceph_osdmap_decode(&p, end, false); + newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client)); if (IS_ERR(newmap)) return PTR_ERR(newmap); @@ -5575,6 +5577,7 @@ static void put_osd_con(struct ceph_connection *con) /* * authentication */ + /* * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. 
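/*
 * A reader's sketch, not part of the patch: the msgr2 core is expected
 * to drive the four new auth hooks in a fixed order --
 * ->get_auth_request() once to build the initial payload,
 * ->handle_auth_reply_more() for each intermediate server reply, then
 * exactly one of ->handle_auth_done() on success or
 * ->handle_auth_bad_method() on rejection.  Roughly:
 *
 *	ret = ops->get_auth_request(con, buf, &buf_len, &az, &az_len);
 *	while (the server replies "more")
 *		ret = ops->handle_auth_reply_more(con, reply, reply_len,
 *						  buf, &buf_len,
 *						  &az, &az_len);
 *	if (the server replies "done")
 *		ret = ops->handle_auth_done(con, global_id,
 *					    reply, reply_len,
 *					    skey, &skey_len,
 *					    csecret, &csecret_len);
 *	else
 *		ret = ops->handle_auth_bad_method(con, proto, result,
 *						  allowed_protos, proto_cnt,
 *						  allowed_modes, mode_cnt);
 */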
@@ -5640,6 +5643,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } +static int osd_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int osd_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_osd *o = con->private; + struct ceph_mon_client *monc = &o->o_osdc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static void osd_reencode_message(struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); @@ -5677,4 +5754,8 @@ static const struct ceph_connection_operations osd_con_ops = { .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, .fault = osd_fault, + .get_auth_request = osd_get_auth_request, + .handle_auth_reply_more = osd_handle_auth_reply_more, + .handle_auth_done = osd_handle_auth_done, + .handle_auth_bad_method = osd_handle_auth_bad_method, }; -- cgit From 2f0df6cfa325d7106b8a65bc0e02db1086e3f73b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 19 Nov 2020 20:00:10 +0100 Subject: libceph: drop ceph_auth_{create,update}_authorizer() Signed-off-by: Ilya Dryomov --- include/linux/ceph/auth.h | 6 ------ net/ceph/auth.c | 28 ---------------------------- 2 files changed, 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 3fbe72ebd779..71b5d481c653 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -126,13 +126,7 @@ int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, struct ceph_auth_handshake *auth, int peer_type, bool force_new, int *proto, int *pref_mode, int *fallb_mode); -extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct 
ceph_auth_handshake *auth); void ceph_auth_destroy_authorizer(struct ceph_authorizer *a); -extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a); int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 6b315c8212b1..eb261aa5fe18 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -326,40 +326,12 @@ out: } EXPORT_SYMBOL(__ceph_auth_get_authorizer); -int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *auth) -{ - int ret = 0; - - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->create_authorizer) - ret = ac->ops->create_authorizer(ac, peer_type, auth); - mutex_unlock(&ac->mutex); - return ret; -} -EXPORT_SYMBOL(ceph_auth_create_authorizer); - void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) { a->destroy(a); } EXPORT_SYMBOL(ceph_auth_destroy_authorizer); -int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a) -{ - int ret = 0; - - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->update_authorizer) - ret = ac->ops->update_authorizer(ac, peer_type, a); - mutex_unlock(&ac->mutex); - return ret; -} -EXPORT_SYMBOL(ceph_auth_update_authorizer); - int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, -- cgit From be98e05a67f05ff4c8349a51fcec993a28be718c Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 4 Dec 2020 21:02:42 +0100 Subject: dma-buf: Fix kerneldoc formatting I wanted to look up something and noticed the hyperlink doesn't work. While fixing that also noticed a trivial kerneldoc comment typo in the same section, fix that too. Reviewed-by: Michael J. Ruhl Reviewed-by: Simon Ser Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20201204200242.2671481-1-daniel.vetter@ffwll.ch --- Documentation/driver-api/dma-buf.rst | 2 +- include/linux/dma-buf-map.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst index d6b2a195dbed..a2133d69872c 100644 --- a/Documentation/driver-api/dma-buf.rst +++ b/Documentation/driver-api/dma-buf.rst @@ -190,7 +190,7 @@ DMA Fence uABI/Sync File Indefinite DMA Fences ~~~~~~~~~~~~~~~~~~~~~ -At various times &dma_fence with an indefinite time until dma_fence_wait() +At various times struct dma_fence with an indefinite time until dma_fence_wait() finishes have been proposed. Examples include: * Future fences, used in HWC1 to signal when a buffer isn't used by the display diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h index 583a3a1f9447..278d489e4bdd 100644 --- a/include/linux/dma-buf-map.h +++ b/include/linux/dma-buf-map.h @@ -122,7 +122,7 @@ struct dma_buf_map { /** * DMA_BUF_MAP_INIT_VADDR - Initializes struct dma_buf_map to an address in system memory - * @vaddr: A system-memory address + * @vaddr_: A system-memory address */ #define DMA_BUF_MAP_INIT_VADDR(vaddr_) \ { \ -- cgit From a313357e704f2617f298333e3e617a38b1719760 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:37 +0100 Subject: genirq: Move irq_has_action() into core code This function uses irq_to_desc() and is going to be used by modules to replace the open coded irq_to_desc() (ab)usage. 
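As a sketch of that replacement (hypothetical driver code, not part of this patch), a module that used to dereference irq_to_desc(irq)->action can simply do:

#include <linux/interrupt.h>

/* hypothetical module helper: is our IRQ currently requested? */
static bool mydrv_irq_in_use(unsigned int irq)
{
	/*
	 * irq_has_action() snapshots desc->action under rcu_read_lock()
	 * inside the core, so no irq_to_desc() access is needed here.
	 */
	return irq_has_action(irq);
}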
The final goal is to remove the export of irq_to_desc() so drivers cannot fiddle with it anymore. Move it into the core code and fix up the usage sites to include the proper header. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.548936472@linutronix.de --- arch/alpha/kernel/sys_jensen.c | 2 +- arch/x86/kernel/topology.c | 1 + include/linux/interrupt.h | 1 + include/linux/irqdesc.h | 7 +------ kernel/irq/manage.c | 17 +++++++++++++++++ 5 files changed, 21 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index 0a2ab6cb18db..e5d870ff225f 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -7,7 +7,7 @@ * * Code supporting the Jensen. */ - +#include #include #include #include diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 0a2ec801b63f..f5477eab5692 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -25,6 +25,7 @@ * * Send feedback to */ +#include #include #include #include diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 870b3251e174..bb8ff9083e7d 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -232,6 +232,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); # define local_irq_enable_in_hardirq() local_irq_enable() #endif +bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 5745491303e0..385a4fafe631 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -179,12 +179,7 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, /* Test to see if a driver has successfully requested an irq */ static inline int irq_desc_has_action(struct irq_desc *desc) { - return desc->action != NULL; -} - -static inline int irq_has_action(unsigned int irq) -{ - return irq_desc_has_action(irq_to_desc(irq)); + return desc && desc->action != NULL; } /** diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c826ba4141fe..a5a1cde5c1a2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2822,3 +2822,20 @@ out_unlock: return err; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); + +/** + * irq_has_action - Check whether an interrupt is requested + * @irq: The linux irq number + * + * Returns: A snapshot of the current state + */ +bool irq_has_action(unsigned int irq) +{ + bool res; + + rcu_read_lock(); + res = irq_desc_has_action(irq_to_desc(irq)); + rcu_read_unlock(); + return res; +} +EXPORT_SYMBOL_GPL(irq_has_action); -- cgit From fdd029630434b434b127efc7fba337da28f45658 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:38 +0100 Subject: genirq: Move status flag checks to core These checks are used by modules and prevent the removal of the export of irq_to_desc(). Move the accessor into the core.
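For illustration, a minimal sketch (hypothetical driver code) of what call sites look like once the status checks below are routed through irq_check_status_bit():

#include <linux/irqdesc.h>

/* hypothetical: skip affinity tuning for per-CPU or pinned interrupts */
static bool mydrv_may_move_irq(unsigned int irq)
{
	/* both helpers now resolve the descriptor inside the core */
	return !irq_balancing_disabled(irq) && !irq_is_percpu(irq);
}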
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.703779349@linutronix.de --- include/linux/irqdesc.h | 17 +++++------------ kernel/irq/manage.c | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 385a4fafe631..308d7db8991f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -223,28 +223,21 @@ irq_set_chip_handler_name_locked(struct irq_data *data, struct irq_chip *chip, data->chip = chip; } +bool irq_check_status_bit(unsigned int irq, unsigned int bitmask); + static inline bool irq_balancing_disabled(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_NO_BALANCING_MASK; + return irq_check_status_bit(irq, IRQ_NO_BALANCING_MASK); } static inline bool irq_is_percpu(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_PER_CPU; + return irq_check_status_bit(irq, IRQ_PER_CPU); } static inline bool irq_is_percpu_devid(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_PER_CPU_DEVID; + return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); } static inline void diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a5a1cde5c1a2..ab8567f32501 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2839,3 +2839,23 @@ bool irq_has_action(unsigned int irq) return res; } EXPORT_SYMBOL_GPL(irq_has_action); + +/** + * irq_check_status_bit - Check whether bits in the irq descriptor status are set + * @irq: The linux irq number + * @bitmask: The bitmask to evaluate + * + * Returns: True if one of the bits in @bitmask is set + */ +bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) +{ + struct irq_desc *desc; + bool res = false; + + rcu_read_lock(); + desc = irq_to_desc(irq); + if (desc) + res = !!(desc->status_use_accessors & bitmask); + rcu_read_unlock(); + return res; +} -- cgit From f1c6306c0d6b50844ba02c8a53e35405e9c0db05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:39 +0100 Subject: genirq: Move irq_set_lockdep_class() to core irq_set_lockdep_class() is used from modules and requires irq_to_desc() to be exported. Move it into the core code which lifts another requirement for the export. 
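A minimal usage sketch (hypothetical irqchip driver, mirroring how callers such as gpiolib use this API): the static keys exist unconditionally, and the wrapper compiles to nothing when CONFIG_LOCKDEP is off.

#include <linux/irqdesc.h>
#include <linux/lockdep.h>

/* hypothetical: one lock class per irqchip to avoid false positives */
static struct lock_class_key mychip_irq_lock_class;
static struct lock_class_key mychip_irq_request_class;

static void mychip_setup_irq(unsigned int irq)
{
	irq_set_lockdep_class(irq, &mychip_irq_lock_class,
			      &mychip_irq_request_class);
}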
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.860029489@linutronix.de --- include/linux/irqdesc.h | 10 ++++------ kernel/irq/irqdesc.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 308d7db8991f..4a1d016716f4 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -240,16 +240,14 @@ static inline bool irq_is_percpu_devid(unsigned int irq) return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); } +void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class); static inline void irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { - struct irq_desc *desc = irq_to_desc(irq); - - if (desc) { - lockdep_set_class(&desc->lock, lock_class); - lockdep_set_class(&desc->request_mutex, request_class); - } + if (IS_ENABLED(CONFIG_LOCKDEP)) + __irq_set_lockdep_class(irq, lock_class, request_class); } #endif diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index e810eb9906ea..20a54fa7cd30 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -968,3 +968,17 @@ unsigned int kstat_irqs_usr(unsigned int irq) rcu_read_unlock(); return sum; } + +#ifdef CONFIG_LOCKDEP +void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + lockdep_set_class(&desc->lock, lock_class); + lockdep_set_class(&desc->request_mutex, request_class); + } +} +EXPORT_SYMBOL_GPL(__irq_set_lockdep_class); +#endif -- cgit From 3e2380123fb96987ce958f623207010c667ffa7c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:40 +0100 Subject: genirq: Provide irq_get_effective_affinity() Provide an accessor to the effective interrupt affinity mask. Going to be used to replace open coded fiddling with the irq descriptor. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.967177918@linutronix.de --- include/linux/irq.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c332871d59da..4aeb1c4c7e07 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -906,6 +906,13 @@ struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) } #endif +static inline struct cpumask *irq_get_effective_affinity_mask(unsigned int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + + return d ? irq_data_get_effective_affinity_mask(d) : NULL; +} + unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, -- cgit From 26c19d0a8610fb233b31730fe26a31145f2d9796 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:43 +0100 Subject: genirq: Make kstat_irqs() static No more users outside the core code. 
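Out-of-core users are expected to go through kstat_irqs_usr() instead; a sketch (hypothetical procfs/debugfs code):

#include <linux/kernel_stat.h>

/* hypothetical: total interrupt count for one IRQ, callable anywhere */
static unsigned int mydrv_irq_total(unsigned int irq)
{
	/* kstat_irqs_usr() handles the RCU protection internally */
	return kstat_irqs_usr(irq);
}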
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194043.268774449@linutronix.de --- include/linux/kernel_stat.h | 1 - kernel/irq/irqdesc.c | 19 ++++++------------- 2 files changed, 6 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 89f0745c096d..44ae1a7eb9e3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -67,7 +67,6 @@ static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) /* * Number of interrupts per specific IRQ source, since bootup */ -extern unsigned int kstat_irqs(unsigned int irq); extern unsigned int kstat_irqs_usr(unsigned int irq); /* diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 02b446a21ce6..2eb076f4a566 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -924,15 +924,7 @@ static bool irq_is_nmi(struct irq_desc *desc) return desc->istate & IRQS_NMI; } -/** - * kstat_irqs - Get the statistics for an interrupt - * @irq: The interrupt number - * - * Returns the sum of interrupt counts on all cpus since boot for - * @irq. The caller must ensure that the interrupt is not removed - * concurrently. - */ -unsigned int kstat_irqs(unsigned int irq) +static unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned int sum = 0; @@ -951,13 +943,14 @@ unsigned int kstat_irqs(unsigned int irq) } /** - * kstat_irqs_usr - Get the statistics for an interrupt + * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number * * Returns the sum of interrupt counts on all cpus since boot for @irq. - * Contrary to kstat_irqs() this can be called from any context. - * It uses rcu since a concurrent removal of an interrupt descriptor is - * observing an rcu grace period before delayed_free_desc()/irq_kobj_release(). + * + * It uses rcu to protect the access since a concurrent removal of an + * interrupt descriptor is observing an rcu grace period before + * delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { -- cgit From 501e2db67fa4264b517de5c7934e94cca89b3a1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:44 +0100 Subject: genirq: Provide kstat_irqdesc_cpu() Most users of kstat_irqs_cpu() have the irq descriptor already. No point in calling into the core code and looking it up once more. Use it in per_cpu_count_show() to start with. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194043.362094758@linutronix.de --- include/linux/irqdesc.h | 6 ++++++ kernel/irq/irqdesc.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 4a1d016716f4..891b323266df 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -113,6 +113,12 @@ static inline void irq_unlock_sparse(void) { } extern struct irq_desc irq_desc[NR_IRQS]; #endif +static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc, + unsigned int cpu) +{ + return desc->kstat_irqs ? 
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0; +} + static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) { return container_of(data->common, struct irq_desc, irq_common_data); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2eb076f4a566..f509c4db2029 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -147,12 +147,12 @@ static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - int cpu, irq = desc->irq_data.irq; ssize_t ret = 0; char *p = ""; + int cpu; for_each_possible_cpu(cpu) { - unsigned int c = kstat_irqs_cpu(irq, cpu); + unsigned int c = irq_desc_kstat_cpu(desc, cpu); ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); p = ","; -- cgit From ee2cc4276ba4909438f5894a218877660e1536d9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 14 Dec 2020 21:08:00 +0100 Subject: cpufreq: Add special-purpose fast-switching callback for drivers First off, some cpufreq drivers (e.g. intel_pstate) can pass hints beyond the current target frequency to the hardware and there are no provisions for doing that in the cpufreq framework. In particular, today the driver has to assume that it should not allow the frequency to fall below the one requested by the governor (or the required capacity may not be provided) which may not be the case and which may lead to excessive energy usage in some scenarios. Second, the hints passed by these drivers to the hardware need not be in terms of the frequency, so representing the utilization numbers coming from the scheduler as frequency before passing them to those drivers is not really useful. Address the two points above by adding a special-purpose replacement for the ->fast_switch callback, called ->adjust_perf, allowing the governor to pass abstract performance level (rather than frequency) values for the minimum (required) and target (desired) performance along with the CPU capacity to compare them to. Also update the schedutil governor to use the new callback instead of ->fast_switch if present and if the utilization metrics are frequency-invariant (that is requisite for the direct mapping between the utilization and the CPU performance levels to be a reasonable approximation). Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 40 +++++++++++++++++++++++ include/linux/cpufreq.h | 14 +++++++++ include/linux/sched/cpufreq.h | 5 +++ kernel/sched/cpufreq_schedutil.c | 68 ++++++++++++++++++++++++++++++++++------ 4 files changed, 117 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c17aa2973c44..d0a3525ce27f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2097,6 +2097,46 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); +/** + * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go. + * @cpu: Target CPU. + * @min_perf: Minimum (required) performance level (units of @capacity). + * @target_perf: Target (desired) performance level (units of @capacity). + * @capacity: Capacity of the target CPU. + * + * Carry out a fast performance level switch of @cpu without sleeping.
+ * + * The driver's ->adjust_perf() callback invoked by this function must be + * suitable for being called from within RCU-sched read-side critical sections + * and it is expected to select a suitable performance level equal to or above + * @min_perf and preferably equal to or below @target_perf. + * + * This function must not be called if policy->fast_switch_enabled is unset. + * + * Governors calling this function must guarantee that it will never be invoked + * twice in parallel for the same CPU and that it will never be called in + * parallel with either ->target() or ->target_index() or ->fast_switch() for + * the same CPU. + */ +void cpufreq_driver_adjust_perf(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity) +{ + cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity); +} + +/** + * cpufreq_driver_has_adjust_perf - Check "direct fast switch" callback. + * + * Return 'true' if the ->adjust_perf callback is present for the + * current driver or 'false' otherwise. + */ +bool cpufreq_driver_has_adjust_perf(void) +{ + return !!cpufreq_driver->adjust_perf; +} + /* Must set freqs->new to intermediate frequency */ static int __target_intermediate(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, int index) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 584fccd4fcab..9c8b7437b6cd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -320,6 +320,15 @@ struct cpufreq_driver { unsigned int index); unsigned int (*fast_switch)(struct cpufreq_policy *policy, unsigned int target_freq); + /* + * ->fast_switch() replacement for drivers that use an internal + * representation of performance levels and can pass hints other than + * the target performance level to the hardware. 
+ */ + void (*adjust_perf)(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity); /* * Caches and returns the lowest driver-supported frequency greater than @@ -588,6 +597,11 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); +void cpufreq_driver_adjust_perf(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity); +bool cpufreq_driver_has_adjust_perf(void); int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation); diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 3ed5aa18593f..6205578ab6ee 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -28,6 +28,11 @@ static inline unsigned long map_util_freq(unsigned long util, { return (freq + (freq >> 2)) * util / cap; } + +static inline unsigned long map_util_perf(unsigned long util) +{ + return util + (util >> 2); +} #endif /* CONFIG_CPU_FREQ */ #endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 319a270d13c1..803bcb30db27 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -432,13 +432,10 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_p sg_policy->limits_changed = true; } -static void sugov_update_single(struct update_util_data *hook, u64 time, - unsigned int flags) +static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, + u64 time, unsigned int flags) { - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned int cached_freq = sg_policy->cached_raw_freq; - unsigned int next_f; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -446,11 +443,25 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, ignore_dl_rate_limit(sg_cpu, sg_policy); if (!sugov_should_update_freq(sg_policy, time)) - return; + return false; sugov_get_util(sg_cpu); sugov_iowait_apply(sg_cpu, time); + return true; +} + +static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned int next_f; + + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); /* * Do not reduce the frequency if the CPU has not been idle @@ -477,6 +488,38 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, } } +static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + unsigned long prev_util = sg_cpu->util; + + /* + * Fall back to the "frequency" path if frequency invariance is not + * supported, because the direct mapping between the utilization and + * the performance levels depends on the frequency invariance. 
+ */ + if (!arch_scale_freq_invariant()) { + sugov_update_single_freq(hook, time, flags); + return; + } + + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + + /* + * Do not reduce the target performance level if the CPU has not been + * idle recently, as the reduction is likely to be premature then. + */ + if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), + map_util_perf(sg_cpu->util), sg_cpu->max); + + sg_cpu->sg_policy->last_freq_update_time = time; +} + static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; @@ -815,6 +858,7 @@ static void sugov_exit(struct cpufreq_policy *policy) static int sugov_start(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; + void (*uu)(struct update_util_data *data, u64 time, unsigned int flags); unsigned int cpu; sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; @@ -834,13 +878,17 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; } + if (policy_is_shared(policy)) + uu = sugov_update_shared; + else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) + uu = sugov_update_single_perf; + else + uu = sugov_update_single_freq; + for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - policy_is_shared(policy) ? - sugov_update_shared : - sugov_update_single); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); } return 0; } -- cgit From f0dbd2bd1c22c6670e83ddcd46a9beb8b575e86d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 14 Dec 2020 19:03:55 -0800 Subject: mm: slab: provide krealloc_array() When allocating an array of elements, users should check for multiplication overflow or preferably use one of the provided helpers like: kmalloc_array(). There's no krealloc_array() counterpart but there are many users who use regular krealloc() to reallocate arrays. Let's provide an actual krealloc_array() implementation. While at it: add some documentation regarding krealloc. Link: https://lkml.kernel.org/r/20201109110654.12547-3-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski Acked-by: Vlastimil Babka Cc: Alexander Shishkin Cc: Andy Shevchenko Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian König Cc: Christoph Lameter Cc: Daniel Vetter Cc: Daniel Vetter Cc: David Airlie Cc: David Rientjes Cc: Gustavo Padovan Cc: James Morse Cc: Jaroslav Kysela Cc: Jason Wang Cc: Joonsoo Kim Cc: Linus Walleij Cc: Maarten Lankhorst Cc: Mauro Carvalho Chehab Cc: Maxime Ripard Cc: "Michael S. Tsirkin" Cc: Pekka Enberg Cc: Robert Richter Cc: Sumit Semwal Cc: Takashi Iwai Cc: Takashi Iwai Cc: Thomas Zimmermann Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/memory-allocation.rst | 4 ++++ include/linux/slab.h | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst index 4446a1ac36cc..5954ddf6ee13 100644 --- a/Documentation/core-api/memory-allocation.rst +++ b/Documentation/core-api/memory-allocation.rst @@ -147,6 +147,10 @@ The address of a chunk allocated with `kmalloc` is aligned to at least ARCH_KMALLOC_MINALIGN bytes.
For sizes which are a power of two, the alignment is also guaranteed to be at least the respective size. +Chunks allocated with kmalloc() can be resized with krealloc(). Similarly +to kmalloc_array(): a helper for resizing arrays is provided in the form of +krealloc_array(). + For large allocations you can use vmalloc() and vzalloc(), or directly request pages from the page allocator. The memory allocated by `vmalloc` and related functions is not physically contiguous. diff --git a/include/linux/slab.h b/include/linux/slab.h index dd6897f62010..be4ba5867ac5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -592,6 +592,24 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) return __kmalloc(bytes, flags); } +/** + * krealloc_array - reallocate memory for an array. + * @p: pointer to the memory chunk to reallocate + * @new_n: new number of elements to alloc + * @new_size: new size of a single member of the array + * @flags: the type of memory to allocate (see kmalloc) + */ +static __must_check inline void * +krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) + return NULL; + + return krealloc(p, bytes, flags); +} + /** * kcalloc - allocate memory for an array. The memory is set to zero. * @n: number of elements. -- cgit From 7fb7ab6d618a4dc7ea3f3eafc92388a35b4f8894 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Mon, 14 Dec 2020 19:04:46 -0800 Subject: mm: fix page_owner initializing issue for arm32 Page owner of pages used by page owner itself is missing on arm32 targets. The reason is dummy_handle and failure_handle are not initialized correctly. Buddy allocator is used to initialize these two handles. However, buddy allocator is not ready when page owner calls it. This change fixes that by initializing page owner after buddy initialization. The working flows before and after this change are: original logic: 1. allocated memory for page_ext (using memblock). 2. invoke the init callback of page_ext_ops like page_owner (using buddy allocator). 3. initialize buddy. after this change: 1. allocated memory for page_ext (using memblock). 2. initialize buddy. 3. invoke the init callback of page_ext_ops like page_owner (using buddy allocator).
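Looking back at the krealloc_array() helper added in the previous entry, a typical call site might be sketched like this (hypothetical names; on failure the old buffer is left intact for the caller to free):

#include <linux/slab.h>
#include <linux/string.h>

/* hypothetical: grow a u32 array with overflow-checked sizing */
static u32 *grow_vals(u32 *vals, size_t old_n, size_t new_n)
{
	u32 *tmp;

	tmp = krealloc_array(vals, new_n, sizeof(*vals), GFP_KERNEL);
	if (!tmp)
		return NULL;

	/* zero only the newly added tail */
	memset(tmp + old_n, 0, (new_n - old_n) * sizeof(*tmp));
	return tmp;
}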
with the change, failure/dummy_handle can get its correct value and page owner output for example has the one for page owner itself: Page allocated via order 2, mask 0x6202c0(GFP_USER|__GFP_NOWARN), pid 1006, ts 67278156558 ns PFN 543776 type Unmovable Block 531 type Unmovable Flags 0x0() init_page_owner+0x28/0x2f8 invoke_init_callbacks_flatmem+0x24/0x34 start_kernel+0x33c/0x5d8 Link: https://lkml.kernel.org/r/1603104925-5888-1-git-send-email-zhenhuah@codeaurora.org Signed-off-by: Zhenhua Huang Acked-by: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 8 ++++++++ init/main.c | 2 ++ mm/page_ext.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index cfce186f0c4e..aff81ba31bd8 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -44,8 +44,12 @@ static inline void page_ext_init_flatmem(void) { } extern void page_ext_init(void); +static inline void page_ext_init_flatmem_late(void) +{ +} #else extern void page_ext_init_flatmem(void); +extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } @@ -76,6 +80,10 @@ static inline void page_ext_init(void) { } +static inline void page_ext_init_flatmem_late(void) +{ +} + static inline void page_ext_init_flatmem(void) { } diff --git a/init/main.c b/init/main.c index 32b2a8affafd..82ae0d1345a0 100644 --- a/init/main.c +++ b/init/main.c @@ -828,6 +828,8 @@ static void __init mm_init(void) init_debug_pagealloc(); report_meminit(); mem_init(); + /* page_owner must be initialized after buddy is ready */ + page_ext_init_flatmem_late(); kmem_cache_init(); kmemleak_init(); pgtable_init(); diff --git a/mm/page_ext.c b/mm/page_ext.c index a3616f7a0e9e..16b161f28a31 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -99,12 +99,19 @@ static void __init invoke_init_callbacks(void) } } +#ifndef CONFIG_SPARSEMEM +void __init page_ext_init_flatmem_late(void) +{ + invoke_init_callbacks(); +} +#endif + static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } -#if !defined(CONFIG_SPARSEMEM) +#ifndef CONFIG_SPARSEMEM void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) @@ -177,7 +184,6 @@ void __init page_ext_init_flatmem(void) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); - invoke_init_callbacks(); return; fail: -- cgit From 57efa1fe5957694fa541c9062de0a127f0b9acb0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Dec 2020 19:05:44 -0800 Subject: mm/gup: prevent gup_fast from racing with COW during fork Since commit 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") pages under a FOLL_PIN will not be write protected during COW for fork. This means that pages returned from pin_user_pages(FOLL_WRITE) should not become write protected while the pin is active. 
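For reference while reading the race description below: reduced to a skeleton (a sketch, not the patch itself, with hypothetical helper names), the seqcount pattern the fix introduces looks like this.

#include <linux/seqlock.h>
#include <linux/mm_types.h>

/* writer side (fork path), mmap_lock already held for write */
static void cow_protect_begin(struct mm_struct *mm)
{
	raw_write_seqcount_begin(&mm->write_protect_seq);
	/* ... write-protect the source PTEs ... */
}

static void cow_protect_end(struct mm_struct *mm)
{
	raw_write_seqcount_end(&mm->write_protect_seq);
}

/* reader side (gup_fast with FOLL_PIN): true means "retry via slow GUP" */
static bool cow_race_detected(struct mm_struct *mm)
{
	unsigned int seq = raw_read_seqcount(&mm->write_protect_seq);

	if (seq & 1)	/* a writer is in progress right now */
		return true;
	/* ... walk page tables and pin pages ... */
	return read_seqcount_retry(&mm->write_protect_seq, seq);
}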
However, there is a small race where get_user_pages_fast(FOLL_PIN) can establish a FOLL_PIN at the same time copy_present_page() is write protecting it:

  CPU 0                                 CPU 1
   get_user_pages_fast()
    internal_get_user_pages_fast()
                                        copy_page_range()
                                          pte_alloc_map_lock()
                                            copy_present_page()
                                              atomic_read(has_pinned) == 0
                                              page_maybe_dma_pinned() == false
    atomic_set(has_pinned, 1);
    gup_pgd_range()
     gup_pte_range()
      pte_t pte = gup_get_pte(ptep)
      pte_access_permitted(pte)
      try_grab_compound_head()
                                              pte = pte_wrprotect(pte)
                                              set_pte_at();
                                          pte_unmap_unlock()
    // GUP now returns with a write protected page

The first attempt to resolve this by using the write protect caused problems (and was missing a barrier), see commit f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Instead wrap copy_p4d_range() with the write side of a seqcount and check the read side around gup_pgd_range(). If there is a collision then get_user_pages_fast() fails and falls back to slow GUP. Slow GUP is safe against this race because copy_page_range() is only called while holding the exclusive side of the mmap_lock on the src mm_struct. [akpm@linux-foundation.org: coding style fixes] Link: https://lore.kernel.org/r/CAHk-=wi=iCnYCARbPGjkVJu9eyYeZ13N64tZYLdOB8CP5Q_PLw@mail.gmail.com Link: https://lkml.kernel.org/r/2-v4-908497cf359a+4782-gup_fork_jgg@nvidia.com Fixes: f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Signed-off-by: Jason Gunthorpe Suggested-by: Linus Torvalds Reviewed-by: John Hubbard Reviewed-by: Jan Kara Reviewed-by: Peter Xu Acked-by: "Ahmed S. Darwish" [seqcount_t parts] Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Jann Horn Cc: Kirill Shutemov Cc: Kirill Tkhai Cc: Leon Romanovsky Cc: Michal Hocko Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/tboot.c | 1 + drivers/firmware/efi/efi.c | 1 + include/linux/mm_types.h | 8 ++++++++ kernel/fork.c | 1 + mm/gup.c | 18 ++++++++++++++++++ mm/init-mm.c | 1 + mm/memory.c | 13 ++++++++++++- 7 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index ae64f98ec2ab..4c09ba110204 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -93,6 +93,7 @@ static struct mm_struct tboot_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 6c6eec044a97..df3f9bcab581 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -57,6 +57,7 @@ struct mm_struct efi_mm = { .mm_rb = RB_ROOT, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(efi_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5a9238f6caad..915f4f100383 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -446,6 +447,13 @@ struct mm_struct { */ atomic_t has_pinned; + /** + * @write_protect_seq: Locked when any thread is write + * protecting pages mapped by this mm to enforce a later COW, + * for
instance during page table copying for fork(). + */ + seqcount_t write_protect_seq; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..dc55f68a6ee3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1007,6 +1007,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); + seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; diff --git a/mm/gup.c b/mm/gup.c index c7e24301860a..9c6a2f5001c5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2684,11 +2684,18 @@ static unsigned long lockless_pages_from_mm(unsigned long start, { unsigned long flags; int nr_pinned = 0; + unsigned seq; if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || !gup_fast_permitted(start, end)) return 0; + if (gup_flags & FOLL_PIN) { + seq = raw_read_seqcount(¤t->mm->write_protect_seq); + if (seq & 1) + return 0; + } + /* * Disable interrupts. The nested form is used, in order to allow full, * general purpose use of this routine. @@ -2703,6 +2710,17 @@ static unsigned long lockless_pages_from_mm(unsigned long start, local_irq_save(flags); gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); + + /* + * When pinning pages for DMA there could be a concurrent write protect + * from fork() via copy_page_range(), in this case always fail fast GUP. + */ + if (gup_flags & FOLL_PIN) { + if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) { + unpin_user_pages(pages, nr_pinned); + return 0; + } + } return nr_pinned; } diff --git a/mm/init-mm.c b/mm/init-mm.c index 3a613c85f9ed..153162669f80 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -31,6 +31,7 @@ struct mm_struct init_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), diff --git a/mm/memory.c b/mm/memory.c index c48f8df6e502..50632c4366b8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1171,6 +1171,15 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); + /* + * Disabling preemption is not needed for the write side, as + * the read side doesn't spin, but goes to the mmap_lock. + * + * Use the raw variant of the seqcount_t write API to avoid + * lockdep complaining about preemptibility. + */ + mmap_assert_write_locked(src_mm); + raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; @@ -1187,8 +1196,10 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow) + if (is_cow) { + raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); + } return ret; } -- cgit From 52650c8b466bac399aec213c61d74bfe6f7af1a4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Dec 2020 19:05:48 -0800 Subject: mm/gup: remove the vma allocation from gup_longterm_locked() Long ago there wasn't a FOLL_LONGTERM flag so this DAX check was done by post-processing the VMA list. 
These days it is trivial to just check each VMA to see if it is DAX before processing it inside __get_user_pages() and return failure if a DAX VMA is encountered with FOLL_LONGTERM. Removing the allocation of the VMA list is a significant speed up for many call sites. Add an IS_ENABLED to vma_is_fsdax so that code generation is unchanged when DAX is compiled out. Remove the dummy version of __gup_longterm_locked() as !CONFIG_CMA already makes memalloc_nocma_save(), check_and_migrate_cma_pages(), and memalloc_nocma_restore() into a NOP. Link: https://lkml.kernel.org/r/0-v1-5551df3ed12e+b8-gup_dax_speedup_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: Ira Weiny Cc: Dan Williams Cc: John Hubbard Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- mm/gup.c | 83 ++++++++++-------------------------------------------- 2 files changed, 16 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8667d0cdc71e..1fcc2b00582b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3230,7 +3230,7 @@ static inline bool vma_is_fsdax(struct vm_area_struct *vma) { struct inode *inode; - if (!vma->vm_file) + if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file) return false; if (!vma_is_dax(vma)) return false; diff --git a/mm/gup.c b/mm/gup.c index 9c6a2f5001c5..311a44ff41ff 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -923,6 +923,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) return -EFAULT; + if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) + return -EOPNOTSUPP; + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) @@ -1060,10 +1063,14 @@ static long __get_user_pages(struct mm_struct *mm, goto next_page; } - if (!vma || check_vma_flags(vma, gup_flags)) { + if (!vma) { ret = -EFAULT; goto out; } + ret = check_vma_flags(vma, gup_flags); + if (ret) + goto out; + if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, @@ -1567,26 +1574,6 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ -#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) -static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - long i; - struct vm_area_struct *vma_prev = NULL; - - for (i = 0; i < nr_pages; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma == vma_prev) - continue; - - vma_prev = vma; - - if (vma_is_fsdax(vma)) - return true; - } - return false; -} - #ifdef CONFIG_CMA static long check_and_migrate_cma_pages(struct mm_struct *mm, unsigned long start, @@ -1705,63 +1692,23 @@ static long __gup_longterm_locked(struct mm_struct *mm, struct vm_area_struct **vmas, unsigned int gup_flags) { - struct vm_area_struct **vmas_tmp = vmas; unsigned long flags = 0; - long rc, i; + long rc; - if (gup_flags & FOLL_LONGTERM) { - if (!pages) - return -EINVAL; - - if (!vmas_tmp) { - vmas_tmp = kcalloc(nr_pages, - sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas_tmp) - return -ENOMEM; - } + if (gup_flags & FOLL_LONGTERM) flags = memalloc_nocma_save(); - } - rc = __get_user_pages_locked(mm, start, nr_pages, pages, - vmas_tmp, NULL, gup_flags); + rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, + gup_flags); if (gup_flags & FOLL_LONGTERM) { - if (rc < 0) - goto out; - - if (check_dax_vmas(vmas_tmp, rc)) { - if (gup_flags & FOLL_PIN) - unpin_user_pages(pages, rc); - else - for (i = 
0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; - } - - rc = check_and_migrate_cma_pages(mm, start, rc, pages, - vmas_tmp, gup_flags); -out: + if (rc > 0) + rc = check_and_migrate_cma_pages(mm, start, rc, pages, + vmas, gup_flags); memalloc_nocma_restore(flags); } - - if (vmas_tmp != vmas) - kfree(vmas_tmp); return rc; } -#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ -static __always_inline long __gup_longterm_locked(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int flags) -{ - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, flags); -} -#endif /* CONFIG_FS_DAX || CONFIG_CMA */ static bool is_valid_gup_flags(unsigned int gup_flags) { -- cgit From 462680946b6d982afdda3bf5f7de3c379cb8c97b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 14 Dec 2020 19:06:11 -0800 Subject: mm: remove pagevec_lookup_range_nr_tag() With the merge of commit 2e1692966034 ("ceph: have ceph_writepages_start call pagevec_lookup_range_tag"), nothing calls this anymore. Link: https://lkml.kernel.org/r/20201021193926.101474-1-jlayton@kernel.org Signed-off-by: Jeff Layton Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 3 --- mm/swap.c | 9 --------- 2 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 081d934eda64..ad4ddc17d403 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -43,9 +43,6 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag); -unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, xa_mark_t tag) { diff --git a/mm/swap.c b/mm/swap.c index 29220174433b..16a525296960 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1167,15 +1167,6 @@ unsigned pagevec_lookup_range_tag(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range_tag); -unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag, unsigned max_pages) -{ - pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, - min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); - return pagevec_count(pvec); -} -EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); /* * Perform any setup for the swap system */ -- cgit From 30e6a51dbb0594d79dc2a9543659c1d596e2f7d4 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Mon, 14 Dec 2020 19:06:14 -0800 Subject: mm/shmem.c: make shmem_mapping() inline shmem_mapping() isn't worth an out-of-line call from any callsite. So make it inline by - make shmem_aops global - export shmem_aops - inline the shmem_mapping() and replace the direct call 'shmem_aops' with shmem_mapping() in shmem.c. 
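With shmem_aops exported, the test reduces to an inline pointer comparison at every call site; sketched from a hypothetical caller:

#include <linux/shmem_fs.h>

/* hypothetical: branch on whether a mapping is shmem-backed */
static bool mydrv_mapping_is_shmem(struct address_space *mapping)
{
	/* now a single compare against &shmem_aops, no call into shmem.c */
	return shmem_mapping(mapping);
}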
Link: https://lkml.kernel.org/r/20201115165207.GA265355@rlk Signed-off-by: Hui Su Acked-by: Vlastimil Babka Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 6 +++++- mm/shmem.c | 16 ++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a5a5d1d4d7b1..d82b6f396588 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -67,7 +67,11 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); #ifdef CONFIG_SHMEM -extern bool shmem_mapping(struct address_space *mapping); +extern const struct address_space_operations shmem_aops; +static inline bool shmem_mapping(struct address_space *mapping) +{ + return mapping->a_ops == &shmem_aops; +} #else static inline bool shmem_mapping(struct address_space *mapping) { diff --git a/mm/shmem.c b/mm/shmem.c index 537c137698f8..b7361fce50bc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -246,7 +246,7 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) } static const struct super_operations shmem_ops; -static const struct address_space_operations shmem_aops; +const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; @@ -1152,7 +1152,7 @@ static void shmem_evict_inode(struct inode *inode) struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (inode->i_mapping->a_ops == &shmem_aops) { + if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; shmem_truncate_range(inode, 0, (loff_t)-1); @@ -1858,7 +1858,7 @@ repeat: } /* shmem_symlink() */ - if (mapping->a_ops != &shmem_aops) + if (!shmem_mapping(mapping)) goto alloc_nohuge; if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) goto alloc_nohuge; @@ -2352,11 +2352,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } -bool shmem_mapping(struct address_space *mapping) -{ - return mapping->a_ops == &shmem_aops; -} - static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -3865,7 +3860,7 @@ static void shmem_destroy_inodecache(void) kmem_cache_destroy(shmem_inode_cachep); } -static const struct address_space_operations shmem_aops = { +const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, #ifdef CONFIG_TMPFS @@ -3877,6 +3872,7 @@ static const struct address_space_operations shmem_aops = { #endif .error_remove_page = generic_error_remove_page, }; +EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, @@ -4312,7 +4308,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, struct page *page; int error; - BUG_ON(mapping->a_ops != &shmem_aops); + BUG_ON(!shmem_mapping(mapping)); error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) -- cgit From 1a984c4e8200e0e58bb316f14a4bebb28d32d15a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Dec 2020 19:06:24 -0800 Subject: mm: 
memcontrol: remove unused mod_memcg_obj_state() Since commit 991e7673859e ("mm: memcontrol: account kernel stack per node") there is no user of the mod_memcg_obj_state(). So just remove it. Also rework type of the idx parameter of the mod_objcg_state() from int to enum node_stat_item. Link: https://lkml.kernel.org/r/20201013153504.92602-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Acked-by: David Rientjes Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Christopher Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Yafang Shao Cc: Chris Down Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ mm/memcontrol.c | 11 ----------- mm/slab.h | 4 ++-- 3 files changed, 2 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 922a7f600465..28bf65560084 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -798,8 +798,6 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); -void mod_memcg_obj_state(void *p, int idx, int val); - static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) { @@ -1255,10 +1253,6 @@ static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, mod_node_page_state(page_pgdat(page), idx, val); } -static inline void mod_memcg_obj_state(void *p, int idx, int val) -{ -} - static inline unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3654510fb70..83f2e5f84bdc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -882,17 +882,6 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) rcu_read_unlock(); } -void mod_memcg_obj_state(void *p, int idx, int val) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = mem_cgroup_from_obj(p); - if (memcg) - mod_memcg_state(memcg, idx, val); - rcu_read_unlock(); -} - /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup diff --git a/mm/slab.h b/mm/slab.h index f9977d6613d6..65f4647f1286 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -204,7 +204,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -static inline int cache_vmstat_idx(struct kmem_cache *s) +static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) { return (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; @@ -304,7 +304,7 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, static inline void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - int idx, int nr) + enum node_stat_item idx, int nr) { struct mem_cgroup *memcg; struct lruvec *lruvec; -- cgit From 013339df116c2ee0d796dd8bfb8f293a2030c063 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 14 Dec 2020 19:06:39 -0800 Subject: mm/rmap: always do TTU_IGNORE_ACCESS Since commit 369ea8242c0f ("mm/rmap: update to new mmu_notifier semantic v2"), the code to check the secondary MMU's page table access bit is broken for !(TTU_IGNORE_ACCESS) because the page is unmapped from the secondary MMU's page table before the check. 
More specifically, this affects those secondary MMUs which unmap the memory in mmu_notifier_invalidate_range_start(), like kvm. However, memory reclaim is the only user of !(TTU_IGNORE_ACCESS) or the absence of TTU_IGNORE_ACCESS and it explicitly performs the page table access check before trying to unmap the page. So, at worst the reclaim will miss accesses in a very short window if we remove the page table access check from the unmapping code. There is an unintended consequence of !(TTU_IGNORE_ACCESS) for memcg reclaim. In memcg reclaim, page_referenced() only accounts accesses from processes in the same memcg as the target page, but the unmapping code considers accesses from all processes, decreasing the effectiveness of memcg reclaim. The simplest solution is to always assume TTU_IGNORE_ACCESS in unmapping code. Link: https://lkml.kernel.org/r/20201104231928.1494083-1-shakeelb@google.com Fixes: 369ea8242c0f ("mm/rmap: update to new mmu_notifier semantic v2") Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Jerome Glisse Cc: Vlastimil Babka Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 - mm/huge_memory.c | 2 +- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/migrate.c | 8 +++----- mm/rmap.c | 9 --------- mm/vmscan.c | 14 +++++--------- 7 files changed, 11 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3a6adfa70fb0..70085ca1a3fc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -91,7 +91,6 @@ enum ttu_flags { TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ - TTU_IGNORE_ACCESS = 0x10, /* don't age */ TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 42b18d461086..c11c80078335 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2321,7 +2321,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; bool unmap_success; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5d880d4eb9a2..71295bb984af 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -989,7 +989,7 @@ static int get_hwpoison_page(struct page *page) static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int flags, struct page **hpagep) { - enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_IGNORE_MLOCK; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success = true; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 63b2e46b6555..0f855deea4b2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1304,7 +1304,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (WARN_ON(PageLRU(page))) isolate_lru_page(page); if (page_mapped(page)) - try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS); + try_to_unmap(page, TTU_IGNORE_MLOCK); continue; } diff --git a/mm/migrate.c b/mm/migrate.c index 5795cb82e27c..8ea0c65f1075 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1122,8 +1122,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, /* Establish migration
ptes */ VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); - try_to_unmap(page, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK); page_was_mapped = 1; } @@ -1329,8 +1328,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_mapped(hpage)) { bool mapping_locked = false; - enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK| - TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK; if (!PageAnon(hpage)) { /* @@ -2688,7 +2686,7 @@ static void migrate_vma_prepare(struct migrate_vma *migrate) */ static void migrate_vma_unmap(struct migrate_vma *migrate) { - int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK; const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; unsigned long addr, i, restore = 0; diff --git a/mm/rmap.c b/mm/rmap.c index 31b29321adfe..6657000b18d4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1533,15 +1533,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, goto discard; } - if (!(flags & TTU_IGNORE_ACCESS)) { - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - ret = false; - page_vma_mapped_walk_done(&pvmw); - break; - } - } - /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); if (should_defer_flush(mm, flags)) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 7b4e31eac2cf..0ec6321e9887 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1072,7 +1072,6 @@ static void page_check_dirty_writeback(struct page *page, static unsigned int shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, struct scan_control *sc, - enum ttu_flags ttu_flags, struct reclaim_stat *stat, bool ignore_references) { @@ -1297,7 +1296,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. 
*/ if (page_mapped(page)) { - enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = PageSwapBacked(page); if (unlikely(PageTransHuge(page))) @@ -1514,7 +1513,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, } nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, &stat, true); + &stat, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); @@ -1958,8 +1957,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, - &stat, false); + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); spin_lock_irq(&pgdat->lru_lock); @@ -2131,8 +2129,7 @@ unsigned long reclaim_pages(struct list_head *page_list) nr_reclaimed += shrink_page_list(&node_page_list, NODE_DATA(nid), - &sc, 0, - &dummy_stat, false); + &sc, &dummy_stat, false); while (!list_empty(&node_page_list)) { page = lru_to_page(&node_page_list); list_del(&page->lru); @@ -2145,8 +2142,7 @@ unsigned long reclaim_pages(struct list_head *page_list) if (!list_empty(&node_page_list)) { nr_reclaimed += shrink_page_list(&node_page_list, NODE_DATA(nid), - &sc, 0, - &dummy_stat, false); + &sc, &dummy_stat, false); while (!list_empty(&node_page_list)) { page = lru_to_page(&node_page_list); list_del(&page->lru); -- cgit From a7cb874bfff785d39de6cc847673539cb3540821 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 14 Dec 2020 19:06:45 -0800 Subject: mm: memcg: fix obsolete code comments This patch fixes/removes some obsolete comments in the code related to the kernel memory accounting: - kmem_cache->memcg_params.memcg_caches has been removed by commit 9855609bde03 ("mm: memcg/slab: use a single set of kmem_caches for all accounted allocations") - memcg->kmemcg_id is not used as a gate for kmem accounting since commit 0b8f73e10428 ("mm: memcontrol: clean up alloc, online, offline, free functions") Link: https://lkml.kernel.org/r/20201110184615.311974-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++---- mm/memcontrol.c | 6 ------ 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 28bf65560084..1f0f64fa3ccd 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -296,7 +296,6 @@ struct mem_cgroup { int tcpmem_pressure; #ifdef CONFIG_MEMCG_KMEM - /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; struct obj_cgroup __rcu *objcg; @@ -1562,9 +1561,8 @@ static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg, } /* - * helper for accessing a memcg's index. It will be used as an index in the - * child cache array in kmem_cache, and also to derive its name. This function - * will return -1 when this is not a kmem-limited memcg. + * A helper for accessing memcg's kmem_id, used for getting + * corresponding LRU lists. 
*/ static inline int memcg_cache_id(struct mem_cgroup *memcg) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 758613277286..466d9aed6cc8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3703,12 +3703,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static_branch_enable(&memcg_kmem_enabled_key); - /* - * A memory cgroup is considered kmem-online as soon as it gets - * kmemcg_id. Setting the id after enabling static branching will - * guarantee no one starts accounting before all call sites are - * patched. - */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; -- cgit From bef8620cd8e0a117c1a0719604052e424eb418f9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 14 Dec 2020 19:06:49 -0800 Subject: mm: memcg: deprecate the non-hierarchical mode Patch series "mm: memcg: deprecate cgroup v1 non-hierarchical mode", v1. The non-hierarchical cgroup v1 mode is a legacy of the early days of the memory controller and doesn't bring any value today. However, it complicates the code and creates many edge cases all over the memory controller code. It's a good time to deprecate it completely. This patchset removes the internal logic, adjusts the user interface and updates the documentation. The last patch removes some bits of the cgroup core code, which become obsolete. Michal Hocko said: "All that we know today is that we have a warning in place to complain loudly when somebody relies on use_hierarchy=0 with a deeper hierarchy. For all those years we have seen _zero_ reports that would describe a sensible usecase. Moreover we (SUSE) have backported this warning into old distribution kernels (since 3.0 based kernels) to extend the coverage and didn't hear even for users who adopt new kernels only very slowly. The only report we have seen so far was a LTP test suite which doesn't really reflect any real life usecase" This patch (of 3): The non-hierarchical cgroup v1 mode is a legacy of the early days of the memory controller and doesn't bring any value today. However, it complicates the code and creates many edge cases all over the memory controller code. It's a good time to deprecate it completely. Functionally, this patch enables hierarchical mode by default for all cgroups and forbids switching it off. Nothing changes if cgroup v2 is used: there, hierarchical mode has been enforced from the beginning. To protect the ABI, the memory.use_hierarchy interface is preserved with limited functionality: reading always returns "1", writing of "1" passes silently, writing of any other value fails with -EINVAL and a warning to dmesg (on the first occasion). Link: https://lkml.kernel.org/r/20201110220800.929549-1-guro@fb.com Link: https://lkml.kernel.org/r/20201110220800.929549-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ---- kernel/cgroup/cgroup.c | 5 --- mm/memcontrol.c | 90 +++++++--------------------------------- 3 files changed, 13 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1f0f64fa3ccd..dd992b81bcb7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -234,11 +234,6 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - /* - * Should the accounting and control be hierarchical, per subtree?
- */ - bool use_hierarchy; - /* * Should the OOM killer kill all belonging tasks, had it kill one? */ @@ -588,8 +583,6 @@ static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, { if (root == memcg) return true; - if (!root->use_hierarchy) - return false; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e41c21819ba0..80c5c34416e8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -281,9 +281,6 @@ bool cgroup_ssid_enabled(int ssid) * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * - * - memcg: use_hierarchy is on by default and the cgroup file for the flag - * is not created. - * * - blkcg: blk-throttle becomes properly hierarchical. * * - debug: disallowed on the default hierarchy. @@ -5156,8 +5153,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, cgroup_parent(parent)) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); ss->warned_broken_hierarchy = true; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 466d9aed6cc8..fc18a2b7d25a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1141,12 +1141,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && !reclaim) pos = prev; - if (!root->use_hierarchy && root != root_mem_cgroup) { - if (prev) - goto out; - return root; - } - rcu_read_lock(); if (reclaim) { @@ -1226,7 +1220,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, out_unlock: rcu_read_unlock(); -out: if (prev && prev != root) css_put(&prev->css); @@ -3461,10 +3454,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, } /* - * Test whether @memcg has children, dead or alive. Note that this - * function doesn't care whether @memcg has use_hierarchy enabled and - * returns %true if there are child csses according to the cgroup - * hierarchy. Testing use_hierarchy is the caller's responsibility. + * Test whether @memcg has children, dead or alive. */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { @@ -3524,37 +3514,20 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_css(css)->use_hierarchy; + return 1; } static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - int retval = 0; - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); - - if (memcg->use_hierarchy == val) + if (val == 1) return 0; - /* - * If parent's use_hierarchy is set, we can't make any modifications - * in the child subtrees. If it is unset, then the change can - * occur, provided the current cgroup has no children. - * - * For the root cgroup, parent_mem is NULL, we allow value to be - * set if there are no children. - */ - if ((!parent_memcg || !parent_memcg->use_hierarchy) && - (val == 1 || val == 0)) { - if (!memcg_has_children(memcg)) - memcg->use_hierarchy = val; - else - retval = -EBUSY; - } else - retval = -EINVAL; + pr_warn_once("Non-hierarchical mode is deprecated. 
" + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); - return retval; + return -EINVAL; } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) @@ -3742,8 +3715,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) child = mem_cgroup_from_css(css); BUG_ON(child->kmemcg_id != kmemcg_id); child->kmemcg_id = parent->kmemcg_id; - if (!memcg->use_hierarchy) - break; } rcu_read_unlock(); @@ -5334,38 +5305,22 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; - } - if (!parent) { - page_counter_init(&memcg->memory, NULL); - page_counter_init(&memcg->swap, NULL); - page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcpmem, NULL); - } else if (parent->use_hierarchy) { - memcg->use_hierarchy = true; + page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { - page_counter_init(&memcg->memory, &root_mem_cgroup->memory); - page_counter_init(&memcg->swap, &root_mem_cgroup->swap); - page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); - page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem); - /* - * Deeper hierachy with use_hierarchy == false doesn't make - * much sense so let cgroup subsystem know about this - * unfortunate state in our controller. - */ - if (parent != root_mem_cgroup) - memory_cgrp_subsys.broken_hierarchy = true; - } + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->swap, NULL); + page_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->tcpmem, NULL); - /* The following stuff does not apply to the root */ - if (!parent) { root_mem_cgroup = memcg; return &memcg->css; } + /* The following stuff does not apply to the root */ error = memcg_online_kmem(memcg); if (error) goto fail; @@ -6202,24 +6157,6 @@ static void mem_cgroup_move_task(void) } #endif -/* - * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify whether we're attached to the default hierarchy on each mount - * attempt. - */ -static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) -{ - /* - * use_hierarchy is forced on the default hierarchy. cgroup core - * guarantees that @root doesn't have any children, so turning it - * on for the root memcg is enough. - */ - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - root_mem_cgroup->use_hierarchy = true; - else - root_mem_cgroup->use_hierarchy = false; -} - static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6557,7 +6494,6 @@ struct cgroup_subsys memory_cgrp_subsys = { .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, - .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, -- cgit From 9d9d341df4d519d96e7927941d91f5785c5cea07 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 14 Dec 2020 19:06:55 -0800 Subject: cgroup: remove obsoleted broken_hierarchy and warned_broken_hierarchy With the deprecation of the non-hierarchical mode of the memory controller there are no more examples of broken hierarchies left. Let's remove the cgroup core code which was supposed to print warnings about creating of broken hierarchies. 
Link: https://lkml.kernel.org/r/20201110220800.929549-4-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup-defs.h | 15 --------------- kernel/cgroup/cgroup.c | 7 ------- 2 files changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index fee0b5547cd0..559ee05f86b2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -668,21 +668,6 @@ struct cgroup_subsys { */ bool threaded:1; - /* - * If %false, this subsystem is properly hierarchical - - * configuration, resource accounting and restriction on a parent - * cgroup cover those of its children. If %true, hierarchy support - * is broken in some ways - some subsystems ignore hierarchy - * completely while others are only implemented half-way. - * - * It's now disallowed to create nested cgroups if the subsystem is - * broken and cgroup core will emit a warning message on such - * cases. Eventually, all subsystems will be made properly - * hierarchical and this will go away. - */ - bool broken_hierarchy:1; - bool warned_broken_hierarchy:1; - /* the following two fields are initialized automtically during boot */ int id; const char *name; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 80c5c34416e8..16f4692dc961 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5149,13 +5149,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, if (err) goto err_list_del; - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - cgroup_parent(parent)) { - pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - ss->warned_broken_hierarchy = true; - } - return css; err_list_del: -- cgit From da3ceeff923e3bc750a8423c840462760c463926 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Dec 2020 19:07:04 -0800 Subject: mm: memcg/slab: rename *_lruvec_slab_state to *_lruvec_kmem_state The *_lruvec_slab_state functions are also suitable for pages allocated from the buddy allocator, not just for slab objects. But the function names suggest that only slab objects are applicable. So rename the slab keyword to kmem.
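To illustrate the renamed interface, here is a hedged sketch modeled on the kernel/fork.c hunk below (the helper itself is hypothetical): the same call works whether the object sits on a slab or came straight from the buddy allocator, which is exactly why "kmem" fits better than "slab".

#include <linux/memcontrol.h>

/* Charge or uncharge a kernel stack against NR_KERNEL_STACK_KB for
 * the stack's node and memcg; @account is +1 on allocation and -1 on
 * free, mirroring account_kernel_stack(). */
static void demo_account_stack(void *stack, int account)
{
        mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
                              account * (THREAD_SIZE / 1024));
}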
Link: https://lkml.kernel.org/r/20201117085249.24319-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 18 +++++++++--------- kernel/fork.c | 2 +- mm/memcontrol.c | 2 +- mm/workingset.c | 8 ++++---- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dd992b81bcb7..71c961452d9a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -788,15 +788,15 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); -static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); - __mod_lruvec_slab_state(p, idx, val); + __mod_lruvec_kmem_state(p, idx, val); local_irq_restore(flags); } @@ -1229,7 +1229,7 @@ static inline void mod_lruvec_page_state(struct page *page, mod_node_page_state(page_pgdat(page), idx, val); } -static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); @@ -1237,7 +1237,7 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, __mod_node_page_state(page_pgdat(page), idx, val); } -static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); @@ -1332,14 +1332,14 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } -static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx) +static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_slab_state(p, idx, 1); + __mod_lruvec_kmem_state(p, idx, 1); } -static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx) +static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_slab_state(p, idx, -1); + __mod_lruvec_kmem_state(p, idx, -1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/kernel/fork.c b/kernel/fork.c index dc55f68a6ee3..5e7cc88eadb5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -385,7 +385,7 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); else - mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, + mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce19e8484c89..b50f7f336b16 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -853,7 +853,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_lruvec_state(lruvec, idx, val); } -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = 
page_pgdat(virt_to_page(p)); struct mem_cgroup *memcg; diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..25f75bbe80e0 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -445,12 +445,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); - __inc_lruvec_slab_state(node, WORKINGSET_NODES); + __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } } @@ -544,7 +544,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); @@ -559,7 +559,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; mapping->nrexceptional -= node->nr_values; xa_delete_node(node, workingset_update_node); - __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM); + __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); -- cgit From c47d5032ed3002311a4188eae51f4641ec436beb Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 14 Dec 2020 19:07:14 -0800 Subject: mm: move lruvec stats update functions to vmstat.h Patch series "memcg: add pagetable consumption to memory.stat", v2. Many workloads consume a significant amount of memory in pagetables. One specific use-case is the user space network driver which mmaps the application memory to provide zero copy transfer. This driver can consume a large amount of memory in page tables. This patch series exposes the pagetable consumption for each memory cgroup. This patch (of 2): This does not change any functionality and only moves the functions which update the lruvec stats to vmstat.h from memcontrol.h. The main reason for this patch is to be able to use these functions in the page table constructor function, which is defined in mm.h, where we cannot include memcontrol.h. Also this is a better place for this interface in general. The lruvec abstraction, while invented for memcg, isn't specific to memcg at all.
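A hedged sketch of what the move buys (the helper and the stat item are chosen only for illustration): code that can only include vmstat.h, such as the page table constructors in mm.h targeted by the follow-up patch, can still update per-lruvec counters, and without CONFIG_MEMCG the same calls fall back to plain per-node accounting.

#include <linux/vmstat.h>

/* Bump a node_stat_item for @page; with CONFIG_MEMCG this lands in
 * the owning memcg's lruvec, otherwise in the node counters alone. */
static void demo_count_page(struct page *page)
{
        inc_lruvec_page_state(page, NR_FILE_PAGES);
}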
Link: https://lkml.kernel.org/r/20201130212541.2781790-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 111 --------------------------------------------- include/linux/vmstat.h | 104 ++++++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 17 +++++++ 3 files changed, 121 insertions(+), 111 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 71c961452d9a..f530d634f055 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -786,8 +786,6 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, - int val); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, @@ -810,43 +808,6 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } -static inline void mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_state(lruvec, idx, val); - local_irq_restore(flags); -} - -static inline void __mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) -{ - struct page *head = compound_head(page); /* rmap on tail pages */ - pg_data_t *pgdat = page_pgdat(page); - struct lruvec *lruvec; - - /* Untracked pages have no memcg, no lruvec. Update only the node */ - if (!head->mem_cgroup) { - __mod_node_page_state(pgdat, idx, val); - return; - } - - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); - __mod_lruvec_state(lruvec, idx, val); -} - -static inline void mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_page_state(page, idx, val); - local_irq_restore(flags); -} - unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); @@ -1205,30 +1166,6 @@ static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, { } -static inline void __mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); -} - -static inline void mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - mod_node_page_state(lruvec_pgdat(lruvec), idx, val); -} - -static inline void __mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) -{ - __mod_node_page_state(page_pgdat(page), idx, val); -} - -static inline void mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) -{ - mod_node_page_state(page_pgdat(page), idx, val); -} - static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { @@ -1308,30 +1245,6 @@ static inline void __dec_memcg_page_state(struct page *page, __mod_memcg_page_state(page, idx, -1); } -static inline void __inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - __mod_lruvec_state(lruvec, idx, 1); -} - -static inline void __dec_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - __mod_lruvec_state(lruvec, idx, -1); -} - -static inline void __inc_lruvec_page_state(struct page *page, - enum 
node_stat_item idx) -{ - __mod_lruvec_page_state(page, idx, 1); -} - -static inline void __dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - __mod_lruvec_page_state(page, idx, -1); -} - static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, 1); @@ -1370,30 +1283,6 @@ static inline void dec_memcg_page_state(struct page *page, mod_memcg_page_state(page, idx, -1); } -static inline void inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, 1); -} - -static inline void dec_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, -1); -} - -static inline void inc_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - mod_lruvec_page_state(page, idx, 1); -} - -static inline void dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - mod_lruvec_page_state(page, idx, -1); -} - static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 322dcbfcc933..773135fc6e19 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -450,4 +450,108 @@ static inline const char *vm_event_name(enum vm_event_item item) } #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ +#ifdef CONFIG_MEMCG + +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val); + +static inline void mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_lruvec_state(lruvec, idx, val); + local_irq_restore(flags); +} + +void __mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val); + +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_lruvec_page_state(page, idx, val); + local_irq_restore(flags); +} + +#else + +static inline void __mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); +} + +static inline void mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); +} + +static inline void __mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + __mod_node_page_state(page_pgdat(page), idx, val); +} + +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + mod_node_page_state(page_pgdat(page), idx, val); +} + +#endif /* CONFIG_MEMCG */ + +static inline void __inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + __mod_lruvec_state(lruvec, idx, 1); +} + +static inline void __dec_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + __mod_lruvec_state(lruvec, idx, -1); +} + +static inline void __inc_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + __mod_lruvec_page_state(page, idx, 1); +} + +static inline void __dec_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + __mod_lruvec_page_state(page, idx, -1); +} + +static inline void inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + mod_lruvec_state(lruvec, idx, 1); +} + +static inline void dec_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + mod_lruvec_state(lruvec, idx, -1); +} + +static inline void 
inc_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + mod_lruvec_page_state(page, idx, 1); +} + +static inline void dec_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + mod_lruvec_page_state(page, idx, -1); +} + #endif /* _LINUX_VMSTAT_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0083b69b5cac..52837d68bbec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -853,6 +853,23 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_lruvec_state(lruvec, idx, val); } +void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, + int val) +{ + struct page *head = compound_head(page); /* rmap on tail pages */ + pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; + + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!head->mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); + return; + } + + lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); + __mod_lruvec_state(lruvec, idx, val); +} + void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = page_pgdat(virt_to_page(p)); -- cgit From f0c0c115fb81940f4dba0644ac2a8a43b39c83f3 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 14 Dec 2020 19:07:17 -0800 Subject: mm: memcontrol: account pagetables per node For many workloads, pagetable consumption is significant and it makes sense to expose it in the memory.stat for the memory cgroups. However at the moment, the pagetables are accounted per-zone. Converting them to per-node and using the right interface will correctly account for the memory cgroups as well. [akpm@linux-foundation.org: export __mod_lruvec_page_state to modules for arch/mips/kvm/] Link: https://lkml.kernel.org/r/20201130212541.2781790-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 3 +++ arch/nds32/mm/mm-nds32.c | 6 +++--- drivers/base/node.c | 2 +- fs/proc/meminfo.c | 2 +- include/linux/mm.h | 8 ++++---- include/linux/mmzone.h | 2 +- mm/memcontrol.c | 2 ++ mm/page_alloc.c | 6 +++--- mm/vmstat.c | 2 +- 9 files changed, 19 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 515bb13084a0..63521cd36ce5 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1274,6 +1274,9 @@ PAGE_SIZE multiple when read back. kernel_stack Amount of memory allocated to kernel stacks. + pagetables + Amount of memory allocated for page tables. + percpu(npn) Amount of memory used for storing per-cpu kernel data structures. 
diff --git a/arch/nds32/mm/mm-nds32.c b/arch/nds32/mm/mm-nds32.c index 55bec50ccc03..f2778f2b39f6 100644 --- a/arch/nds32/mm/mm-nds32.c +++ b/arch/nds32/mm/mm-nds32.c @@ -34,8 +34,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) cpu_dcache_wb_range((unsigned long)new_pgd, (unsigned long)new_pgd + PTRS_PER_PGD * sizeof(pgd_t)); - inc_zone_page_state(virt_to_page((unsigned long *)new_pgd), - NR_PAGETABLE); + inc_lruvec_page_state(virt_to_page((unsigned long *)new_pgd), + NR_PAGETABLE); return new_pgd; } @@ -59,7 +59,7 @@ void pgd_free(struct mm_struct *mm, pgd_t * pgd) pte = pmd_page(*pmd); pmd_clear(pmd); - dec_zone_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE); + dec_lruvec_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE); pte_free(mm, pte); mm_dec_nr_ptes(mm); pmd_free(mm, pmd); diff --git a/drivers/base/node.c b/drivers/base/node.c index 6ffa470e2984..04f71c7bc3f8 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -450,7 +450,7 @@ static ssize_t node_read_meminfo(struct device *dev, #ifdef CONFIG_SHADOW_CALL_STACK nid, node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif - nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), + nid, K(node_page_state(pgdat, NR_PAGETABLE)), nid, 0UL, nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 887a5532e449..d6fc74619625 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -107,7 +107,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_KERNEL_SCS_KB)); #endif show_val_kb(m, "PageTables: ", - global_zone_page_state(NR_PAGETABLE)); + global_node_page_state(NR_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..5bbbf4aeee94 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2203,7 +2203,7 @@ static inline bool pgtable_pte_page_ctor(struct page *page) if (!ptlock_init(page)) return false; __SetPageTable(page); - inc_zone_page_state(page, NR_PAGETABLE); + inc_lruvec_page_state(page, NR_PAGETABLE); return true; } @@ -2211,7 +2211,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) { ptlock_free(page); __ClearPageTable(page); - dec_zone_page_state(page, NR_PAGETABLE); + dec_lruvec_page_state(page, NR_PAGETABLE); } #define pte_offset_map_lock(mm, pmd, address, ptlp) \ @@ -2298,7 +2298,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page) if (!pmd_ptlock_init(page)) return false; __SetPageTable(page); - inc_zone_page_state(page, NR_PAGETABLE); + inc_lruvec_page_state(page, NR_PAGETABLE); return true; } @@ -2306,7 +2306,7 @@ static inline void pgtable_pmd_page_dtor(struct page *page) { pmd_ptlock_free(page); __ClearPageTable(page); - dec_zone_page_state(page, NR_PAGETABLE); + dec_lruvec_page_state(page, NR_PAGETABLE); } /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fb3bf696c05e..cca2a4443c9c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -152,7 +152,6 @@ enum zone_stat_item { NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ - NR_PAGETABLE, /* used for pagetables */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) @@ -207,6 +206,7 @@ enum node_stat_item { #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif + NR_PAGETABLE, /* used for pagetables */ 
NR_VM_NODE_STAT_ITEMS }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 52837d68bbec..b9419a3605eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -869,6 +869,7 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); __mod_lruvec_state(lruvec, idx, val); } +EXPORT_SYMBOL(__mod_lruvec_page_state); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { @@ -1493,6 +1494,7 @@ static struct memory_stat memory_stats[] = { { "anon", PAGE_SIZE, NR_ANON_MAPPED }, { "file", PAGE_SIZE, NR_FILE_PAGES }, { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, + { "pagetables", PAGE_SIZE, NR_PAGETABLE }, { "percpu", 1, MEMCG_PERCPU_B }, { "sock", PAGE_SIZE, MEMCG_SOCK }, { "shmem", PAGE_SIZE, NR_SHMEM }, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa227a479e4..743fb2bccecc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5465,7 +5465,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), - global_zone_page_state(NR_PAGETABLE), + global_node_page_state(NR_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_zone_page_state(NR_FREE_PAGES), free_pcp, @@ -5497,6 +5497,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_SHADOW_CALL_STACK " shadow_call_stack:%lukB" #endif + " pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5522,6 +5523,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_SHADOW_CALL_STACK node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif + K(node_page_state(pgdat, NR_PAGETABLE)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -5553,7 +5555,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" @@ -5574,7 +5575,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), diff --git a/mm/vmstat.c b/mm/vmstat.c index 698bc0bc18d1..da36e3b0aab2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1157,7 +1157,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_page_table_pages", "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", @@ -1215,6 +1214,7 @@ const char * const vmstat_text[] = { #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) "nr_shadow_call_stack", #endif + "nr_page_table_pages", /* enum writeback_stat_item counters */ "nr_dirty_threshold", -- cgit From d3f5ffcacd1528736471bc78f03f06da6c4551cc Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 14 Dec 2020 19:07:45 -0800 Subject: mm: cleanup: remove unused tsk arg from __access_remote_vm Despite a comment that said that page fault accounting would be charged to whatever task_struct* was passed into __access_remote_vm(), the tsk argument was actually unused. Making page fault accounting actually use this task struct is quite a project, so there is no point in keeping the tsk argument. Delete both the comment, and the argument. 
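A hedged sketch of a caller after the signature change (the wrapper and the FOLL_FORCE choice are illustrative; ptrace_access_vm() in the hunk below is the real in-tree user):

#include <linux/mm.h>

/* Read @len bytes at @addr in another process's address space. The
 * caller must already hold a reference on @mm (e.g. via get_task_mm());
 * no task_struct is passed or needed anymore. */
static int demo_peek_remote(struct mm_struct *mm, unsigned long addr,
                            void *buf, int len)
{
        return __access_remote_vm(mm, addr, buf, len, FOLL_FORCE);
}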
[rppt@linux.ibm.com: changelog addition] Link: https://lkml.kernel.org/r/20201026074137.4147787-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Mike Rapoport Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- kernel/ptrace.c | 2 +- mm/memory.c | 11 +++++------ mm/nommu.c | 8 ++++---- 4 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5bbbf4aeee94..1d8e84bf718a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1716,8 +1716,8 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags); +extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 79de1294f8eb..a77d25c641e9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -57,7 +57,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, return 0; } - ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; diff --git a/mm/memory.c b/mm/memory.c index 50632c4366b8..4a42a74a2240 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4885,11 +4885,10 @@ EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* - * Access another process' address space as given in mm. If non-NULL, use the - * given task for page fault accounting. + * Access another process' address space as given in mm. 
*/ -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags) +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) { struct vm_area_struct *vma; void *old_buf = buf; @@ -4966,7 +4965,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -4984,7 +4983,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, if (!mm) return 0; - ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); diff --git a/mm/nommu.c b/mm/nommu.c index 0faf39b32cdb..870fea12823e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1675,8 +1675,8 @@ void filemap_map_pages(struct vm_fault *vmf, } EXPORT_SYMBOL(filemap_map_pages); -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags) +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) { struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; @@ -1722,7 +1722,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -1741,7 +1741,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + len = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return len; -- cgit From 2b5067a8143e34aa3fa57a20fb8a3c40d905f942 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Mon, 14 Dec 2020 19:07:55 -0800 Subject: mm: mmap_lock: add tracepoints around lock acquisition The goal of these tracepoints is to be able to debug lock contention issues. This lock is acquired on most (all?) mmap / munmap / page fault operations, so a multi-threaded process which does a lot of these can experience significant contention. We trace just before we start acquisition, when the acquisition returns (whether it succeeded or not), and when the lock is released (or downgraded). The events are broken out by lock type (read / write). The events are also broken out by memcg path. For container-based workloads, users often think of several processes in a memcg as a single logical "task", so collecting statistics at this level is useful. The end goal is to get latency information. This isn't directly included in the trace events. Instead, users are expected to compute the time between "start locking" and "acquire returned", using e.g. synthetic events or BPF. The benefit we get from this is simpler code. Because we use tracepoint_enabled() to decide whether or not to trace, this patch has effectively no overhead unless tracepoints are enabled at runtime. If tracepoints are enabled, there is a performance impact, but how much depends on exactly what e.g. the BPF program does. 
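The "effectively no overhead" claim rests on the tracepoint_enabled() static-key test from tracepoint-defs.h; here is a hedged sketch of that pattern with hypothetical names (the real wrappers appear in the mmap_lock.h hunk below):

#include <linux/types.h>
#include <linux/tracepoint-defs.h>

struct mm_struct;

DECLARE_TRACEPOINT(demo_event);

void __demo_do_trace(struct mm_struct *mm, bool write);  /* out of line */

/* Fast path: while the tracepoint is disabled, the static key turns
 * this test into a patched-out no-op, so lock calls pay almost nothing. */
static inline void demo_trace(struct mm_struct *mm, bool write)
{
        if (tracepoint_enabled(demo_event))
                __demo_do_trace(mm, write);
}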
[axelrasmussen@google.com: fix use-after-free race and css ref leak in tracepoints] Link: https://lkml.kernel.org/r/20201130233504.3725241-1-axelrasmussen@google.com [axelrasmussen@google.com: v3] Link: https://lkml.kernel.org/r/20201207213358.573750-1-axelrasmussen@google.com [rostedt@goodmis.org: in-depth examples of tracepoint_enabled() usage, and per-cpu-per-context buffer design] Link: https://lkml.kernel.org/r/20201105211739.568279-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Vlastimil Babka Cc: Steven Rostedt Cc: Ingo Molnar Cc: Michel Lespinasse Cc: Daniel Jordan Cc: Jann Horn Cc: Chinwen Chang Cc: Davidlohr Bueso Cc: David Rientjes Cc: Laurent Dufour Cc: Yafang Shao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmap_lock.h | 94 +++++++++++++++- include/trace/events/mmap_lock.h | 107 ++++++++++++++++++ mm/Makefile | 2 +- mm/mmap_lock.c | 230 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 427 insertions(+), 6 deletions(-) create mode 100644 include/trace/events/mmap_lock.h create mode 100644 mm/mmap_lock.c (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 18e7eae9b5ba..0540f0156f58 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -1,11 +1,65 @@ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H +#include +#include #include +#include +#include +#include #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), +DECLARE_TRACEPOINT(mmap_lock_start_locking); +DECLARE_TRACEPOINT(mmap_lock_acquire_returned); +DECLARE_TRACEPOINT(mmap_lock_released); + +#ifdef CONFIG_TRACING + +void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write); +void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, + bool success); +void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write); + +static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, + bool write) +{ + if (tracepoint_enabled(mmap_lock_start_locking)) + __mmap_lock_do_trace_start_locking(mm, write); +} + +static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, + bool write, bool success) +{ + if (tracepoint_enabled(mmap_lock_acquire_returned)) + __mmap_lock_do_trace_acquire_returned(mm, write, success); +} + +static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) +{ + if (tracepoint_enabled(mmap_lock_released)) + __mmap_lock_do_trace_released(mm, write); +} + +#else /* !CONFIG_TRACING */ + +static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, + bool write) +{ +} + +static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, + bool write, bool success) +{ +} + +static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) +{ +} + +#endif /* CONFIG_TRACING */ + static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); @@ -13,57 +67,86 @@ static inline void mmap_init_lock(struct mm_struct *mm) static inline void mmap_write_lock(struct mm_struct *mm) { + __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, true, true); } static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { + __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + __mmap_lock_trace_acquire_returned(mm, true, true); } static inline int mmap_write_lock_killable(struct mm_struct *mm) { - return 
down_write_killable(&mm->mmap_lock); + int ret; + + __mmap_lock_trace_start_locking(mm, true); + ret = down_write_killable(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, true, ret == 0); + return ret; } static inline bool mmap_write_trylock(struct mm_struct *mm) { - return down_write_trylock(&mm->mmap_lock) != 0; + bool ret; + + __mmap_lock_trace_start_locking(mm, true); + ret = down_write_trylock(&mm->mmap_lock) != 0; + __mmap_lock_trace_acquire_returned(mm, true, ret); + return ret; } static inline void mmap_write_unlock(struct mm_struct *mm) { up_write(&mm->mmap_lock); + __mmap_lock_trace_released(mm, true); } static inline void mmap_write_downgrade(struct mm_struct *mm) { downgrade_write(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, false, true); } static inline void mmap_read_lock(struct mm_struct *mm) { + __mmap_lock_trace_start_locking(mm, false); down_read(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, false, true); } static inline int mmap_read_lock_killable(struct mm_struct *mm) { - return down_read_killable(&mm->mmap_lock); + int ret; + + __mmap_lock_trace_start_locking(mm, false); + ret = down_read_killable(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, false, ret == 0); + return ret; } static inline bool mmap_read_trylock(struct mm_struct *mm) { - return down_read_trylock(&mm->mmap_lock) != 0; + bool ret; + + __mmap_lock_trace_start_locking(mm, false); + ret = down_read_trylock(&mm->mmap_lock) != 0; + __mmap_lock_trace_acquire_returned(mm, false, ret); + return ret; } static inline void mmap_read_unlock(struct mm_struct *mm) { up_read(&mm->mmap_lock); + __mmap_lock_trace_released(mm, false); } static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) { - if (down_read_trylock(&mm->mmap_lock)) { + if (mmap_read_trylock(mm)) { rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); return true; } @@ -73,6 +156,7 @@ static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { up_read_non_owner(&mm->mmap_lock); + __mmap_lock_trace_released(mm, false); } static inline void mmap_assert_locked(struct mm_struct *mm) diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h new file mode 100644 index 000000000000..0abff67b96f0 --- /dev/null +++ b/include/trace/events/mmap_lock.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mmap_lock + +#if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MMAP_LOCK_H + +#include +#include + +struct mm_struct; + +extern int trace_mmap_lock_reg(void); +extern void trace_mmap_lock_unreg(void); + +TRACE_EVENT_FN(mmap_lock_start_locking, + + TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), + + TP_ARGS(mm, memcg_path, write), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __string(memcg_path, memcg_path) + __field(bool, write) + ), + + TP_fast_assign( + __entry->mm = mm; + __assign_str(memcg_path, memcg_path); + __entry->write = write; + ), + + TP_printk( + "mm=%p memcg_path=%s write=%s\n", + __entry->mm, + __get_str(memcg_path), + __entry->write ? 
"true" : "false" + ), + + trace_mmap_lock_reg, trace_mmap_lock_unreg +); + +TRACE_EVENT_FN(mmap_lock_acquire_returned, + + TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, + bool success), + + TP_ARGS(mm, memcg_path, write, success), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __string(memcg_path, memcg_path) + __field(bool, write) + __field(bool, success) + ), + + TP_fast_assign( + __entry->mm = mm; + __assign_str(memcg_path, memcg_path); + __entry->write = write; + __entry->success = success; + ), + + TP_printk( + "mm=%p memcg_path=%s write=%s success=%s\n", + __entry->mm, + __get_str(memcg_path), + __entry->write ? "true" : "false", + __entry->success ? "true" : "false" + ), + + trace_mmap_lock_reg, trace_mmap_lock_unreg +); + +TRACE_EVENT_FN(mmap_lock_released, + + TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), + + TP_ARGS(mm, memcg_path, write), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __string(memcg_path, memcg_path) + __field(bool, write) + ), + + TP_fast_assign( + __entry->mm = mm; + __assign_str(memcg_path, memcg_path); + __entry->write = write; + ), + + TP_printk( + "mm=%p memcg_path=%s write=%s\n", + __entry->mm, + __get_str(memcg_path), + __entry->write ? "true" : "false" + ), + + trace_mmap_lock_reg, trace_mmap_lock_unreg +); + +#endif /* _TRACE_MMAP_LOCK_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/Makefile b/mm/Makefile index 069f216e109e..b6cd2fffa492 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - debug.o gup.o $(mmu-y) + debug.o gup.o mmap_lock.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c new file mode 100644 index 000000000000..dcdde4f722a4 --- /dev/null +++ b/mm/mmap_lock.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +#define CREATE_TRACE_POINTS +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); + +#ifdef CONFIG_MEMCG + +/* + * Our various events all share the same buffer (because we don't want or need + * to allocate a set of buffers *per event type*), so we need to protect against + * concurrent _reg() and _unreg() calls, and count how many _reg() calls have + * been made. + */ +static DEFINE_MUTEX(reg_lock); +static int reg_refcount; /* Protected by reg_lock. */ + +/* + * Size of the buffer for memcg path names. Ignoring stack trace support, + * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. + */ +#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL + +/* + * How many contexts our trace events might be called in: normal, softirq, irq, + * and NMI. + */ +#define CONTEXT_COUNT 4 + +static DEFINE_PER_CPU(char __rcu *, memcg_path_buf); +static char **tmp_bufs; +static DEFINE_PER_CPU(int, memcg_path_buf_idx); + +/* Called with reg_lock held. */ +static void free_memcg_path_bufs(void) +{ + int cpu; + char **old = tmp_bufs; + + for_each_possible_cpu(cpu) { + *(old++) = rcu_dereference_protected( + per_cpu(memcg_path_buf, cpu), + lockdep_is_held(®_lock)); + rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL); + } + + /* Wait for inflight memcg_path_buf users to finish. 
*/ + synchronize_rcu(); + + old = tmp_bufs; + for_each_possible_cpu(cpu) { + kfree(*(old++)); + } + + kfree(tmp_bufs); + tmp_bufs = NULL; +} + +int trace_mmap_lock_reg(void) +{ + int cpu; + char *new; + + mutex_lock(®_lock); + + /* If the refcount is going 0->1, proceed with allocating buffers. */ + if (reg_refcount++) + goto out; + + tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs), + GFP_KERNEL); + if (tmp_bufs == NULL) + goto out_fail; + + for_each_possible_cpu(cpu) { + new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); + if (new == NULL) + goto out_fail_free; + rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new); + /* Don't need to wait for inflights, they'd have gotten NULL. */ + } + +out: + mutex_unlock(®_lock); + return 0; + +out_fail_free: + free_memcg_path_bufs(); +out_fail: + /* Since we failed, undo the earlier ref increment. */ + --reg_refcount; + + mutex_unlock(®_lock); + return -ENOMEM; +} + +void trace_mmap_lock_unreg(void) +{ + mutex_lock(®_lock); + + /* If the refcount is going 1->0, proceed with freeing buffers. */ + if (--reg_refcount) + goto out; + + free_memcg_path_bufs(); + +out: + mutex_unlock(®_lock); +} + +static inline char *get_memcg_path_buf(void) +{ + char *buf; + int idx; + + rcu_read_lock(); + buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf)); + if (buf == NULL) { + rcu_read_unlock(); + return NULL; + } + idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) - + MEMCG_PATH_BUF_SIZE; + return &buf[idx]; +} + +static inline void put_memcg_path_buf(void) +{ + this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE); + rcu_read_unlock(); +} + +/* + * Write the given mm_struct's memcg path to a percpu buffer, and return a + * pointer to it. If the path cannot be determined, or no buffer was available + * (because the trace event is being unregistered), NULL is returned. + * + * Note: buffers are allocated per-cpu to avoid locking, so preemption must be + * disabled by the caller before calling us, and re-enabled only after the + * caller is done with the pointer. + * + * The caller must call put_memcg_path_buf() once the buffer is no longer + * needed. This must be done while preemption is still disabled. + */ +static const char *get_mm_memcg_path(struct mm_struct *mm) +{ + char *buf = NULL; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + + if (memcg == NULL) + goto out; + if (unlikely(memcg->css.cgroup == NULL)) + goto out_put; + + buf = get_memcg_path_buf(); + if (buf == NULL) + goto out_put; + + cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE); + +out_put: + css_put(&memcg->css); +out: + return buf; +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + do { \ + const char *memcg_path; \ + preempt_disable(); \ + memcg_path = get_mm_memcg_path(mm); \ + trace_mmap_lock_##type(mm, \ + memcg_path != NULL ? memcg_path : "", \ + ##__VA_ARGS__); \ + if (likely(memcg_path != NULL)) \ + put_memcg_path_buf(); \ + preempt_enable(); \ + } while (0) + +#else /* !CONFIG_MEMCG */ + +int trace_mmap_lock_reg(void) +{ + return 0; +} + +void trace_mmap_lock_unreg(void) +{ +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) + +#endif /* CONFIG_MEMCG */ + +/* + * Trace calls must be in a separate file, as otherwise there's a circular + * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. 
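+ *
+ * (Editor's note, not part of the original patch: in the CONFIG_MEMCG case the
+ * TRACE_MMAP_LOCK_EVENT() macro above expands to the pattern
+ *
+ *	preempt_disable();
+ *	memcg_path = get_mm_memcg_path(mm);
+ *	trace_mmap_lock_<type>(mm, memcg_path != NULL ? memcg_path : "", ...);
+ *	if (likely(memcg_path != NULL))
+ *		put_memcg_path_buf();
+ *	preempt_enable();
+ *
+ * so every buffer taken from the per-cpu ring is returned before preemption
+ * is re-enabled.)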
+ */ + void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) +{ + TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); + +void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, + bool success) +{ + TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); + +void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) +{ + TRACE_MMAP_LOCK_EVENT(released, mm, write); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_released); -- cgit From 0966aeb404e854e3377a10fcd01be46f19055bc6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Dec 2020 19:08:02 -0800 Subject: mm: move free_unref_page to mm/internal.h Code outside mm/ should not be calling free_unref_page(). Also move free_unref_page_list(). Link: https://lkml.kernel.org/r/20201125034655.27687-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 -- mm/internal.h | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c603237e006c..6e479e9c48ce 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -580,8 +580,6 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); -extern void free_unref_page(struct page *page); -extern void free_unref_page_list(struct list_head *list); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); diff --git a/mm/internal.h b/mm/internal.h index c43ccdddb0f6..fd6734be9c85 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -199,6 +199,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void free_unref_page(struct page *page); +extern void free_unref_page_list(struct list_head *list); + extern void zone_pcp_update(struct zone *zone); extern void zone_pcp_reset(struct zone *zone); -- cgit From cd544fd1dc9293c6702fab6effa63dac1cc67e99 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 14 Dec 2020 19:08:13 -0800 Subject: mremap: don't allow MREMAP_DONTUNMAP on special_mappings and aio As the kernel expects to see only one such mapping, any further operations on the VMA copy may be unexpected by the kernel. This may be erring on the safe side, but there doesn't seem to be any expected use-case for it, so restrict it now. Link: https://lkml.kernel.org/r/20201013013416.390574-4-dima@arista.com Fixes: e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") Signed-off-by: Dmitry Safonov Cc: Alexander Viro Cc: Andy Lutomirski Cc: Brian Geffon Cc: Catalin Marinas Cc: Dan Carpenter Cc: Dan Williams Cc: Dave Jiang Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A.
Shutemov" Cc: Mike Kravetz Cc: Minchan Kim Cc: Ralph Campbell Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vishal Verma Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- fs/aio.c | 5 ++++- include/linux/mm.h | 2 +- mm/mmap.c | 6 +++++- mm/mremap.c | 2 +- 5 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 0daf2f1cf7a8..e916646adc69 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) return 0; } -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags) { /* Not supported */ return -EINVAL; diff --git a/fs/aio.c b/fs/aio.c index 6a21d8919409..fd2d68532506 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -324,13 +324,16 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma) +static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; + if (flags & MREMAP_DONTUNMAP) + return -EINVAL; + spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); diff --git a/include/linux/mm.h b/include/linux/mm.h index 1d8e84bf718a..dacc889744b1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -558,7 +558,7 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*split)(struct vm_area_struct * area, unsigned long addr); - int (*mremap)(struct vm_area_struct * area); + int (*mremap)(struct vm_area_struct *area, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); diff --git a/mm/mmap.c b/mm/mmap.c index 5c8b4485860d..8950c20239a0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3405,10 +3405,14 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } -static int special_mapping_mremap(struct vm_area_struct *new_vma) +static int special_mapping_mremap(struct vm_area_struct *new_vma, + unsigned long flags) { struct vm_special_mapping *sm = new_vma->vm_private_data; + if (flags & MREMAP_DONTUNMAP) + return -EINVAL; + if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; diff --git a/mm/mremap.c b/mm/mremap.c index 90ff09c09e84..366b3dea992c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -534,7 +534,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { - err = vma->vm_ops->mremap(new_vma); + err = vma->vm_ops->mremap(new_vma, flags); } if (unlikely(err)) { -- cgit From dd3b614f858d88f33e0cf8b7353e2ad937e71da3 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 14 Dec 2020 19:08:17 -0800 Subject: vm_ops: rename .split() callback to .may_split() Rename the callback to reflect that it's not called *on* or *after* split, but rather some time before the splitting to check if it's possible. 
Link: https://lkml.kernel.org/r/20201013013416.390574-5-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Alexander Viro Cc: Andy Lutomirski Cc: Brian Geffon Cc: Catalin Marinas Cc: Dan Carpenter Cc: Dan Williams Cc: Dave Jiang Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Minchan Kim Cc: Ralph Campbell Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vishal Verma Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 4 ++-- include/linux/mm.h | 3 ++- ipc/shm.c | 8 ++++---- mm/hugetlb.c | 2 +- mm/mmap.c | 4 ++-- 5 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 25e0b84a4296..5da2980bb16b 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -256,7 +256,7 @@ static vm_fault_t dev_dax_fault(struct vm_fault *vmf) return dev_dax_huge_fault(vmf, PE_SIZE_PTE); } -static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr) +static int dev_dax_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *filp = vma->vm_file; struct dev_dax *dev_dax = filp->private_data; @@ -277,7 +277,7 @@ static unsigned long dev_dax_pagesize(struct vm_area_struct *vma) static const struct vm_operations_struct dax_vm_ops = { .fault = dev_dax_fault, .huge_fault = dev_dax_huge_fault, - .split = dev_dax_split, + .may_split = dev_dax_may_split, .pagesize = dev_dax_pagesize, }; diff --git a/include/linux/mm.h b/include/linux/mm.h index dacc889744b1..5d188b8611dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -557,7 +557,8 @@ enum page_entry_size { struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); - int (*split)(struct vm_area_struct * area, unsigned long addr); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, diff --git a/ipc/shm.c b/ipc/shm.c index e25c7c6106bc..febd88daba8c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -434,13 +434,13 @@ static vm_fault_t shm_fault(struct vm_fault *vmf) return sfd->vm_ops->fault(vmf); } -static int shm_split(struct vm_area_struct *vma, unsigned long addr) +static int shm_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - if (sfd->vm_ops->split) - return sfd->vm_ops->split(vma, addr); + if (sfd->vm_ops->may_split) + return sfd->vm_ops->may_split(vma, addr); return 0; } @@ -582,7 +582,7 @@ static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ .fault = shm_fault, - .split = shm_split, + .may_split = shm_may_split, .pagesize = shm_pagesize, #if defined(CONFIG_NUMA) .set_policy = shm_set_policy, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d029d938d26d..4b809cc7a3fd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3673,7 +3673,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, - .split = hugetlb_vm_op_split, + .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, }; diff 
--git a/mm/mmap.c b/mm/mmap.c index 8950c20239a0..886de799e7d2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2731,8 +2731,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - if (vma->vm_ops && vma->vm_ops->split) { - err = vma->vm_ops->split(vma, addr); + if (vma->vm_ops && vma->vm_ops->may_split) { + err = vma->vm_ops->may_split(vma, addr); if (err) return err; } -- cgit From 95d6c701f4ca7c44dc148d664f604541266a2333 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 14 Dec 2020 19:08:34 -0800 Subject: mm: extract might_alloc() debug check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extracted from slab.h, which seems to have the most complete version including the correct might_sleep() check. Roll it out to slob.c. Motivated by a discussion with Paul about possibly changing call_rcu behaviour to allocate memory, but only roughly every 500th call. There are a lot fewer places in the kernel that care about whether allocating memory is allowed or not (due to deadlocks with reclaim code) than places that care whether sleeping is allowed. But debugging these also tends to be a lot harder, so nice descriptive checks could come in handy. I might have some use eventually for annotations in drivers/gpu. Note that unlike fs_reclaim_acquire/release gfpflags_allow_blocking does not consult the PF_MEMALLOC flags. But there is no flag equivalent for GFP_NOWAIT, hence this check can't go wrong due to memalloc_no*_save/restore contexts. Willy is working on a patch series which might change this: https://lore.kernel.org/linux-mm/20200625113122.7540-7-willy@infradead.org/ I think best would be if that updates gfpflags_allow_blocking(), since there's a ton of callers all over the place for that already. Link: https://lkml.kernel.org/r/20201125162532.1299794-3-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Acked-by: Vlastimil Babka Acked-by: Paul E. McKenney Reviewed-by: Jason Gunthorpe Cc: Randy Dunlap Cc: Paul E. McKenney Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Vlastimil Babka Cc: Mathieu Desnoyers Cc: Sebastian Andrzej Siewior Cc: Michel Lespinasse Cc: Daniel Vetter Cc: Waiman Long Cc: Thomas Gleixner Cc: Randy Dunlap Cc: Dave Chinner Cc: Qian Cai Cc: "Matthew Wilcox (Oracle)" Cc: Christian König Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Maarten Lankhorst Cc: Thomas Hellström (Intel) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 16 ++++++++++++++++ mm/slab.h | 5 +---- mm/slob.c | 6 ++---- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index d5ece7a9a403..a11a61b5226f 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -180,6 +180,22 @@ static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif +/** + * might_alloc - Mark possible allocation sites + * @gfp_mask: gfp_t flags that would be used to allocate + * + * Similar to might_sleep() and other annotations, this can be used in functions + * that might allocate, but often don't. Compiles to nothing without + * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp allows blocking. 
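+ *
+ * (Editor's sketch, not part of the original patch -- a typical caller, with
+ * my_cache_alloc() being a made-up name:
+ *
+ *	static void *my_cache_alloc(size_t size, gfp_t gfp)
+ *	{
+ *		might_alloc(gfp);
+ *		...
+ *	}
+ *
+ * so "allocation from a non-blocking context" bugs fire even on fast paths
+ * that happen not to allocate.)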
+ */ +static inline void might_alloc(gfp_t gfp_mask) +{ + fs_reclaim_acquire(gfp_mask); + fs_reclaim_release(gfp_mask); + + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); +} + /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * diff --git a/mm/slab.h b/mm/slab.h index 65f4647f1286..54faf9a623f9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -510,10 +510,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, { flags &= gfp_allowed_mask; - fs_reclaim_acquire(flags); - fs_reclaim_release(flags); - - might_sleep_if(gfpflags_allow_blocking(flags)); + might_alloc(flags); if (should_failslab(s, flags)) return NULL; diff --git a/mm/slob.c b/mm/slob.c index 7cc9805c8091..8d4bfa46247f 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -474,8 +474,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp &= gfp_allowed_mask; - fs_reclaim_acquire(gfp); - fs_reclaim_release(gfp); + might_alloc(gfp); if (size < PAGE_SIZE - minalign) { int align = minalign; @@ -597,8 +596,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags &= gfp_allowed_mask; - fs_reclaim_acquire(flags); - fs_reclaim_release(flags); + might_alloc(flags); if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node, 0); -- cgit From 96e2db456135db0cf2476b6890f1e8b2fdcf21eb Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 14 Dec 2020 19:08:49 -0800 Subject: mm/vmalloc: rework the drain logic The current "lazy drain" model suffers from at least two issues. The first is related to the unsorted list of vmap areas: in order to identify the [min:max] range of areas to be drained, a full list scan is required, which is time consuming if the list is too long. The second is the subsequent merging of all fragments with free space, which is also time consuming because it has to iterate over the entire list holding the outstanding lazy areas. See below the "preemptirqsoff" tracer output that illustrates a high latency of ~24676us. Our workloads, like audio and video, are affected by such long latencies: tracer: preemptirqsoff preemptirqsoff latency trace v1.1.5 on 4.9.186-perf+ -------------------------------------------------------------------- latency: 24676 us, #4/4, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 P:8) ----------------- | task: crtc_commit:112-261 (uid:0 nice:0 policy:1 rt_prio:16) ----------------- => started at: __purge_vmap_area_lazy => ended at: __purge_vmap_area_lazy _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / delay cmd pid ||||| time | caller \ / ||||| \ | / crtc_com-261 1...1 1us*: _raw_spin_lock <-__purge_vmap_area_lazy [...] crtc_com-261 1...1 24675us : _raw_spin_unlock <-__purge_vmap_area_lazy crtc_com-261 1...1 24677us : trace_preempt_on <-__purge_vmap_area_lazy crtc_com-261 1...1 24683us : => free_vmap_area_noflush => remove_vm_area => __vunmap => vfree => drm_property_free_blob => drm_mode_object_unreference => drm_property_unreference_blob => __drm_atomic_helper_crtc_destroy_state => sde_crtc_destroy_state => drm_atomic_state_default_clear => drm_atomic_state_clear => drm_atomic_state_free => complete_commit => _msm_drm_commit_work_cb => kthread_worker_fn => kthread => ret_from_fork To address those two issues we can redesign the purging of the outstanding lazy areas. Instead of queuing vmap areas to a list, we keep them in a separate rb-tree, so that an area is located in the tree/list in ascending order (a short sketch follows below).
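Editor's sketch (purge_flush_range() is a hypothetical helper name, not in the patch): with the outstanding areas kept address-sorted, deriving the TLB flush range no longer needs a full scan:

	/*
	 * Minimal sketch: the flush range of an address-sorted purge list
	 * is just the first entry's start and the last entry's end.
	 */
	static void purge_flush_range(struct list_head *head,
				      unsigned long *start, unsigned long *end)
	{
		*start = min(*start, list_first_entry(head,
					struct vmap_area, list)->va_start);
		*end = max(*end, list_last_entry(head,
					struct vmap_area, list)->va_end);
	}

This is the same O(1) computation that __purge_vmap_area_lazy() performs in the diff below.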
This gives us the following advantages: a) outstanding vmap areas are merged, creating bigger coalesced blocks, so the space becomes less fragmented; b) the flush range [min:max] can be calculated without scanning all elements, i.e. with O(1) access time; c) the final merge of areas with the rb-tree that represents the free space is faster because of (a). As a result, lock contention is also reduced. Link: https://lkml.kernel.org/r/20201116220033.1837-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Cc: Minchan Kim Cc: huang ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 8 ++--- mm/vmalloc.c | 90 +++++++++++++++++++++++++---------------------- 2 files changed, 53 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 938eaf9517e2..80c0181c411d 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -72,16 +72,14 @@ struct vmap_area { struct list_head list; /* address sorted list */ /* - * The following three variables can be packed, because - * a vmap_area object is always one of the three states: + * The following two variables can be packed, because + * a vmap_area object can be either: * 1) in "free" tree (root is free_vmap_area_root) - * 2) in "busy" tree (root is free_vmap_area_root) - * 3) in purge list (head is vmap_purge_list) + * 2) or "busy" tree (root is vmap_area_root) */ union { unsigned long subtree_max_size; /* in "free" tree */ struct vm_struct *vm; /* in "busy" tree */ - struct llist_node purge_list; /* in purge list */ }; }; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index da833c66d625..9bd3298f8c69 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -413,10 +413,13 @@ static DEFINE_SPINLOCK(vmap_area_lock); static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); -static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; static bool vmap_initialized __read_mostly; +static struct rb_root purge_vmap_area_root = RB_ROOT; +static LIST_HEAD(purge_vmap_area_list); +static DEFINE_SPINLOCK(purge_vmap_area_lock); + /* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to @@ -820,10 +823,17 @@ insert: if (!merged) link_va(va, root, parent, link, head); - /* - * Last step is to check and update the tree. - */ - augment_tree_propagate_from(va); + return va; +} + +static __always_inline struct vmap_area * +merge_or_add_vmap_area_augment(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + va = merge_or_add_vmap_area(va, root, head); + if (va) + augment_tree_propagate_from(va); + return va; } @@ -1138,7 +1148,7 @@ static void free_vmap_area(struct vmap_area *va) * Insert/Merge it back to the free tree/list.
*/ spin_lock(&free_vmap_area_lock); - merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); + merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } @@ -1326,32 +1336,32 @@ void set_iounmap_nonlazy(void) static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { unsigned long resched_threshold; - struct llist_node *valist; - struct vmap_area *va; - struct vmap_area *n_va; + struct list_head local_purge_list; + struct vmap_area *va, *n_va; lockdep_assert_held(&vmap_purge_lock); - valist = llist_del_all(&vmap_purge_list); - if (unlikely(valist == NULL)) + spin_lock(&purge_vmap_area_lock); + purge_vmap_area_root = RB_ROOT; + list_replace_init(&purge_vmap_area_list, &local_purge_list); + spin_unlock(&purge_vmap_area_lock); + + if (unlikely(list_empty(&local_purge_list))) return false; - /* - * TODO: to calculate a flush range without looping. - * The list can be up to lazy_max_pages() elements. - */ - llist_for_each_entry(va, valist, purge_list) { - if (va->va_start < start) - start = va->va_start; - if (va->va_end > end) - end = va->va_end; - } + start = min(start, + list_first_entry(&local_purge_list, + struct vmap_area, list)->va_start); + + end = max(end, + list_last_entry(&local_purge_list, + struct vmap_area, list)->va_end); flush_tlb_kernel_range(start, end); resched_threshold = lazy_max_pages() << 1; spin_lock(&free_vmap_area_lock); - llist_for_each_entry_safe(va, n_va, valist, purge_list) { + list_for_each_entry_safe(va, n_va, &local_purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; @@ -1361,8 +1371,8 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) * detached and there is no need to "unlink" it from * anything. */ - va = merge_or_add_vmap_area(va, &free_vmap_area_root, - &free_vmap_area_list); + va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, + &free_vmap_area_list); if (!va) continue; @@ -1419,9 +1429,15 @@ static void free_vmap_area_noflush(struct vmap_area *va) nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); - /* After this point, we may free va at any time */ - llist_add(&va->purge_list, &vmap_purge_list); + /* + * Merge or place it to the purge tree/list.
+ */ + spin_lock(&purge_vmap_area_lock); + merge_or_add_vmap_area(va, + &purge_vmap_area_root, &purge_vmap_area_list); + spin_unlock(&purge_vmap_area_lock); + /* After this point, we may free va at any time */ if (unlikely(nr_lazy > lazy_max_pages())) try_purge_vmap_area_lazy(); } @@ -3351,8 +3367,8 @@ recovery: while (area--) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; - va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, - &free_vmap_area_list); + va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, + &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); @@ -3401,8 +3417,8 @@ err_free_shadow: for (area = 0; area < nr_vms; area++) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; - va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, - &free_vmap_area_list); + va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, + &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); @@ -3482,18 +3498,15 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static void show_purge_info(struct seq_file *m) { - struct llist_node *head; struct vmap_area *va; - head = READ_ONCE(vmap_purge_list.first); - if (head == NULL) - return; - - llist_for_each_entry(va, head, purge_list) { + spin_lock(&purge_vmap_area_lock); + list_for_each_entry(va, &purge_vmap_area_list, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); } + spin_unlock(&purge_vmap_area_lock); } static int s_show(struct seq_file *m, void *p) @@ -3551,10 +3564,7 @@ static int s_show(struct seq_file *m, void *p) seq_putc(m, '\n'); /* - * As a final step, dump "unpurged" areas. Note, - * that entire "/proc/vmallocinfo" output will not - * be address sorted, because the purge list is not - * sorted. + * As a final step, dump "unpurged" areas. */ if (list_is_last(&va->list, &vmap_area_list)) show_purge_info(m); -- cgit From 03e92a5e097d679acbd1fb4d2ae238a38158aa0b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:09:32 -0800 Subject: ia64: remove custom __early_pfn_to_nid() The ia64 implementation of __early_pfn_to_nid() essentially relies on the same data as the generic implementation. The correspondence between memory ranges and nodes is set in memblock during early memory initialization in register_active_ranges() function. The initialization of sparsemem that requires early_pfn_to_nid() happens later and it can use the memblock information like the other architectures. 
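As an editor's illustration (pfn_to_nid_sketch() is a made-up name; the in-tree implementation additionally caches the last matching range), the generic lookup this commit relies on boils down to a walk of the memblock ranges:

	/* Sketch of the memblock-backed pfn -> nid lookup. */
	static int pfn_to_nid_sketch(unsigned long pfn)
	{
		unsigned long start_pfn, end_pfn;
		int i, nid;

		for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
			if (pfn >= start_pfn && pfn < end_pfn)
				return nid;

		return NUMA_NO_NODE;
	}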
Link: https://lkml.kernel.org/r/20201101170454.9567-3-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Alexey Dobriyan Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Matt Turner Cc: Meelis Roos Cc: Michael Schmitz Cc: Russell King Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/Kconfig | 3 --- arch/ia64/mm/numa.c | 30 ------------------------------ include/linux/mm.h | 3 --- include/linux/mmzone.h | 11 ----------- mm/page_alloc.c | 16 ++++++++++++---- 5 files changed, 12 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 39b25a5a591b..12aae706cb27 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -342,9 +342,6 @@ config HOLES_IN_ZONE bool default y if VIRTUAL_MEM_MAP -config HAVE_ARCH_EARLY_PFN_TO_NID - def_bool NUMA && SPARSEMEM - config HAVE_ARCH_NODEDATA_EXTENSION def_bool y depends on NUMA diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index f34964271101..46b6e5f3a40f 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -58,36 +58,6 @@ paddr_to_nid(unsigned long paddr) EXPORT_SYMBOL(paddr_to_nid); #if defined(CONFIG_SPARSEMEM) && defined(CONFIG_NUMA) -/* - * Because of holes evaluate on section limits. - * If the section of memory exists, then return the node where the section - * resides. Otherwise return node 0 as the default. This is used by - * SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where - * the section resides. - */ -int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) -{ - int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec; - - if (section >= state->last_start && section < state->last_end) - return state->last_nid; - - for (i = 0; i < num_node_memblks; i++) { - ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT; - esec = (node_memblk[i].start_paddr + node_memblk[i].size + - ((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT; - if (section >= ssec && section < esec) { - state->last_start = ssec; - state->last_end = esec; - state->last_nid = node_memblk[i].nid; - return node_memblk[i].nid; - } - } - - return -1; -} - void numa_clear_node(int cpu) { unmap_cpu_from_node(cpu, NUMA_NO_NODE); diff --git a/include/linux/mm.h b/include/linux/mm.h index 5d188b8611dc..48fa3e71be1a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2434,9 +2434,6 @@ static inline int early_pfn_to_nid(unsigned long pfn) #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); -/* there is a per-arch backend function. */ -extern int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cca2a4443c9c..16fb5522a74f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1428,17 +1428,6 @@ void sparse_init(void); #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -/* - * During memory init memblocks map pfns to nids. The search is expensive and - * this caches recent lookups. The implementation of __early_pfn_to_nid - * may treat start/end as pfns or sections. 
- */ -struct mminit_pfnnid_cache { - unsigned long last_start; - unsigned long last_end; - int last_nid; -}; - /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we * need to check pfn validity within that MAX_ORDER_NR_PAGES block. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2e240d3cda05..484d5582f88c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1559,14 +1559,23 @@ void __free_pages_core(struct page *page, unsigned int order) #ifdef CONFIG_NEED_MULTIPLE_NODES -static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; +/* + * During memory init memblocks map pfns to nids. The search is expensive and + * this caches recent lookups. The implementation of __early_pfn_to_nid + * treats start/end as pfns. + */ +struct mminit_pfnnid_cache { + unsigned long last_start; + unsigned long last_end; + int last_nid; +}; -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. */ -int __meminit __early_pfn_to_nid(unsigned long pfn, +static int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; @@ -1584,7 +1593,6 @@ int __meminit __early_pfn_to_nid(unsigned long pfn, return nid; } -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ int __meminit early_pfn_to_nid(unsigned long pfn) { -- cgit From 5e545df3292fbd3d5963c68980f1527ead2a2b3f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:09:55 -0800 Subject: arm: remove CONFIG_ARCH_HAS_HOLES_MEMORYMODEL ARM is the only architecture that defines CONFIG_ARCH_HAS_HOLES_MEMORYMODEL which in turn enables memmap_valid_within() function that is intended to verify existence of struct page associated with a pfn when there are holes in the memory map. However, the ARCH_HAS_HOLES_MEMORYMODEL also enables HAVE_ARCH_PFN_VALID and arch-specific pfn_valid() implementation that also deals with the holes in the memory map. The only two users of memmap_valid_within() call this function after a call to pfn_valid() so the memmap_valid_within() check becomes redundant. Remove CONFIG_ARCH_HAS_HOLES_MEMORYMODEL and memmap_valid_within() and rely entirely on ARM's implementation of pfn_valid() that is now enabled unconditionally. Link: https://lkml.kernel.org/r/20201101170454.9567-9-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Alexey Dobriyan Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Matt Turner Cc: Meelis Roos Cc: Michael Schmitz Cc: Russell King Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/memory-model.rst | 3 +-- arch/arm/Kconfig | 8 ++------ arch/arm/mach-bcm/Kconfig | 1 - arch/arm/mach-davinci/Kconfig | 1 - arch/arm/mach-exynos/Kconfig | 1 - arch/arm/mach-highbank/Kconfig | 1 - arch/arm/mach-omap2/Kconfig | 1 - arch/arm/mach-s5pv210/Kconfig | 1 - arch/arm/mach-tango/Kconfig | 1 - fs/proc/kcore.c | 2 -- include/linux/mmzone.h | 31 ------------------------------- mm/mmzone.c | 14 -------------- mm/vmstat.c | 4 ---- 13 files changed, 3 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst index 9daadf9faba1..ce398a7dc6cd 100644 --- a/Documentation/vm/memory-model.rst +++ b/Documentation/vm/memory-model.rst @@ -51,8 +51,7 @@ call :c:func:`free_area_init` function. 
Yet, the mappings array is not usable until the call to :c:func:`memblock_free_all` that hands all the memory to the page allocator. -If an architecture enables `CONFIG_ARCH_HAS_HOLES_MEMORYMODEL` option, -it may free parts of the `mem_map` array that do not cover the +An architecture may free parts of the `mem_map` array that do not cover the actual physical pages. In such case, the architecture specific :c:func:`pfn_valid` implementation should take the holes in the `mem_map` into account. diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 002e0cf025f5..353c3979a2d5 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -25,7 +25,7 @@ config ARM select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC + select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX @@ -520,7 +520,6 @@ config ARCH_S3C24XX config ARCH_OMAP1 bool "TI OMAP1" depends on MMU - select ARCH_HAS_HOLES_MEMORYMODEL select ARCH_OMAP select CLKDEV_LOOKUP select CLKSRC_MMIO @@ -1480,9 +1479,6 @@ config OABI_COMPAT UNPREDICTABLE (in fact it can be predicted that it won't work at all). If in doubt say N. -config ARCH_HAS_HOLES_MEMORYMODEL - bool - config ARCH_SELECT_MEMORY_MODEL bool @@ -1494,7 +1490,7 @@ config ARCH_SPARSEMEM_ENABLE select SPARSEMEM_STATIC if SPARSEMEM config HAVE_ARCH_PFN_VALID - def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM + def_bool y config HIGHMEM bool "High Memory Support" diff --git a/arch/arm/mach-bcm/Kconfig b/arch/arm/mach-bcm/Kconfig index ae790908fc74..9b594ae98153 100644 --- a/arch/arm/mach-bcm/Kconfig +++ b/arch/arm/mach-bcm/Kconfig @@ -211,7 +211,6 @@ config ARCH_BRCMSTB select BCM7038_L1_IRQ select BRCMSTB_L2_IRQ select BCM7120_L2_IRQ - select ARCH_HAS_HOLES_MEMORYMODEL select ZONE_DMA if ARM_LPAE select SOC_BRCMSTB select SOC_BUS diff --git a/arch/arm/mach-davinci/Kconfig b/arch/arm/mach-davinci/Kconfig index f56ff8c24043..de11030748d0 100644 --- a/arch/arm/mach-davinci/Kconfig +++ b/arch/arm/mach-davinci/Kconfig @@ -5,7 +5,6 @@ menuconfig ARCH_DAVINCI depends on ARCH_MULTI_V5 select DAVINCI_TIMER select ZONE_DMA - select ARCH_HAS_HOLES_MEMORYMODEL select PM_GENERIC_DOMAINS if PM select PM_GENERIC_DOMAINS_OF if PM && OF select REGMAP_MMIO diff --git a/arch/arm/mach-exynos/Kconfig b/arch/arm/mach-exynos/Kconfig index d2d249706ebb..56d272967fc0 100644 --- a/arch/arm/mach-exynos/Kconfig +++ b/arch/arm/mach-exynos/Kconfig @@ -8,7 +8,6 @@ menuconfig ARCH_EXYNOS bool "Samsung Exynos" depends on ARCH_MULTI_V7 - select ARCH_HAS_HOLES_MEMORYMODEL select ARCH_SUPPORTS_BIG_ENDIAN select ARM_AMBA select ARM_GIC diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig index 1bc68913d62c..9de38ce8124f 100644 --- a/arch/arm/mach-highbank/Kconfig +++ b/arch/arm/mach-highbank/Kconfig @@ -2,7 +2,6 @@ config ARCH_HIGHBANK bool "Calxeda ECX-1000/2000 (Highbank/Midway)" depends on ARCH_MULTI_V7 - select ARCH_HAS_HOLES_MEMORYMODEL select ARCH_SUPPORTS_BIG_ENDIAN select ARM_AMBA select ARM_ERRATA_764369 if SMP diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index 3f62a0c9450d..164985505f9e 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -93,7 +93,6 @@ config SOC_DRA7XX config ARCH_OMAP2PLUS bool select ARCH_HAS_BANDGAP - select ARCH_HAS_HOLES_MEMORYMODEL select ARCH_HAS_RESET_CONTROLLER select 
ARCH_OMAP select CLKSRC_MMIO diff --git a/arch/arm/mach-s5pv210/Kconfig b/arch/arm/mach-s5pv210/Kconfig index 95d4e8284866..d644b45bc29d 100644 --- a/arch/arm/mach-s5pv210/Kconfig +++ b/arch/arm/mach-s5pv210/Kconfig @@ -8,7 +8,6 @@ config ARCH_S5PV210 bool "Samsung S5PV210/S5PC110" depends on ARCH_MULTI_V7 - select ARCH_HAS_HOLES_MEMORYMODEL select ARM_VIC select CLKSRC_SAMSUNG_PWM select COMMON_CLK_SAMSUNG diff --git a/arch/arm/mach-tango/Kconfig b/arch/arm/mach-tango/Kconfig index 25b2fd434861..a9eeda36aeb1 100644 --- a/arch/arm/mach-tango/Kconfig +++ b/arch/arm/mach-tango/Kconfig @@ -3,7 +3,6 @@ config ARCH_TANGO bool "Sigma Designs Tango4 (SMP87xx)" depends on ARCH_MULTI_V7 # Cortex-A9 MPCore r3p0, PL310 r3p2 - select ARCH_HAS_HOLES_MEMORYMODEL select ARM_ERRATA_754322 select ARM_ERRATA_764369 if SMP select ARM_ERRATA_775420 diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e502414b3556..4d2e64e9016c 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -193,8 +193,6 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) return 1; p = pfn_to_page(pfn); - if (!memmap_valid_within(pfn, p, page_zone(p))) - return 1; ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 16fb5522a74f..6e0025b5a88f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1440,37 +1440,6 @@ void sparse_init(void); #define pfn_valid_within(pfn) (1) #endif -#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL -/* - * pfn_valid() is meant to be able to tell if a given PFN has valid memmap - * associated with it or not. This means that a struct page exists for this - * pfn. The caller cannot assume the page is fully initialized in general. - * Hotplugable pages might not have been onlined yet. pfn_to_online_page() - * will ensure the struct page is fully online and initialized. Special pages - * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly. - * - * In FLATMEM, it is expected that holes always have valid memmap as long as - * there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed - * that a valid section has a memmap for the entire section. - * - * However, an ARM, and maybe other embedded architectures in the future - * free memmap backing holes to save memory on the assumption the memmap is - * never used. The page_zone linkages are then broken even though pfn_valid() - * returns true. A walker of the full memmap must then do this additional - * check to ensure the memmap they are looking at is sane by making sure - * the zone and PFN linkages are still valid. This is expensive, but walkers - * of the full memmap are extremely rare. 
- */ -bool memmap_valid_within(unsigned long pfn, - struct page *page, struct zone *zone); -#else -static inline bool memmap_valid_within(unsigned long pfn, - struct page *page, struct zone *zone) -{ - return true; -} -#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/mm/mmzone.c b/mm/mmzone.c index 4686fdc23bb9..f337831affc2 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -72,20 +72,6 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, return z; } -#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL -bool memmap_valid_within(unsigned long pfn, - struct page *page, struct zone *zone) -{ - if (page_to_pfn(page) != pfn) - return false; - - if (page_zone(page) != zone) - return false; - - return true; -} -#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - void lruvec_init(struct lruvec *lruvec) { enum lru_list lru; diff --git a/mm/vmstat.c b/mm/vmstat.c index da36e3b0aab2..f8942160fc95 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1503,10 +1503,6 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, if (!page) continue; - /* Watch for unexpected holes punched in the memmap */ - if (!memmap_valid_within(pfn, page, zone)) - continue; - if (page_zone(page) != zone) continue; -- cgit From 77bc7fd607dee2ffb28daff6d0dd8ae42af61ea8 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:10:20 -0800 Subject: mm: introduce debug_pagealloc_{map,unmap}_pages() helpers Patch series "arch, mm: improve robustness of direct map manipulation", v7. During recent discussion about KVM protected memory, David raised a concern about usage of __kernel_map_pages() outside of DEBUG_PAGEALLOC scope [1]. Indeed, for architectures that define CONFIG_ARCH_HAS_SET_DIRECT_MAP it is possible that __kernel_map_pages() would fail, but since this function is void, the failure will go unnoticed. Moreover, there's a lack of consistency in __kernel_map_pages() semantics across architectures as some guard this function with #ifdef DEBUG_PAGEALLOC, some refuse to update the direct map if page allocation debugging is disabled at run time and some allow modifying the direct map regardless of DEBUG_PAGEALLOC settings. This set straightens this out by restoring the dependency of __kernel_map_pages() on DEBUG_PAGEALLOC and updating the call sites accordingly. Since currently the only user of __kernel_map_pages() outside DEBUG_PAGEALLOC is hibernation, it is updated to make direct map accesses there more explicit. [1] https://lore.kernel.org/lkml/2759b4bf-e1e3-d006-7d86-78a40348269d@redhat.com This patch (of 4): When CONFIG_DEBUG_PAGEALLOC is enabled, it unmaps pages from the kernel direct mapping after free_pages(). The pages then need to be mapped back before they can be used. These mapping operations use __kernel_map_pages() guarded with debug_pagealloc_enabled(). The only place that calls __kernel_map_pages() without checking whether DEBUG_PAGEALLOC is enabled is the hibernation code that presumes availability of this function when ARCH_HAS_SET_DIRECT_MAP is set. Still, on arm64, __kernel_map_pages() will bail out when DEBUG_PAGEALLOC is not enabled but set_direct_map_invalid_noflush() may render some pages not present in the direct map and hibernation code won't be able to save such pages. To make the interaction between page allocation debugging and hibernation more robust, the dependency on DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP has to be made more explicit; a short sketch of the resulting calling convention is shown below.
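Editor's sketch (the example_free_path() wrapper is made up; the helpers are the ones introduced by this patch): call sites shrink from an open-coded check to a single call:

	static void example_free_path(struct page *page, unsigned int order)
	{
		/* Before: if (debug_pagealloc_enabled_static())
		 *                 __kernel_map_pages(page, 1 << order, 0); */
		debug_pagealloc_unmap_pages(page, 1 << order);
	}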
Start with combining the guard condition and the call to __kernel_map_pages() into debug_pagealloc_map_pages() and debug_pagealloc_unmap_pages() functions to emphasize that __kernel_map_pages() should not be called without DEBUG_PAGEALLOC and use these new functions to map/unmap pages when page allocation debugging is enabled. Link: https://lkml.kernel.org/r/20201109192128.960-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20201109192128.960-2-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: David Hildenbrand Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Albert Ou Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter Cc: "David S. Miller" Cc: Dave Hansen Cc: David Rientjes Cc: "Edgecombe, Rick P" Cc: "H. Peter Anvin" Cc: Heiko Carstens Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Len Brown Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pavel Machek Cc: Pekka Enberg Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++++++++++ mm/memory_hotplug.c | 3 +-- mm/page_alloc.c | 6 ++---- mm/slab.c | 2 +- 4 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 48fa3e71be1a..0f4c34672a13 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2943,12 +2943,27 @@ kernel_map_pages(struct page *page, int numpages, int enable) { __kernel_map_pages(page, numpages, enable); } + +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 1); +} + +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 0); +} + #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ static inline void kernel_map_pages(struct page *page, int numpages, int enable) {} +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION static inline bool kernel_page_present(struct page *page) { return true; } #endif /* CONFIG_HIBERNATION */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0f855deea4b2..41c62295292b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -596,8 +596,7 @@ void generic_online_page(struct page *page, unsigned int order) * so we should map it first. This is better than introducing a special * case in page freeing fast path. 
*/ - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 1); + debug_pagealloc_map_pages(page, 1 << order); __free_pages_core(page, order); totalram_pages_add(1UL << order); #ifdef CONFIG_HIGHMEM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 484d5582f88c..2b5410dac827 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1273,8 +1273,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ arch_free_page(page, order); - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 0); + debug_pagealloc_unmap_pages(page, 1 << order); kasan_free_nondeferred_pages(page, order); @@ -2279,8 +2278,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_refcounted(page); arch_alloc_page(page, order); - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 1); + debug_pagealloc_map_pages(page, 1 << order); kasan_alloc_pages(page, order); kernel_poison_pages(page, 1 << order, 1); set_page_owner(page, order, gfp_flags); diff --git a/mm/slab.c b/mm/slab.c index 176b65e2157d..d7c8da9319c7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1435,7 +1435,7 @@ static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) if (!is_debug_pagealloc_cache(cachep)) return; - kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); + __kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); } #else -- cgit From 2abf962a8d42b32f5ffeb827826290b799c85f86 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:10:25 -0800 Subject: PM: hibernate: make direct map manipulations more explicit When DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP is enabled a page may not be present in the direct map and has to be explicitly mapped before it can be copied. Introduce hibernate_map_page() and hibernate_unmap_page() that will explicitly use set_direct_map_{default,invalid}_noflush() for the ARCH_HAS_SET_DIRECT_MAP case and debug_pagealloc_{map,unmap}_pages() for the DEBUG_PAGEALLOC case. The remapping of the pages in safe_copy_page() presumes that it only changes protection bits in an existing PTE and so it is safe to ignore the return value of set_direct_map_{default,invalid}_noflush(). Still, add a pr_warn() so that future changes in set_memory APIs will not silently break hibernation. Link: https://lkml.kernel.org/r/20201109192128.960-3-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Rafael J. Wysocki Reviewed-by: David Hildenbrand Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Albert Ou Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: David Rientjes Cc: "David S. Miller" Cc: "Edgecombe, Rick P" Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Len Brown Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pavel Machek Cc: Pekka Enberg Cc: Peter Zijlstra Cc: "Rafael J.
Wysocki" Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ------------ kernel/power/snapshot.c | 38 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f4c34672a13..6b5df31387b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2934,16 +2934,6 @@ static inline bool debug_pagealloc_enabled_static(void) #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) extern void __kernel_map_pages(struct page *page, int numpages, int enable); -/* - * When called in DEBUG_PAGEALLOC context, the call should most likely be - * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static() - */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) -{ - __kernel_map_pages(page, numpages, enable); -} - static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) @@ -2960,8 +2950,6 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) {} static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 46b1804c1ddf..d848377dd8dc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -76,6 +76,40 @@ static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} #endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ + +/* + * The calls to set_direct_map_*() should not fail because remapping a page + * here means that we only update protection bits in an existing PTE. + * It is still worth to have a warning here if something changes and this + * will no longer be the case. 
+ */ +static inline void hibernate_map_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + int ret = set_direct_map_default_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + } else { + debug_pagealloc_map_pages(page, 1); + } +} + +static inline void hibernate_unmap_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + unsigned long addr = (unsigned long)page_address(page); + int ret = set_direct_map_invalid_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } else { + debug_pagealloc_unmap_pages(page, 1); + } +} + static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); @@ -1355,9 +1389,9 @@ static void safe_copy_page(void *dst, struct page *s_page) if (kernel_page_present(s_page)) { do_copy_page(dst, page_address(s_page)); } else { - kernel_map_pages(s_page, 1, 1); + hibernate_map_page(s_page); do_copy_page(dst, page_address(s_page)); - kernel_map_pages(s_page, 1, 0); + hibernate_unmap_page(s_page); } } -- cgit From 5d6ad668f31625c6aa9ed8dc3bdb29561d2b1144 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:10:30 -0800 Subject: arch, mm: restore dependency of __kernel_map_pages() on DEBUG_PAGEALLOC The design of DEBUG_PAGEALLOC presumes that __kernel_map_pages() must never fail. With this assumption it wouldn't be safe to allow general usage of this function. Moreover, some architectures that implement __kernel_map_pages() have this function guarded by #ifdef DEBUG_PAGEALLOC and some refuse to map/unmap pages when page allocation debugging is disabled at runtime. As all the users of __kernel_map_pages() were converted to use debug_pagealloc_map_pages() it is safe to make it available only when DEBUG_PAGEALLOC is set. Link: https://lkml.kernel.org/r/20201109192128.960-4-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Acked-by: Kirill A. Shutemov Cc: Albert Ou Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: David Rientjes Cc: "David S. Miller" Cc: "Edgecombe, Rick P" Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Len Brown Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pavel Machek Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: "Rafael J.
Wysocki" Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 3 +++ arch/arm64/Kconfig | 4 +--- arch/arm64/mm/pageattr.c | 8 ++++++-- arch/powerpc/Kconfig | 5 +---- arch/riscv/Kconfig | 4 +--- arch/riscv/include/asm/pgtable.h | 2 -- arch/riscv/mm/pageattr.c | 2 ++ arch/s390/Kconfig | 4 +--- arch/sparc/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- arch/x86/mm/pat/set_memory.c | 2 ++ include/linux/mm.h | 10 +++++++--- 12 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 8d5efff59cd8..cd4172a80123 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1047,6 +1047,9 @@ config ARCH_WANT_LD_ORPHAN_WARN config HAVE_ARCH_PFN_VALID bool +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6f0751631ef1..f2dee598a921 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -71,6 +71,7 @@ config ARM64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_SYM_ANNOTATIONS + select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK select ARCH_SUPPORTS_ATOMIC_RMW @@ -1028,9 +1029,6 @@ config HOLES_IN_ZONE source "kernel/Kconfig.hz" -config ARCH_SUPPORTS_DEBUG_PAGEALLOC - def_bool y - config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_VMEMMAP_ENABLE diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 1b94f5b82654..439325532be1 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -155,7 +155,7 @@ int set_direct_map_invalid_noflush(struct page *page) .clear_mask = __pgprot(PTE_VALID), }; - if (!rodata_full) + if (!debug_pagealloc_enabled() && !rodata_full) return 0; return apply_to_page_range(&init_mm, @@ -170,7 +170,7 @@ int set_direct_map_default_noflush(struct page *page) .clear_mask = __pgprot(PTE_RDONLY), }; - if (!rodata_full) + if (!debug_pagealloc_enabled() && !rodata_full) return 0; return apply_to_page_range(&init_mm, @@ -178,6 +178,7 @@ int set_direct_map_default_noflush(struct page *page) PAGE_SIZE, change_page_range, &data); } +#ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (!debug_pagealloc_enabled() && !rodata_full) @@ -186,6 +187,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) set_memory_valid((unsigned long)page_address(page), numpages, enable); } +#ifdef CONFIG_HIBERNATION /* * This function is used to determine if a linear map page has been marked as * not-valid. Walk the page table and check the PTE_VALID bit. 
This is based @@ -232,3 +234,5 @@ bool kernel_page_present(struct page *page) ptep = pte_offset_kernel(pmdp, addr); return pte_valid(READ_ONCE(*ptep)); } +#endif /* CONFIG_HIBERNATION */ +#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 5181872f9452..8e8f46bbd39c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -146,6 +146,7 @@ config PPC select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC32 || PPC_BOOK3S_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS @@ -356,10 +357,6 @@ config PPC_OF_PLATFORM_PCI depends on PCI depends on PPC64 # not supported on 32 bits yet -config ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on PPC32 || PPC_BOOK3S_64 - def_bool y - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 44377fd7860e..9283c6f9ae2a 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -14,6 +14,7 @@ config RISCV def_bool y select ARCH_CLOCKSOURCE_INIT select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_VIRTUAL if MMU @@ -153,9 +154,6 @@ config ARCH_SELECT_MEMORY_MODEL config ARCH_WANT_GENERAL_HUGETLB def_bool y -config ARCH_SUPPORTS_DEBUG_PAGEALLOC - def_bool y - config SYS_SUPPORTS_HUGETLBFS depends on MMU def_bool y diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 183f1f4b2ae6..41a72861987c 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -461,8 +461,6 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define VMALLOC_START 0 #define VMALLOC_END TASK_SIZE -static inline void __kernel_map_pages(struct page *page, int numpages, int enable) {} - #endif /* !CONFIG_MMU */ #define kern_addr_valid(addr) (1) /* FIXME */ diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index 19fecb362d81..321b09d2e2ea 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -184,6 +184,7 @@ int set_direct_map_default_noflush(struct page *page) return ret; } +#ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (!debug_pagealloc_enabled()) @@ -196,3 +197,4 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) __set_memory((unsigned long)page_address(page), numpages, __pgprot(0), __pgprot(_PAGE_PRESENT)); } +#endif diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 4a2a12be04c9..991a850a6c0b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -35,9 +35,6 @@ config GENERIC_LOCKBREAK config PGSTE def_bool y if KVM -config ARCH_SUPPORTS_DEBUG_PAGEALLOC - def_bool y - config AUDIT_ARCH def_bool y @@ -106,6 +103,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index a6ca135442f9..2c729b8d097a 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -88,6 +88,7 @@ config SPARC64 select HAVE_C_RECORDMCOUNT select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC select HAVE_NMI select 
HAVE_REGS_AND_STACK_ACCESS_API select ARCH_USE_QUEUED_RWLOCKS @@ -148,9 +149,6 @@ config GENERIC_ISA_DMA bool default y if SPARC32 -config ARCH_SUPPORTS_DEBUG_PAGEALLOC - def_bool y if SPARC64 - config PGTABLE_LEVELS default 4 if 64BIT default 3 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 46393177ce73..384c91b057c5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -91,6 +91,7 @@ config X86 select ARCH_STACKWALK select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS @@ -331,9 +332,6 @@ config ZONE_DMA32 config AUDIT_ARCH def_bool y if X86_64 -config ARCH_SUPPORTS_DEBUG_PAGEALLOC - def_bool y - config KASAN_SHADOW_OFFSET hex depends on KASAN diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 40baa90e74f4..bc9be96b777f 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2194,6 +2194,7 @@ int set_direct_map_default_noflush(struct page *page) return __set_pages_p(page, 1); } +#ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) @@ -2239,6 +2240,7 @@ bool kernel_page_present(struct page *page) return (pte_val(*pte) & _PAGE_PRESENT); } #endif /* CONFIG_HIBERNATION */ +#endif /* CONFIG_DEBUG_PAGEALLOC */ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b5df31387b5..7a67bdf3df5e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2931,7 +2931,11 @@ static inline bool debug_pagealloc_enabled_static(void) return static_branch_unlikely(&_debug_pagealloc_enabled); } -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) +#ifdef CONFIG_DEBUG_PAGEALLOC +/* + * To support DEBUG_PAGEALLOC architecture must ensure that + * __kernel_map_pages() never fails + */ extern void __kernel_map_pages(struct page *page, int numpages, int enable); static inline void debug_pagealloc_map_pages(struct page *page, int numpages) @@ -2949,13 +2953,13 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ -#else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ +#else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION static inline bool kernel_page_present(struct page *page) { return true; } #endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ +#endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); -- cgit From 32a0de886eb3cb7e6990da27a9cdfa50baa8be64 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:10:35 -0800 Subject: arch, mm: make kernel_page_present() always available For architectures that enable ARCH_HAS_SET_MEMORY having the ability to verify that a page is mapped in the kernel direct map can be useful regardless of hibernation. 
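(For illustration only, and not a hunk from this patch: a condensed sketch of the check-then-remap pattern that a generally available kernel_page_present() enables for ARCH_HAS_SET_MEMORY users; safe_copy_page() in kernel/power/snapshot.c, shown earlier in this series, is the in-tree instance. The helper name below is made up.)

#include <linux/mm.h>
#include <linux/set_memory.h>
#include <linux/string.h>

/* Sketch: copy a page that may currently be invalid in the direct map. */
static void sketch_copy_page(void *dst, struct page *page)
{
	if (kernel_page_present(page)) {
		memcpy(dst, page_address(page), PAGE_SIZE);
	} else {
		/* Remap, copy, then restore the invalid mapping; failures
		 * are not expected since only protection bits change. */
		set_direct_map_default_noflush(page);
		memcpy(dst, page_address(page), PAGE_SIZE);
		set_direct_map_invalid_noflush(page);
	}
}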
Add RISC-V implementation of kernel_page_present(), update its forward declarations and stubs to be a part of set_memory API and remove ugly ifdefery in include/linux/mm.h around current declarations of kernel_page_present(). Link: https://lkml.kernel.org/r/20201109192128.960-5-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Kirill A. Shutemov Cc: Albert Ou Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: "David S. Miller" Cc: "Edgecombe, Rick P" Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Len Brown Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pavel Machek Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/cacheflush.h | 1 + arch/arm64/mm/pageattr.c | 4 +--- arch/riscv/include/asm/set_memory.h | 1 + arch/riscv/mm/pageattr.c | 29 +++++++++++++++++++++++++++++ arch/x86/include/asm/set_memory.h | 1 + arch/x86/mm/pat/set_memory.c | 4 +--- include/linux/mm.h | 7 ------- include/linux/set_memory.h | 5 +++++ 8 files changed, 39 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 9384fd8fc13c..45217f21f1fe 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -140,6 +140,7 @@ int set_memory_valid(unsigned long addr, int numpages, int enable); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +bool kernel_page_present(struct page *page); #include diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 439325532be1..92eccaf595c8 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -186,8 +186,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) set_memory_valid((unsigned long)page_address(page), numpages, enable); } +#endif /* CONFIG_DEBUG_PAGEALLOC */ -#ifdef CONFIG_HIBERNATION /* * This function is used to determine if a linear map page has been marked as * not-valid. Walk the page table and check the PTE_VALID bit. 
This is based @@ -234,5 +234,3 @@ bool kernel_page_present(struct page *page) ptep = pte_offset_kernel(pmdp, addr); return pte_valid(READ_ONCE(*ptep)); } -#endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index 4c5bae7ca01c..d690b08dff2a 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -24,6 +24,7 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +bool kernel_page_present(struct page *page); #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index 321b09d2e2ea..87ba5a68bbb8 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -198,3 +198,32 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) __pgprot(0), __pgprot(_PAGE_PRESENT)); } #endif + +bool kernel_page_present(struct page *page) +{ + unsigned long addr = (unsigned long)page_address(page); + pgd_t *pgd; + pud_t *pud; + p4d_t *p4d; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + return false; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return false; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return false; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return false; + + pte = pte_offset_kernel(pmd, addr); + return pte_present(*pte); +} diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 5948218f35c5..4352f08bfbb5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -82,6 +82,7 @@ int set_pages_rw(struct page *page, int numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index bc9be96b777f..16f878c26667 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2226,8 +2226,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) arch_flush_lazy_mmu_mode(); } +#endif /* CONFIG_DEBUG_PAGEALLOC */ -#ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) { unsigned int level; @@ -2239,8 +2239,6 @@ bool kernel_page_present(struct page *page) pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } -#endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC */ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7a67bdf3df5e..5ec79757eeae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2949,16 +2949,9 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } - -#ifdef CONFIG_HIBERNATION -extern bool kernel_page_present(struct page *page); -#endif /* CONFIG_HIBERNATION */ #else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} -#ifdef CONFIG_HIBERNATION -static inline bool kernel_page_present(struct page *page) 
{ return true; } -#endif /* CONFIG_HIBERNATION */ #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 860e0f843c12..fe1aa4e54680 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -23,6 +23,11 @@ static inline int set_direct_map_default_noflush(struct page *page) { return 0; } + +static inline bool kernel_page_present(struct page *page) +{ + return true; +} #endif #ifndef set_mce_nospec -- cgit From 952eaf815925f106eb6b68346b3458a68bb18ec1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:10:53 -0800 Subject: mm, page_alloc: cache pageset high and batch in struct zone All per-cpu pagesets for a zone use the same high and batch values, that are duplicated there just for performance (locality) reasons. This patch adds the same variables also to struct zone as a shared copy. This will be useful later for making possible to disable pcplists temporarily by setting high value to 0, while remembering the values for restoring them later. But we can also immediately benefit from not updating pagesets of all possible cpus in case the newly recalculated values (after sysctl change or memory online/offline) are actually unchanged from the previous ones. Link: https://lkml.kernel.org/r/20201111092812.11329-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ mm/page_alloc.c | 16 ++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6e0025b5a88f..4eee08b0062b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -470,6 +470,12 @@ struct zone { #endif struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; + /* + * the high and batch values are copied to individual pagesets for + * faster access + */ + int pageset_high; + int pageset_batch; #ifndef CONFIG_SPARSEMEM /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0c47af9e97c6..c3d1752b57dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5920,6 +5920,9 @@ static void build_zonelists(pg_data_t *pgdat) * Other parts of the kernel may not check if the zone is available. */ static void pageset_init(struct per_cpu_pageset *p); +/* These effectively disable the pcplists in the boot pageset completely */ +#define BOOT_PAGESET_HIGH 0 +#define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); @@ -6309,8 +6312,8 @@ static void pageset_init(struct per_cpu_pageset *p) * need to be as careful as pageset_update() as nobody can access the * pageset yet. */ - pcp->high = 0; - pcp->batch = 1; + pcp->high = BOOT_PAGESET_HIGH; + pcp->batch = BOOT_PAGESET_BATCH; } /* @@ -6334,6 +6337,13 @@ static void zone_set_pageset_high_and_batch(struct zone *zone) new_batch = max(1UL, 1 * new_batch); } + if (zone->pageset_high == new_high && + zone->pageset_batch == new_batch) + return; + + zone->pageset_high = new_high; + zone->pageset_batch = new_batch; + for_each_possible_cpu(cpu) { p = per_cpu_ptr(zone->pageset, cpu); pageset_update(&p->pcp, new_high, new_batch); @@ -6394,6 +6404,8 @@ static __meminit void zone_pcp_init(struct zone *zone) * offset of a (static) per cpu variable into the per cpu area. 
*/ zone->pageset = &boot_pageset; + zone->pageset_high = BOOT_PAGESET_HIGH; + zone->pageset_batch = BOOT_PAGESET_BATCH; if (populated_zone(zone)) printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", -- cgit From 2ee08717da50160c20056f6d6b76afdf65db33ab Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 14 Dec 2020 19:11:02 -0800 Subject: include/linux/page-flags.h: remove unused __[Set|Clear]PagePrivate They are not used anymore. Link: https://lkml.kernel.org/r/20201009135914.64826-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4f6ba9379112..50cbf5e931bc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -363,8 +363,7 @@ PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) * for its own purposes. * - PG_private and PG_private_2 cause releasepage() and co to be invoked */ -PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY) - __CLEARPAGEFLAG(Private, private, PF_ANY) +PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) -- cgit From 3b12da6d1d4adff087939c071e0d74a7857439a0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Dec 2020 19:11:05 -0800 Subject: mm/page-flags: fix comment We haven't had 'dontuse' flags since 2002. Replace this obsolete warning with a hopefully more useful one. Link: https://lkml.kernel.org/r/20201027025823.3704-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 50cbf5e931bc..c1368af622c7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -86,8 +86,7 @@ */ /* - * Don't use the *_dontuse flags. Use the macros. Otherwise you'll break - * locked- and dirty-page accounting. + * Don't use the pageflags directly. Use the PageFoo macros. * * The page flags field is split into two parts, the main flags area * which extends from the low bits upwards, and the fields area which -- cgit From ebfe1b8f6ea5d83d8c1aa18ddd8ede432a7414e7 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 14 Dec 2020 19:11:58 -0800 Subject: include/linux/huge_mm.h: remove extern keyword The external function definitions don't need the "extern" keyword. Remove them so future changes don't copy the function definition style. 
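(A minimal before/after illustration of the declaration style this commit adopts, using a made-up function name rather than one from the patch:)

struct page;	/* forward declaration so the snippet stands alone */

/* Before: "extern" is redundant on a function declaration. */
extern int frob_page_state(struct page *page);

/* After: identical linkage and semantics, less noise. */
int frob_page_state(struct page *page);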
Link: https://lkml.kernel.org/r/20201106235135.32109-1-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 93 ++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0365aa97f8e7..6a19f35f836b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -7,43 +7,37 @@ #include /* only for vma_is_dax() */ -extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); -extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, - struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); -extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, unsigned long addr, - struct vm_area_struct *vma); +vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); +int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma); +void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); +int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); +void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif -extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); -extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, - unsigned long addr, - pmd_t *pmd, - unsigned int flags); -extern bool madvise_free_huge_pmd(struct mmu_gather *tlb, - struct vm_area_struct *vma, - pmd_t *pmd, unsigned long addr, unsigned long next); -extern int zap_huge_pmd(struct mmu_gather *tlb, - struct vm_area_struct *vma, - pmd_t *pmd, unsigned long addr); -extern int zap_huge_pud(struct mmu_gather *tlb, - struct vm_area_struct *vma, - pud_t *pud, unsigned long addr); -extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, - pmd_t *old_pmd, pmd_t *new_pmd); -extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, - unsigned long cp_flags); +vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmd, + unsigned int flags); +bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next); +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr); +int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, + unsigned long addr); +bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + pgprot_t newprot, unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); @@ -100,13 +94,13 @@ enum 
transparent_hugepage_flag { struct kobject; struct kobj_attribute; -extern ssize_t single_hugepage_flag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count, - enum transparent_hugepage_flag flag); -extern ssize_t single_hugepage_flag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf, - enum transparent_hugepage_flag flag); +ssize_t single_hugepage_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag flag); +ssize_t single_hugepage_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) @@ -179,12 +173,11 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) -- cgit From Mon Sep 17 00:00:00 2001 From: Hui Su Date: Mon, 14 Dec 2020 19:12:46 -0800 Subject: mm/compaction: make defer_compaction and compaction_deferred static defer_compaction() and compaction_deferred() and compaction_restarting() in mm/compaction.c won't be used in other files, so make them static, and remove the declaration in the header file. Take the chance to fix a typo. Link: https://lkml.kernel.org/r/20201123170801.GA9625@rlk Signed-off-by: Hui Su Acked-by: Vlastimil Babka Cc: Nitin Gupta Cc: Baoquan He Cc: Mateusz Nosek Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 12 ------------ mm/compaction.c | 8 ++++---- 2 files changed, 4 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 1de5a1151ee7..ed4070ed41ef 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -98,11 +98,8 @@ extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int highest_zoneidx); -extern void defer_compaction(struct zone *zone, int order); -extern bool compaction_deferred(struct zone *zone, int order); extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); -extern bool compaction_restarting(struct zone *zone, int order); /* Compaction has made some progress and retrying makes sense */ static inline bool compaction_made_progress(enum compact_result result) @@ -194,15 +191,6 @@ static inline enum compact_result compaction_suitable(struct zone *zone, int ord return COMPACT_SKIPPED; } -static inline void defer_compaction(struct zone *zone, int order) -{ -} - -static inline bool compaction_deferred(struct zone *zone, int order) -{ - return true; -} - static inline bool compaction_made_progress(enum compact_result result) { return false; } diff --git a/mm/compaction.c b/mm/compaction.c index c1370828acee..dbcfdfce1b82 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(__ClearPageMovable); * allocation success. 
1 << compact_defer_shift, compactions are skipped up * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT */ -void defer_compaction(struct zone *zone, int order) +static void defer_compaction(struct zone *zone, int order) { zone->compact_considered = 0; zone->compact_defer_shift++; @@ -172,7 +172,7 @@ void defer_compaction(struct zone *zone, int order) } /* Returns true if compaction should be skipped this time */ -bool compaction_deferred(struct zone *zone, int order) +static bool compaction_deferred(struct zone *zone, int order) { unsigned long defer_limit = 1UL << zone->compact_defer_shift; @@ -209,7 +209,7 @@ void compaction_defer_reset(struct zone *zone, int order, } /* Returns true if restarting compaction after many failures */ -bool compaction_restarting(struct zone *zone, int order) +static bool compaction_restarting(struct zone *zone, int order) { if (order < zone->compact_order_failed) return false; @@ -237,7 +237,7 @@ static void reset_cached_positions(struct zone *zone) } /* - * Compound pages of >= pageblock_order should consistenly be skipped until + * Compound pages of >= pageblock_order should consistently be skipped until * released. It is always pointless to compact pages of such order (if they are * migratable), and the pageblocks they occupy cannot contain any free pages. */ -- cgit From 0060ef3b4e6dd1410da164d48a595eadb2fb02f7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Dec 2020 19:12:59 -0800 Subject: mm: support THPs in zero_user_segments We can only kmap() one subpage of a THP at a time, so loop over all relevant subpages, skipping ones which don't need to be zeroed. This is too large to inline when THPs are enabled and we actually need highmem, so put it in highmem.c. [willy@infradead.org: start1 was allowed to be less than start2] Link: https://lkml.kernel.org/r/20201124041507.28996-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Yang Shi Cc: Jan Kara Cc: Michal Hocko Cc: Zi Yan Cc: Song Liu Cc: Mel Gorman Cc: Naresh Kamboju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 19 ++++++++++++++---- mm/highmem.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 14e6202ce47f..8e21fe82b3a3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -284,13 +284,22 @@ static inline void clear_highpage(struct page *page) kunmap_atomic(kaddr); } +/* + * If we pass in a base or tail page, we can zero up to PAGE_SIZE. + * If we pass in a head page, we can zero up to the size of the compound page. 
+ */ +#if defined(CONFIG_HIGHMEM) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +void zero_user_segments(struct page *page, unsigned start1, unsigned end1, + unsigned start2, unsigned end2); +#else /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */ static inline void zero_user_segments(struct page *page, - unsigned start1, unsigned end1, - unsigned start2, unsigned end2) + unsigned start1, unsigned end1, + unsigned start2, unsigned end2) { void *kaddr = kmap_atomic(page); + unsigned int i; - BUG_ON(end1 > PAGE_SIZE || end2 > PAGE_SIZE); + BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); @@ -299,8 +308,10 @@ static inline void zero_user_segments(struct page *page, memset(kaddr + start2, 0, end2 - start2); kunmap_atomic(kaddr); - flush_dcache_page(page); + for (i = 0; i < compound_nr(page); i++) + flush_dcache_page(page + i); } +#endif /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */ static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) diff --git a/mm/highmem.c b/mm/highmem.c index 1352a27951e3..0ee87a9e0cbf 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -369,6 +369,58 @@ void kunmap_high(struct page *page) } EXPORT_SYMBOL(kunmap_high); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void zero_user_segments(struct page *page, unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +{ + unsigned int i; + + BUG_ON(end1 > page_size(page) || end2 > page_size(page)); + + for (i = 0; i < compound_nr(page); i++) { + void *kaddr = NULL; + + if (start1 < PAGE_SIZE || start2 < PAGE_SIZE) + kaddr = kmap_atomic(page + i); + + if (start1 >= PAGE_SIZE) { + start1 -= PAGE_SIZE; + end1 -= PAGE_SIZE; + } else { + unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); + + if (end1 > start1) + memset(kaddr + start1, 0, this_end - start1); + end1 -= this_end; + start1 = 0; + } + + if (start2 >= PAGE_SIZE) { + start2 -= PAGE_SIZE; + end2 -= PAGE_SIZE; + } else { + unsigned this_end = min_t(unsigned, end2, PAGE_SIZE); + + if (end2 > start2) + memset(kaddr + start2, 0, this_end - start2); + end2 -= this_end; + start2 = 0; + } + + if (kaddr) { + kunmap_atomic(kaddr); + flush_dcache_page(page + i); + } + + if (!end1 && !end2) + break; + } + + BUG_ON((start1 | start2 | end1 | end2) != 0); +} +EXPORT_SYMBOL(zero_user_segments); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HIGHMEM */ #if defined(HASHED_PAGE_VIRTUAL) -- cgit From 236c32eb109696590b7428957eda50cc05e22af8 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 14 Dec 2020 19:13:13 -0800 Subject: mm: migrate: clean up migrate_prep{_local} The migrate_prep{_local} never fails, so it is pointless to have a return value and check it. 
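(Condensed caller-side view of the change; the real call sites are in the mm/mempolicy.c hunks below:)

/* Before: callers checked an error that could never be nonzero. */
int err = migrate_prep();
if (err)
	return err;

/* After: the call site shrinks to a plain statement. */
migrate_prep();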
Link: https://lkml.kernel.org/r/20201113205359.556831-5-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zi Yan Cc: Jan Kara Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 4 ++-- mm/mempolicy.c | 8 ++------ mm/migrate.c | 8 ++------ 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 0f8d1583fa8e..4594838a0f7c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -45,8 +45,8 @@ extern struct page *alloc_migration_target(struct page *page, unsigned long priv extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); -extern int migrate_prep(void); -extern int migrate_prep_local(void); +extern void migrate_prep(void); +extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3ca4898f3f24..8cf96bd21341 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1114,9 +1114,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, int err; nodemask_t tmp; - err = migrate_prep(); - if (err) - return err; + migrate_prep(); mmap_read_lock(mm); @@ -1315,9 +1313,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - err = migrate_prep(); - if (err) - goto mpol_out; + migrate_prep(); } { NODEMASK_SCRATCH(scratch); diff --git a/mm/migrate.c b/mm/migrate.c index a7ccda55ca82..76e2a6f3749f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -62,7 +62,7 @@ * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is * undesirable, use migrate_prep_local() */ -int migrate_prep(void) +void migrate_prep(void) { /* * Clear the LRU lists so pages can be isolated. @@ -71,16 +71,12 @@ * pages that may be busy. */ lru_add_drain_all(); - - return 0; } /* Do the necessary work of migrate_prep but not if it involves other CPUs */ -int migrate_prep_local(void) +void migrate_prep_local(void) { lru_add_drain(); - - return 0; } int isolate_movable_page(struct page *page, isolate_mode_t mode) -- cgit From 04013513cc84c401c7de9023ff3eda7863fc4add Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:13:30 -0800 Subject: mm, page_alloc: do not rely on the order of page_poison and init_on_alloc/free parameters Patch series "cleanup page poisoning", v3. I have identified a number of issues and opportunities for cleanup with CONFIG_PAGE_POISONING and friends: - interaction with init_on_alloc and init_on_free parameters depends on the order of parameters (Patch 1) - the boot time enabling uses a static key, but inefficiently (Patch 2) - sanity checking is incompatible with hibernation (Patch 3) - CONFIG_PAGE_POISONING_NO_SANITY can be removed now that we have init_on_free (Patch 4) - CONFIG_PAGE_POISONING_ZERO can most likely be removed now that we have init_on_free (Patch 5) This patch (of 5): Enabling page_poison=1 together with init_on_alloc=1 or init_on_free=1 produces a warning in dmesg that page_poison takes precedence. 
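(As the following paragraphs explain, the fix is to make the handlers only record the request and to postpone evaluation; condensed from the mm/page_alloc.c hunks further below, omitting the init_on_free counterpart and the precedence pr_info:)

static bool _init_on_alloc_enabled_early __read_mostly =
	IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);

static int __init early_init_on_alloc(char *buf)
{
	/* Record only; other parameters may still follow on the command line. */
	return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);

/* Called from mm_init() once all early params have been processed. */
void init_mem_debugging_and_hardening(void)
{
	if (_init_on_alloc_enabled_early && !page_poisoning_enabled())
		static_branch_enable(&init_on_alloc);
}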
However, as these warnings are printed in early_param handlers for init_on_alloc/free, they are not printed if page_poison is enabled later on the command line (handlers are called in the order of their parameters), or when init_on_alloc/free is always enabled by the respective config option - before the page_poison early param handler is called, it is not considered to be enabled. This is inconsistent. We can remove the dependency on order by making the init_on_* parameters only set a boolean variable, and postponing the evaluation after all early params have been processed. Introduce a new init_mem_debugging_and_hardening() function for that, and move the related debug_pagealloc processing there as well. As a result init_mem_debugging_and_hardening() knows always accurately if init_on_* and/or page_poison options were enabled. Thus we can also optimize want_init_on_alloc() and want_init_on_free(). We don't need to check page_poisoning_enabled() there, we can instead not enable the init_on_* static keys at all, if page poisoning is enabled. This results in a simpler and more effective code. Link: https://lkml.kernel.org/r/20201113104033.22907-1-vbabka@suse.cz Link: https://lkml.kernel.org/r/20201113104033.22907-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport Cc: Rafael J. Wysocki Cc: Alexander Potapenko Cc: Kees Cook Cc: Michal Hocko Cc: Mateusz Nosek Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 20 ++----------- init/main.c | 2 +- mm/page_alloc.c | 88 ++++++++++++++++++++++++++---------------------------- 3 files changed, 46 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5ec79757eeae..591c784277ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2872,6 +2872,7 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +extern void init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); @@ -2881,35 +2882,20 @@ static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } #endif -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON -DECLARE_STATIC_KEY_TRUE(init_on_alloc); -#else DECLARE_STATIC_KEY_FALSE(init_on_alloc); -#endif static inline bool want_init_on_alloc(gfp_t flags) { - if (static_branch_unlikely(&init_on_alloc) && - !page_poisoning_enabled()) + if (static_branch_unlikely(&init_on_alloc)) return true; return flags & __GFP_ZERO; } -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON -DECLARE_STATIC_KEY_TRUE(init_on_free); -#else DECLARE_STATIC_KEY_FALSE(init_on_free); -#endif static inline bool want_init_on_free(void) { - return static_branch_unlikely(&init_on_free) && - !page_poisoning_enabled(); + return static_branch_unlikely(&init_on_free); } -#ifdef CONFIG_DEBUG_PAGEALLOC -extern void init_debug_pagealloc(void); -#else -static inline void init_debug_pagealloc(void) {} -#endif extern bool _debug_pagealloc_enabled_early; DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); diff --git a/init/main.c b/init/main.c index 404366e1a64d..3024c4db17a9 100644 --- a/init/main.c +++ b/init/main.c @@ -824,7 +824,7 @@ static void __init mm_init(void) * bigger than MAX_ORDER unless SPARSEMEM. 
*/ page_ext_init_flatmem(); - init_debug_pagealloc(); + init_mem_debugging_and_hardening(); report_meminit(); mem_init(); /* page_owner must be initialized after buddy is ready */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 521d29718c9e..3fd0c8d5745f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -167,53 +167,26 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON -DEFINE_STATIC_KEY_TRUE(init_on_alloc); -#else DEFINE_STATIC_KEY_FALSE(init_on_alloc); -#endif EXPORT_SYMBOL(init_on_alloc); -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON -DEFINE_STATIC_KEY_TRUE(init_on_free); -#else DEFINE_STATIC_KEY_FALSE(init_on_free); -#endif EXPORT_SYMBOL(init_on_free); +static bool _init_on_alloc_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) { - int ret; - bool bool_result; - ret = kstrtobool(buf, &bool_result); - if (ret) - return ret; - if (bool_result && page_poisoning_enabled()) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); - if (bool_result) - static_branch_enable(&init_on_alloc); - else - static_branch_disable(&init_on_alloc); - return 0; + return kstrtobool(buf, &_init_on_alloc_enabled_early); } early_param("init_on_alloc", early_init_on_alloc); +static bool _init_on_free_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); static int __init early_init_on_free(char *buf) { - int ret; - bool bool_result; - - ret = kstrtobool(buf, &bool_result); - if (ret) - return ret; - if (bool_result && page_poisoning_enabled()) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); - if (bool_result) - static_branch_enable(&init_on_free); - else - static_branch_disable(&init_on_free); - return 0; + return kstrtobool(buf, &_init_on_free_enabled_early); } early_param("init_on_free", early_init_on_free); @@ -730,19 +703,6 @@ static int __init early_debug_pagealloc(char *buf) } early_param("debug_pagealloc", early_debug_pagealloc); -void init_debug_pagealloc(void) -{ - if (!debug_pagealloc_enabled()) - return; - - static_branch_enable(&_debug_pagealloc_enabled); - - if (!debug_guardpage_minorder()) - return; - - static_branch_enable(&_debug_guardpage_enabled); -} - static int __init debug_guardpage_minorder_setup(char *buf) { unsigned long res; @@ -794,6 +754,42 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif +/* + * Enable static keys related to various memory debugging and hardening options. + * Some override others, and depend on early params that are evaluated in the + * order of appearance. So we need to first gather the full picture of what was + * enabled, and then make decisions. 
+ */ +void init_mem_debugging_and_hardening(void) +{ + if (_init_on_alloc_enabled_early) { + if (page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_alloc\n"); + else + static_branch_enable(&init_on_alloc); + } + if (_init_on_free_enabled_early) { + if (page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_free\n"); + else + static_branch_enable(&init_on_free); + } + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return; + + static_branch_enable(&_debug_pagealloc_enabled); + + if (!debug_guardpage_minorder()) + return; + + static_branch_enable(&_debug_guardpage_enabled); +#endif +} + static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); -- cgit From 8db26a3d47354ce7271a8cab03cd65b9d3d610b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:13:34 -0800 Subject: mm, page_poison: use static key more efficiently Commit 11c9c7edae06 ("mm/page_poison.c: replace bool variable with static key") changed page_poisoning_enabled() to a static key check. However, the function is not inlined, so each check still involves a function call with overhead not eliminated when page poisoning is disabled. Analogically to how debug_pagealloc is handled, this patch converts page_poisoning_enabled() back to boolean check, and introduces page_poisoning_enabled_static() for fast paths. Both functions are inlined. The function kernel_poison_pages() is also called unconditionally and does the static key check inside. Remove it from there and put it to callers. Also split it to two functions kernel_poison_pages() and kernel_unpoison_pages() instead of the confusing bool parameter. Also optimize the check that enables page poisoning instead of debug_pagealloc for architectures without proper debug_pagealloc support. Move the check to init_mem_debugging_and_hardening() to enable a single static key instead of having two static branches in page_poisoning_enabled_static(). Link: https://lkml.kernel.org/r/20201113104033.22907-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Mike Rapoport Cc: Rafael J. 
Wysocki Cc: Alexander Potapenko Cc: Kees Cook Cc: Laura Abbott Cc: Mateusz Nosek Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/virtio/virtio_balloon.c | 2 +- include/linux/mm.h | 33 +++++++++++++++++++++---- mm/page_alloc.c | 18 +++++++++++--- mm/page_poison.c | 53 ++++++----------------------------------- 4 files changed, 52 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 481611c09dae..e53faed6ba93 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -1116,7 +1116,7 @@ static int virtballoon_validate(struct virtio_device *vdev) */ if (!want_init_on_free() && (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY) || - !page_poisoning_enabled())) + !page_poisoning_enabled_static())) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON); else if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_REPORTING); diff --git a/include/linux/mm.h b/include/linux/mm.h index 591c784277ea..026707a58159 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2874,12 +2874,37 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, extern void init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING -extern bool page_poisoning_enabled(void); -extern void kernel_poison_pages(struct page *page, int numpages, int enable); +extern void __kernel_poison_pages(struct page *page, int numpages); +extern void __kernel_unpoison_pages(struct page *page, int numpages); +extern bool _page_poisoning_enabled_early; +DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); +static inline bool page_poisoning_enabled(void) +{ + return _page_poisoning_enabled_early; +} +/* + * For use in fast paths after init_mem_debugging() has run, or when a + * false negative result is not harmful when called too early. + */ +static inline bool page_poisoning_enabled_static(void) +{ + return static_branch_unlikely(&_page_poisoning_enabled); +} +static inline void kernel_poison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, numpages); +} +static inline void kernel_unpoison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_unpoison_pages(page, numpages); +} #else static inline bool page_poisoning_enabled(void) { return false; } -static inline void kernel_poison_pages(struct page *page, int numpages, - int enable) { } +static inline bool page_poisoning_enabled_static(void) { return false; } +static inline void kernel_poison_pages(struct page *page, int numpages) { } +static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif DECLARE_STATIC_KEY_FALSE(init_on_alloc); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3fd0c8d5745f..efcd1baa35e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -777,6 +777,17 @@ void init_mem_debugging_and_hardening(void) static_branch_enable(&init_on_free); } +#ifdef CONFIG_PAGE_POISONING + /* + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. 
+ */ + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) + static_branch_enable(&_page_poisoning_enabled); +#endif + #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; @@ -1262,7 +1273,8 @@ static __always_inline bool free_pages_prepare(struct page *page, if (want_init_on_free()) kernel_init_free_pages(page, 1 << order); - kernel_poison_pages(page, 1 << order, 0); + kernel_poison_pages(page, 1 << order); + /* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should @@ -2219,7 +2231,7 @@ static inline int check_new_page(struct page *page) static inline bool free_pages_prezeroed(void) { return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled()) || want_init_on_free(); + page_poisoning_enabled_static()) || want_init_on_free(); } #ifdef CONFIG_DEBUG_VM @@ -2281,7 +2293,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); kasan_alloc_pages(page, order); - kernel_poison_pages(page, 1 << order, 1); + kernel_unpoison_pages(page, 1 << order); set_page_owner(page, order, gfp_flags); if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) diff --git a/mm/page_poison.c b/mm/page_poison.c index ae0482cded87..4d75fc9ccc7a 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -8,45 +8,17 @@ #include #include -static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning); +bool _page_poisoning_enabled_early; +EXPORT_SYMBOL(_page_poisoning_enabled_early); +DEFINE_STATIC_KEY_FALSE(_page_poisoning_enabled); +EXPORT_SYMBOL(_page_poisoning_enabled); static int __init early_page_poison_param(char *buf) { - int ret; - bool tmp; - - ret = strtobool(buf, &tmp); - if (ret) - return ret; - - if (tmp) - static_branch_enable(&want_page_poisoning); - else - static_branch_disable(&want_page_poisoning); - - return 0; + return kstrtobool(buf, &_page_poisoning_enabled_early); } early_param("page_poison", early_page_poison_param); -/** - * page_poisoning_enabled - check if page poisoning is enabled - * - * Return true if page poisoning is enabled, or false if not. - */ -bool page_poisoning_enabled(void) -{ - /* - * Assumes that debug_pagealloc_enabled is set before - * memblock_free_all. - * Page poisoning is debug page alloc for some arches. If - * either of those options are enabled, enable poisoning. 
- */ - return (static_branch_unlikely(&want_page_poisoning) || - (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && - debug_pagealloc_enabled())); -} -EXPORT_SYMBOL_GPL(page_poisoning_enabled); - static void poison_page(struct page *page) { void *addr = kmap_atomic(page); @@ -58,7 +30,7 @@ static void poison_page(struct page *page) kunmap_atomic(addr); } -static void poison_pages(struct page *page, int n) +void __kernel_poison_pages(struct page *page, int n) { int i; @@ -117,7 +89,7 @@ static void unpoison_page(struct page *page) kunmap_atomic(addr); } -static void unpoison_pages(struct page *page, int n) +void __kernel_unpoison_pages(struct page *page, int n) { int i; @@ -125,17 +97,6 @@ static void unpoison_pages(struct page *page, int n) unpoison_page(page + i); } -void kernel_poison_pages(struct page *page, int numpages, int enable) -{ - if (!page_poisoning_enabled()) - return; - - if (enable) - unpoison_pages(page, numpages); - else - poison_pages(page, numpages); -} - #ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { -- cgit From 03b6c9a3e8805606c0bb4ad41855fac3bf85c3b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:13:38 -0800 Subject: kernel/power: allow hibernation with page_poison sanity checking Page poisoning used to be incompatible with hibernation, as the state of poisoned pages was lost after resume, thus enabling CONFIG_HIBERNATION forces CONFIG_PAGE_POISONING_NO_SANITY. For the same reason, the poisoning with zeroes variant CONFIG_PAGE_POISONING_ZERO used to disable hibernation. The latter restriction was removed by commit 1ad1410f632d ("PM / Hibernate: allow hibernation with PAGE_POISONING_ZERO") and similarly for init_on_free by commit 18451f9f9e58 ("PM: hibernate: fix crashes with init_on_free=1") by making sure free pages are cleared after resume. We can use the same mechanism to instead poison free pages with PAGE_POISON after resume. This covers both zero and 0xAA patterns. Thus we can remove the Kconfig restriction that disables page poison sanity checking when hibernation is enabled. Link: https://lkml.kernel.org/r/20201113104033.22907-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Rafael J. 
Wysocki [hibernation] Reviewed-by: David Hildenbrand Cc: Mike Rapoport Cc: Alexander Potapenko Cc: Kees Cook Cc: Laura Abbott Cc: Mateusz Nosek Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + kernel/power/hibernate.c | 2 +- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 14 +++++++++++--- mm/Kconfig.debug | 1 - 5 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 026707a58159..3e1fe8ca9720 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2903,6 +2903,7 @@ static inline void kernel_unpoison_pages(struct page *page, int numpages) #else static inline bool page_poisoning_enabled(void) { return false; } static inline bool page_poisoning_enabled_static(void) { return false; } +static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } static inline void kernel_poison_pages(struct page *page, int numpages) { } static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2fc7d509a34f..da0b41914177 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -326,7 +326,7 @@ static int create_image(int platform_mode) if (!in_suspend) { events_check_enabled = false; - clear_free_pages(); + clear_or_poison_free_pages(); } platform_leave(platform_mode); diff --git a/kernel/power/power.h b/kernel/power/power.h index 24f12d534515..778bf431ec02 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -106,7 +106,7 @@ extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); -extern void clear_free_pages(void); +extern void clear_or_poison_free_pages(void); /** * Auxiliary structure used for reading the snapshot image data and diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d848377dd8dc..d63560e1cf87 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1178,7 +1178,15 @@ void free_basic_memory_bitmaps(void) pr_debug("Basic memory bitmaps freed\n"); } -void clear_free_pages(void) +static void clear_or_poison_free_page(struct page *page) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, 1); + else if (want_init_on_free()) + clear_highpage(page); +} + +void clear_or_poison_free_pages(void) { struct memory_bitmap *bm = free_pages_map; unsigned long pfn; @@ -1186,12 +1194,12 @@ void clear_free_pages(void) if (WARN_ON(!(free_pages_map))) return; - if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) { + if (page_poisoning_enabled() || want_init_on_free()) { memory_bm_position_reset(bm); pfn = memory_bm_next_pfn(bm); while (pfn != BM_END_OF_MAP) { if (pfn_valid(pfn)) - clear_highpage(pfn_to_page(pfn)); + clear_or_poison_free_page(pfn_to_page(pfn)); pfn = memory_bm_next_pfn(bm); } diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 864f129f1937..c57786ad5be9 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -64,7 +64,6 @@ config PAGE_OWNER config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_POISONING_NO_SANITY if HIBERNATION help Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages. 
The filling of the memory helps -- cgit From f289041ed4cf9a3f6e8a32068fef9ffb2acc5662 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:13:45 -0800 Subject: mm, page_poison: remove CONFIG_PAGE_POISONING_ZERO CONFIG_PAGE_POISONING_ZERO uses the zero pattern instead of 0xAA. It was introduced by commit 1414c7f4f7d7 ("mm/page_poisoning.c: allow for zero poisoning"), noting that using zeroes retains the benefit of sanitizing the content of freed pages, while saving the work of zeroing them again on alloc, with the downside of making some forms of corruption (stray writes of NULLs) harder to detect than with the 0xAA pattern. Together with CONFIG_PAGE_POISONING_NO_SANITY it made it possible to sanitize the contents on free without checking them back on alloc. These days we have the init_on_free() option to achieve sanitization with zeroes and to save clearing on alloc (and without checking on alloc). Arguably if someone does choose to check the poison for corruption on alloc, the savings of not clearing the page are secondary, and it makes sense to always use the 0xAA poison pattern. Thus, remove the CONFIG_PAGE_POISONING_ZERO option as redundant. Link: https://lkml.kernel.org/r/20201113104033.22907-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: David Hildenbrand Cc: Mike Rapoport Cc: Rafael J. Wysocki Cc: Alexander Potapenko Cc: Kees Cook Cc: Laura Abbott Cc: Mateusz Nosek Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 4 ---- mm/Kconfig.debug | 12 ------------ mm/page_alloc.c | 8 +------- tools/include/linux/poison.h | 6 +----- 4 files changed, 2 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index dc8ae5d8db03..aff1c9250c82 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -27,11 +27,7 @@ #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) /********** mm/page_poison.c **********/ -#ifdef CONFIG_PAGE_POISONING_ZERO -#define PAGE_POISON 0x00 -#else #define PAGE_POISON 0xaa -#endif /********** mm/page_alloc.c ************/ diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 14e29fe5bfa6..1e73717802f8 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -80,18 +80,6 @@ config PAGE_POISONING If unsure, say N -config PAGE_POISONING_ZERO - bool "Use zero for poisoning instead of debugging value" - depends on PAGE_POISONING - help - Instead of using the existing poison value, fill the pages with - zeros. This makes it harder to detect when errors are occurring - due to sanitization but the zeroing at free means that it is - no longer necessary to write zeros when GFP_ZERO is used on - allocation.
- - If unsure, say N - config DEBUG_PAGE_REF bool "Enable tracepoint to track down page reference manipulation" depends on DEBUG_KERNEL diff --git a/mm/page_alloc.c b/mm/page_alloc.c index efcd1baa35e4..918647ff6eef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2228,12 +2228,6 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool free_pages_prezeroed(void) -{ - return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled_static()) || want_init_on_free(); -} - #ifdef CONFIG_DEBUG_VM /* * With DEBUG_VM enabled, order-0 pages are checked for expected state when @@ -2296,7 +2290,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, kernel_unpoison_pages(page, 1 << order); set_page_owner(page, order, gfp_flags); - if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) + if (!want_init_on_free() && want_init_on_alloc(gfp_flags)) kernel_init_free_pages(page, 1 << order); } diff --git a/tools/include/linux/poison.h b/tools/include/linux/poison.h index d29725769107..2e6338ac5eed 100644 --- a/tools/include/linux/poison.h +++ b/tools/include/linux/poison.h @@ -35,12 +35,8 @@ */ #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) -/********** mm/debug-pagealloc.c **********/ -#ifdef CONFIG_PAGE_POISONING_ZERO -#define PAGE_POISON 0x00 -#else +/********** mm/page_poison.c **********/ #define PAGE_POISON 0xaa -#endif /********** mm/page_alloc.c ************/ -- cgit From 88dcb9a3fb48c67ec345f1cdbc2a26119d3cb57d Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 15 Dec 2020 12:33:20 -0800 Subject: mm/thp: move lru_add_page_tail() to huge_memory.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "per memcg lru lock", v21. This patchset includes 3 parts: 1) some code cleanup and minor optimization as preparation 2) use TestClearPageLRU as page isolation's precondition 3) replace the per node lru_lock with a per memcg per node lru_lock Currently there is one lru_lock per node, pgdat->lru_lock, which guards the lru lists, but the lru lists themselves were moved into the memcg long ago. Still using a per node lru_lock is clearly unscalable: pages of all memcgs on a node have to compete with each other for a single lru_lock. This patchset uses a per lruvec/memcg lru_lock to replace the per node lru lock to guard the lru lists, making it scalable for memcgs and gaining performance. Currently lru_lock still guards both the lru list and the page's lru bit; that's ok. But if we want to use a specific lruvec lock for the page, we need to pin down the page's lruvec/memcg during locking. Just taking the lruvec lock first may be undermined by the page's memcg charge/migration. To fix this problem, we move the clearing of the page's lru bit out of the lock and use it as a pin-down action to block memcg changes. That's the reason for the new atomic func TestClearPageLRU. So now isolating a page needs both actions, TestClearPageLRU and holding the lru_lock, as sketched below. The typical user of this is isolate_migratepages_block() in compaction.c: we have to clear the lru bit before taking the lru lock, which serializes page isolation against memcg page charge/migration, since those change the page's lruvec and hence the lru_lock that applies. The above solution was suggested by Johannes Weiner and is based on his new memcg charge path; this patchset is the result. (Hugh Dickins tested and contributed much code from compaction fix to general code polish, thanks a lot!).
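To make the two-step protocol concrete, here is a minimal sketch of the isolation sequence described above; the helpers lock_page_lruvec_irq()/unlock_page_lruvec_irq() are only introduced later in this series, and error handling is trimmed:

	/* sketch only: pin the page, clear the lru bit, then take the lock */
	if (get_page_unless_zero(page)) {		/* 1: page cannot be freed under us */
		if (TestClearPageLRU(page)) {		/* 2: blocks memcg move and other isolation */
			struct lruvec *lruvec;

			lruvec = lock_page_lruvec_irq(page);	/* 3: serialize lru list access */
			del_page_from_lru_list(page, lruvec, page_lru(page));	/* 4 */
			unlock_page_lruvec_irq(lruvec);
		} else {
			put_page(page);	/* lost the race to another isolation path */
		}
	}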
Daniel Jordan's testing shows a 62% improvement on a modified readtwice case on his 2P * 10 core * 2 HT broadwell box on v18, which is not much different from this v20. https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/ Thanks to Hugh Dickins and Konstantin Khlebnikov, who both brought up this idea 8 years ago, and to others who gave comments as well: Daniel Jordan, Mel Gorman, Shakeel Butt, Matthew Wilcox, Alexander Duyck etc. Thanks for testing support from Intel 0day and Rong Chen, Fengguang Wu, and Yun Wang. Hugh Dickins also shared his kbuild-swap case. This patch (of 19): lru_add_page_tail() is only used in huge_memory.c; defining it in another file under a CONFIG_TRANSPARENT_HUGEPAGE guard just looks weird. Let's move it to huge_memory.c. And make it static, as Hugh Dickins suggested. Link: https://lkml.kernel.org/r/1604566549-62481-1-git-send-email-alex.shi@linux.alibaba.com Link: https://lkml.kernel.org/r/1604566549-62481-2-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Reviewed-by: Kirill A. Shutemov Acked-by: Hugh Dickins Acked-by: Johannes Weiner Cc: Matthew Wilcox Cc: Mel Gorman Cc: Tejun Heo Cc: Konstantin Khlebnikov Cc: Daniel Jordan Cc: Shakeel Butt Cc: Joonsoo Kim Cc: Wei Yang Cc: Alexander Duyck Cc: "Chen, Rong A" Cc: Michal Hocko Cc: Vladimir Davydov Cc: Andrea Arcangeli Cc: Andrey Ryabinin Cc: "Huang, Ying" Cc: Jann Horn Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Mika Penttilä Cc: Minchan Kim Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 -- mm/huge_memory.c | 30 ++++++++++++++++++++++++++++++ mm/swap.c | 33 --------------------------------- 3 files changed, 30 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 667935c0dbd4..5e1e967c225f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -338,8 +338,6 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); -extern void lru_add_page_tail(struct page *page, struct page *page_tail, - struct lruvec *lruvec, struct list_head *head); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 57d08156acb1..85b50baa7c7e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2359,6 +2359,36 @@ static void remap_page(struct page *page, unsigned int nr) } } +static void lru_add_page_tail(struct page *page, struct page *page_tail, + struct lruvec *lruvec, struct list_head *list) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_PAGE(PageCompound(page_tail), page); + VM_BUG_ON_PAGE(PageLRU(page_tail), page); + lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); + + if (!list) + SetPageLRU(page_tail); + + if (likely(PageLRU(page))) + list_add_tail(&page_tail->lru, &page->lru); + else if (list) { + /* page reclaim is reclaiming a huge page */ + get_page(page_tail); + list_add_tail(&page_tail->lru, list); + } else { + /* + * Head page has not yet been counted, as an hpage, + * so we must account for each subpage individually. + * + * Put page_tail on the list at the correct position + * so they all end up in order.
+ */ + add_page_to_lru_list_tail(page_tail, lruvec, + page_lru(page_tail)); + } +} + static void __split_huge_page_tail(struct page *head, int tail, struct lruvec *lruvec, struct list_head *list) { diff --git a/mm/swap.c b/mm/swap.c index 16a525296960..6e3121dd6e07 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -977,39 +977,6 @@ void __pagevec_release(struct pagevec *pvec) } EXPORT_SYMBOL(__pagevec_release); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* used by __split_huge_page_refcount() */ -void lru_add_page_tail(struct page *page, struct page *page_tail, - struct lruvec *lruvec, struct list_head *list) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - VM_BUG_ON_PAGE(PageCompound(page_tail), page); - VM_BUG_ON_PAGE(PageLRU(page_tail), page); - lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); - - if (!list) - SetPageLRU(page_tail); - - if (likely(PageLRU(page))) - list_add_tail(&page_tail->lru, &page->lru); - else if (list) { - /* page reclaim is reclaiming a huge page */ - get_page(page_tail); - list_add_tail(&page_tail->lru, list); - } else { - /* - * Head page has not yet been counted, as an hpage, - * so we must account for each subpage individually. - * - * Put page_tail on the list at the correct position - * so they all end up in order. - */ - add_page_to_lru_list_tail(page_tail, lruvec, - page_lru(page_tail)); - } -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, void *arg) { -- cgit From d25b5bd8a8f420b15517c19c4626c0c009f72a63 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 15 Dec 2020 12:34:16 -0800 Subject: mm/lru: introduce TestClearPageLRU() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently lru_lock still guards both the lru list and the page's lru bit; that's ok. But if we want to use a specific lruvec lock for the page, we need to pin down the page's lruvec/memcg during locking. Just taking the lruvec lock first may be undermined by the page's memcg charge/migration. To fix this problem, we clear the lru bit outside of the lock and use it as a pin-down action to block page isolation during memcg changes. The standard steps of page isolation are now as follows: 1, get_page(); #pin the page so it cannot be freed 2, TestClearPageLRU(); #block other isolation like memcg change 3, spin_lock on lru_lock; #serialize lru list access 4, delete page from lru list; This patch starts with the first part: TestClearPageLRU, which combines the PageLRU check and ClearPageLRU into a macro func TestClearPageLRU. This function will be used as the page isolation precondition, to prevent other isolations somewhere else. After this there may be !PageLRU pages on the lru list, so the BUG() checks need to be removed accordingly. There are 2 rules for the lru bit now: 1, the lru bit still indicates whether a page is on an lru list, except that in some temporary moments (isolating) the page may have no lru bit set while it is on the lru list; the page must still be on the lru list whenever the lru bit is set. 2, the lru bit has to be cleared before the page is deleted from the lru list. As Andrew Morton mentioned, this change would dirty the cacheline for a page which isn't on the LRU.
But the loss would be acceptable in Rong Chen report: https://lore.kernel.org/lkml/20200304090301.GB5972@shao2-debian/ Link: https://lkml.kernel.org/r/1604566549-62481-15-git-send-email-alex.shi@linux.alibaba.com Suggested-by: Johannes Weiner Signed-off-by: Alex Shi Acked-by: Hugh Dickins Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Vladimir Davydov Cc: Alexander Duyck Cc: Andrea Arcangeli Cc: Andrey Ryabinin Cc: Daniel Jordan Cc: "Huang, Ying" Cc: Jann Horn Cc: Joonsoo Kim Cc: Kirill A. Shutemov Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mika Penttilä Cc: Minchan Kim Cc: Shakeel Butt Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 1 + mm/mlock.c | 3 +-- mm/vmscan.c | 39 +++++++++++++++++++-------------------- 3 files changed, 21 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c1368af622c7..f8b4375ce1b3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -334,6 +334,7 @@ PAGEFLAG(Referenced, referenced, PF_HEAD) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) + TESTCLEARFLAG(LRU, lru, PF_HEAD) PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) PAGEFLAG(Workingset, workingset, PF_HEAD) diff --git a/mm/mlock.c b/mm/mlock.c index d487aa864e86..7b0e6334be6f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -276,10 +276,9 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * We already have pin from follow_page_mask() * so we can spare the get_page() here. */ - if (PageLRU(page)) { + if (TestClearPageLRU(page)) { struct lruvec *lruvec; - ClearPageLRU(page); lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); del_page_from_lru_list(page, lruvec, diff --git a/mm/vmscan.c b/mm/vmscan.c index 397f89accdb6..54ab9cbb631a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1541,7 +1541,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, */ int __isolate_lru_page(struct page *page, isolate_mode_t mode) { - int ret = -EINVAL; + int ret = -EBUSY; /* Only take pages on the LRU. */ if (!PageLRU(page)) @@ -1551,8 +1551,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) return ret; - ret = -EBUSY; - /* * To minimise LRU disruption, the caller can indicate that it only * wants to isolate pages it will be able to operate on without @@ -1599,8 +1597,10 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * sure the page is not being freed elsewhere -- the * page release code relies on it. 
*/ - ClearPageLRU(page); - ret = 0; + if (TestClearPageLRU(page)) + ret = 0; + else + put_page(page); } return ret; @@ -1666,8 +1666,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - VM_BUG_ON_PAGE(!PageLRU(page), page); - nr_pages = compound_nr(page); total_scan += nr_pages; @@ -1764,21 +1762,18 @@ int isolate_lru_page(struct page *page) VM_BUG_ON_PAGE(!page_count(page), page); WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); - if (PageLRU(page)) { + if (TestClearPageLRU(page)) { pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; - spin_lock_irq(&pgdat->lru_lock); + get_page(page); lruvec = mem_cgroup_page_lruvec(page, pgdat); - if (PageLRU(page)) { - int lru = page_lru(page); - get_page(page); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, lru); - ret = 0; - } + spin_lock_irq(&pgdat->lru_lock); + del_page_from_lru_list(page, lruvec, page_lru(page)); spin_unlock_irq(&pgdat->lru_lock); + ret = 0; } + return ret; } @@ -4289,6 +4284,10 @@ void check_move_unevictable_pages(struct pagevec *pvec) nr_pages = thp_nr_pages(page); pgscanned += nr_pages; + /* block memcg migration during page moving between lru */ + if (!TestClearPageLRU(page)) + continue; + if (pagepgdat != pgdat) { if (pgdat) spin_unlock_irq(&pgdat->lru_lock); @@ -4297,10 +4296,7 @@ void check_move_unevictable_pages(struct pagevec *pvec) } lruvec = mem_cgroup_page_lruvec(page, pgdat); - if (!PageLRU(page) || !PageUnevictable(page)) - continue; - - if (page_evictable(page)) { + if (page_evictable(page) && PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); VM_BUG_ON_PAGE(PageActive(page), page); @@ -4309,12 +4305,15 @@ void check_move_unevictable_pages(struct pagevec *pvec) add_page_to_lru_list(page, lruvec, lru); pgrescued += nr_pages; } + SetPageLRU(page); } if (pgdat) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); spin_unlock_irq(&pgdat->lru_lock); + } else if (pgscanned) { + count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } } EXPORT_SYMBOL_GPL(check_move_unevictable_pages); -- cgit From 9df41314390b81a541ca6e84c8340bad0959e4b5 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 15 Dec 2020 12:34:20 -0800 Subject: mm/compaction: do page isolation first in compaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, compaction takes the lru_lock and then does page isolation, which works fine with pgdat->lru_lock, since any page isolation would compete for that lru_lock. If we want to change to the memcg lru_lock, we have to isolate the page before getting the lru_lock; thus isolation would block the page's memcg change, which relies on page isolation too. Then we can safely use the per memcg lru_lock later. The new page isolation uses the previously introduced TestClearPageLRU() + pgdat lru locking, which will be changed to the memcg lru lock later. Hugh Dickins fixed the following bugs in an early version of this patch: lots of crashes under compaction load: isolate_migratepages_block() must clean up appropriately when rejecting a page, setting PageLRU again if it had been cleared; and a put_page() after get_page_unless_zero() cannot safely be done while holding locked_lruvec - it may turn out to be the final put_page(), which will take an lruvec lock when PageLRU.
And move __isolate_lru_page_prepare back after get_page_unless_zero to make trylock_page() safe: trylock_page() is not safe to use at this time: its setting PG_locked can race with the page being freed or allocated ("Bad page"), and can also erase flags being set by one of those "sole owners" of a freshly allocated page who use non-atomic __SetPageFlag(). Link: https://lkml.kernel.org/r/1604566549-62481-16-git-send-email-alex.shi@linux.alibaba.com Suggested-by: Johannes Weiner Signed-off-by: Alex Shi Acked-by: Hugh Dickins Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Matthew Wilcox Cc: Alexander Duyck Cc: Andrea Arcangeli Cc: Andrey Ryabinin Cc: "Chen, Rong A" Cc: Daniel Jordan Cc: "Huang, Ying" Cc: Jann Horn Cc: Joonsoo Kim Cc: Kirill A. Shutemov Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mika Penttilä Cc: Minchan Kim Cc: Shakeel Butt Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/compaction.c | 42 +++++++++++++++++++++++++++++++++--------- mm/vmscan.c | 43 ++++++++++++++++++++++--------------------- 3 files changed, 56 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 5e1e967c225f..596bc2f4d9b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -356,7 +356,7 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page, extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); -extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); +extern int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, diff --git a/mm/compaction.c b/mm/compaction.c index dbcfdfce1b82..50938e66083c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -890,6 +890,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { if (!cc->ignore_skip_hint && get_pageblock_skip(page)) { low_pfn = end_pfn; + page = NULL; goto isolate_abort; } valid_page = page; @@ -971,6 +972,21 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page)) goto isolate_fail; + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + if (unlikely(!get_page_unless_zero(page))) + goto isolate_fail; + + if (__isolate_lru_page_prepare(page, isolate_mode) != 0) + goto isolate_fail_put; + + /* Try isolate the page */ + if (!TestClearPageLRU(page)) + goto isolate_fail_put; + /* If we already hold the lock, we can skip some rechecking */ if (!locked) { locked = compact_lock_irqsave(&pgdat->lru_lock, @@ -983,10 +999,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_abort; } - /* Recheck PageLRU and PageCompound under lock */ - if (!PageLRU(page)) - goto isolate_fail; - /* * Page become compound since the non-locked check, * and it's on LRU. 
It can only be a THP so the order @@ -994,16 +1006,13 @@ */ if (unlikely(PageCompound(page) && !cc->alloc_contig)) { low_pfn += compound_nr(page) - 1; - goto isolate_fail; + SetPageLRU(page); + goto isolate_fail_put; } } lruvec = mem_cgroup_page_lruvec(page, pgdat); - /* Try isolate the page */ - if (__isolate_lru_page(page, isolate_mode) != 0) - goto isolate_fail; - /* The whole page is taken off the LRU; skip the tail pages. */ if (PageCompound(page)) low_pfn += compound_nr(page) - 1; @@ -1032,6 +1041,15 @@ isolate_success: } continue; + +isolate_fail_put: + /* Avoid potential deadlock in freeing page under lru_lock */ + if (locked) { + spin_unlock_irqrestore(&pgdat->lru_lock, flags); + locked = false; + } + put_page(page); + isolate_fail: if (!skip_on_failure) continue; @@ -1068,9 +1086,15 @@ isolate_fail: if (unlikely(low_pfn > end_pfn)) low_pfn = end_pfn; + page = NULL; + isolate_abort: if (locked) spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (page) { + SetPageLRU(page); + put_page(page); + } /* * Updated the cached scanner pfn once the pageblock has been scanned diff --git a/mm/vmscan.c b/mm/vmscan.c index 54ab9cbb631a..cf99e66e7f33 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1539,7 +1539,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, isolate_mode_t mode) +int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) { int ret = -EBUSY; @@ -1591,22 +1591,9 @@ if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) return ret; - if (likely(get_page_unless_zero(page))) { - /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. - */ - if (TestClearPageLRU(page)) - ret = 0; - else - put_page(page); - } - - return ret; + return 0; } - /* * Update LRU sizes after isolating pages. The LRU size updates must * be complete before mem_cgroup_update_lru_size due to a sanity check. @@ -1686,20 +1673,34 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * only when the page is being freed somewhere else. */ scan += nr_pages; - switch (__isolate_lru_page(page, mode)) { + switch (__isolate_lru_page_prepare(page, mode)) { case 0: + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + if (unlikely(!get_page_unless_zero(page))) + goto busy; + + if (!TestClearPageLRU(page)) { + /* + * This page may be in another isolation path, + * but we still hold lru_lock. + */ + put_page(page); + goto busy; + } + nr_taken += nr_pages; nr_zone_taken[page_zonenum(page)] += nr_pages; list_move(&page->lru, dst); break; - case -EBUSY: + default: +busy: /* else it is being freed elsewhere */ list_move(&page->lru, src); - continue; - - default: - BUG(); } } -- cgit From 6168d0da2b479ce25a4647de194045de1bdd1f1d Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 15 Dec 2020 12:34:29 -0800 Subject: mm/lru: replace pgdat lru_lock with lruvec lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch moves the per node lru_lock into the lruvec, thus bringing an lru_lock for each memcg per node. So on a large machine, each memcg doesn't have to suffer from per node pgdat->lru_lock competition.
They can go fast with their own lru_lock. After moving the memcg charge before lru insertion, page isolation can serialize the page's memcg, so the per memcg lruvec lock is stable and can replace the per node lru lock. In isolate_migratepages_block(), compact_unlock_should_abort and lock_page_lruvec_irqsave are open coded to work with compact_control. Also add a debug function in the locking path which may give some clues if something gets out of hand. Daniel Jordan's testing shows a 62% improvement on a modified readtwice case on his 2P * 10 core * 2 HT broadwell box. https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/ Hugh Dickins helped with the patch polish, thanks! [alex.shi@linux.alibaba.com: fix comment typo] Link: https://lkml.kernel.org/r/5b085715-292a-4b43-50b3-d73dc90d1de5@linux.alibaba.com [alex.shi@linux.alibaba.com: use page_memcg()] Link: https://lkml.kernel.org/r/5a4c2b72-7ee8-2478-fc0e-85eb83aafec4@linux.alibaba.com Link: https://lkml.kernel.org/r/1604566549-62481-18-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Hugh Dickins Acked-by: Johannes Weiner Cc: Rong Chen Cc: Michal Hocko Cc: Vladimir Davydov Cc: Yang Shi Cc: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Daniel Jordan Cc: Alexander Duyck Cc: Andrea Arcangeli Cc: Andrey Ryabinin Cc: "Huang, Ying" Cc: Jann Horn Cc: Joonsoo Kim Cc: Kirill A. Shutemov Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Michal Hocko Cc: Mika Penttilä Cc: Minchan Kim Cc: Shakeel Butt Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 58 +++++++++++++++++++++++ include/linux/mmzone.h | 3 +- mm/compaction.c | 56 ++++++++++++++-------- mm/huge_memory.c | 11 ++--- mm/memcontrol.c | 78 ++++++++++++++++++++++++++++-- mm/mlock.c | 22 ++++++--- mm/mmzone.c | 1 + mm/page_alloc.c | 1 - mm/swap.c | 116 ++++++++++++++++++++++----------------------- mm/vmscan.c | 55 ++++++++++----------- 10 files changed, 275 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f530d634f055..aa5d559853c2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -491,6 +491,19 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_page(struct page *page); +struct lruvec *lock_page_lruvec(struct page *page); +struct lruvec *lock_page_lruvec_irq(struct page *page); +struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flags); + +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page); +#else +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} +#endif + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ?
container_of(css, struct mem_cgroup, css) : NULL; @@ -996,6 +1009,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } +static inline struct lruvec *lock_page_lruvec(struct page *page) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; +} + +static inline struct lruvec *lock_page_lruvec_irq(struct page *page) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irq(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; +} + +static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flagsp) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); + return &pgdat->__lruvec; +} + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -1215,6 +1253,10 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } + +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ @@ -1296,6 +1338,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } +static inline void unlock_page_lruvec(struct lruvec *lruvec) +{ + spin_unlock(&lruvec->lru_lock); +} + +static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) +{ + spin_unlock_irq(&lruvec->lru_lock); +} + +static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, + unsigned long flags) +{ + spin_unlock_irqrestore(&lruvec->lru_lock, flags); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 98a80c01d150..9da23c019dc5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -276,6 +276,8 @@ enum lruvec_flags { struct lruvec { struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ + spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU @@ -782,7 +784,6 @@ typedef struct pglist_data { /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) - spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* diff --git a/mm/compaction.c b/mm/compaction.c index 50938e66083c..e5acb9714436 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -804,7 +804,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long nr_scanned = 0, nr_isolated = 0; struct lruvec *lruvec; unsigned long flags = 0; - bool locked = false; + struct lruvec *locked = NULL; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; bool skip_on_failure = false; @@ -868,11 +868,20 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * contention, to give chance to IRQs. Abort completely if * a fatal signal is pending. 
*/ - if (!(low_pfn % SWAP_CLUSTER_MAX) - && compact_unlock_should_abort(&pgdat->lru_lock, - flags, &locked, cc)) { - low_pfn = 0; - goto fatal_pending; + if (!(low_pfn % SWAP_CLUSTER_MAX)) { + if (locked) { + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; + } + + if (fatal_signal_pending(current)) { + cc->contended = true; + + low_pfn = 0; + goto fatal_pending; + } + + cond_resched(); } if (!pfn_valid_within(low_pfn)) @@ -944,9 +953,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, - flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } if (!isolate_movable_page(page, isolate_mode)) @@ -987,10 +995,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!TestClearPageLRU(page)) goto isolate_fail_put; + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + /* If we already hold the lock, we can skip some rechecking */ - if (!locked) { - locked = compact_lock_irqsave(&pgdat->lru_lock, - &flags, cc); + if (lruvec != locked) { + if (locked) + unlock_page_lruvec_irqrestore(locked, flags); + + compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + locked = lruvec; + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); /* Try get exclusive access under lock */ if (!skip_updated) { @@ -1009,9 +1026,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, SetPageLRU(page); goto isolate_fail_put; } - } - - lruvec = mem_cgroup_page_lruvec(page, pgdat); + } else + rcu_read_unlock(); /* The whole page is taken off the LRU; skip the tail pages. */ if (PageCompound(page)) @@ -1045,8 +1061,8 @@ isolate_success: isolate_fail_put: /* Avoid potential deadlock in freeing page under lru_lock */ if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } put_page(page); @@ -1061,8 +1077,8 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; @@ -1090,7 +1106,7 @@ isolate_fail: isolate_abort: if (locked) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(locked, flags); if (page) { SetPageLRU(page); put_page(page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a59333aa4020..3c4a8fc9102f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2365,7 +2365,7 @@ static void lru_add_page_tail(struct page *head, struct page *tail, VM_BUG_ON_PAGE(!PageHead(head), head); VM_BUG_ON_PAGE(PageCompound(tail), head); VM_BUG_ON_PAGE(PageLRU(tail), head); - lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); + lockdep_assert_held(&lruvec->lru_lock); if (list) { /* page reclaim is reclaiming a huge page */ @@ -2449,7 +2449,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end) { struct page *head = compound_head(page); - pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; @@ -2467,10 +2466,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_lock(&swap_cache->i_pages); } - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock(&pgdat->lru_lock); - - lruvec = 
mem_cgroup_page_lruvec(head, pgdat); + /* lock lru list/PageCompound, ref freezed by page_ref_freeze */ + lruvec = lock_page_lruvec(head); for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); @@ -2491,7 +2488,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); - spin_unlock(&pgdat->lru_lock); + unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, nr); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 827879d8d447..2f7824d0c897 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -20,6 +20,9 @@ * Lockless page tracking & accounting * Unified hierarchy configuration model * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * + * Per memcg lru locking + * Copyright (C) 2020 Alibaba, Inc, Alex Shi */ #include @@ -1330,6 +1333,23 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, return ret; } +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + memcg = page_memcg(page); + + if (!memcg) + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page); + else + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page); +} +#endif + /** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page @@ -1370,6 +1390,60 @@ out: return lruvec; } +/** + * lock_page_lruvec - lock and return lruvec for a given page. + * @page: the page + * + * This series functions should be used in either conditions: + * PageLRU is cleared or unset + * or page->_refcount is zero + * or page is locked. + */ +struct lruvec *lock_page_lruvec(struct page *page) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +struct lruvec *lock_page_lruvec_irq(struct page *page) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irq(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irqsave(&lruvec->lru_lock, *flags); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + /** * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector @@ -3281,10 +3355,8 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* - * Because tail pages are not marked as "used", set it. We're under - * pgdat->lru_lock and migration entries setup in all page mappings. + * Because page_memcg(head) is not set on compound tails, set it now. 
*/ void mem_cgroup_split_huge_fixup(struct page *head) { diff --git a/mm/mlock.c b/mm/mlock.c index 7b0e6334be6f..ab164a675c25 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -262,12 +262,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) int nr = pagevec_count(pvec); int delta_munlocked = -nr; struct pagevec pvec_putback; + struct lruvec *lruvec = NULL; int pgrescued = 0; pagevec_init(&pvec_putback); /* Phase 1: page isolation */ - spin_lock_irq(&zone->zone_pgdat->lru_lock); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -277,10 +277,16 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * so we can spare the get_page() here. */ if (TestClearPageLRU(page)) { - struct lruvec *lruvec; + struct lruvec *new_lruvec; + + new_lruvec = mem_cgroup_page_lruvec(page, + page_pgdat(page)); + if (new_lruvec != lruvec) { + if (lruvec) + unlock_page_lruvec_irq(lruvec); + lruvec = lock_page_lruvec_irq(page); + } - lruvec = mem_cgroup_page_lruvec(page, - page_pgdat(page)); del_page_from_lru_list(page, lruvec, page_lru(page)); continue; @@ -299,8 +305,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; } - __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - spin_unlock_irq(&zone->zone_pgdat->lru_lock); + if (lruvec) { + __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + unlock_page_lruvec_irq(lruvec); + } else if (delta_munlocked) { + mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + } /* Now we can release pins of pages that we are not munlocking */ pagevec_release(&pvec_putback); diff --git a/mm/mmzone.c b/mm/mmzone.c index f337831affc2..eb89d6e018e2 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -77,6 +77,7 @@ void lruvec_init(struct lruvec *lruvec) enum lru_list lru; memset(lruvec, 0, sizeof(struct lruvec)); + spin_lock_init(&lruvec->lru_lock); for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b63294517e04..b1cc2b7483a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6870,7 +6870,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_ext_init(pgdat); - spin_lock_init(&pgdat->lru_lock); lruvec_init(&pgdat->__lruvec); } diff --git a/mm/swap.c b/mm/swap.c index d952af770053..ba9fc21b24ea 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { static void __page_cache_release(struct page *page) { if (PageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(&pgdat->lru_lock, flags); - lruvec = mem_cgroup_page_lruvec(page, pgdat); + lruvec = lock_page_lruvec_irqsave(page, &flags); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); } @@ -207,32 +205,30 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec)) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); - - if (pagepgdat != pgdat) { - if (pgdat) - 
spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); - } + struct lruvec *new_lruvec; /* block memcg migration during page moving between lru */ if (!TestClearPageLRU(page)) continue; - lruvec = mem_cgroup_page_lruvec(page, pgdat); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); + } + (*move_fn)(page, lruvec); SetPageLRU(page); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } @@ -274,9 +270,15 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { do { unsigned long lrusize; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock_irq(&pgdat->lru_lock); + /* + * Hold lruvec->lru_lock is safe here, since + * 1) The pinned lruvec in reclaim, or + * 2) From a pre-LRU page during refault (which also holds the + * rcu lock, so would be safe even if the page was on the LRU + * and could move simultaneously to a new lruvec). + */ + spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += nr_pages; @@ -300,7 +302,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } @@ -364,13 +366,15 @@ static inline void activate_page_drain(int cpu) static void activate_page(struct page *page) { - pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; page = compound_head(page); - spin_lock_irq(&pgdat->lru_lock); - if (PageLRU(page)) - __activate_page(page, mem_cgroup_page_lruvec(page, pgdat)); - spin_unlock_irq(&pgdat->lru_lock); + if (TestClearPageLRU(page)) { + lruvec = lock_page_lruvec_irq(page); + __activate_page(page, lruvec); + unlock_page_lruvec_irq(lruvec); + SetPageLRU(page); + } } #endif @@ -860,8 +864,7 @@ void release_pages(struct page **pages, int nr) { int i; LIST_HEAD(pages_to_free); - struct pglist_data *locked_pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags; unsigned int lock_batch; @@ -871,11 +874,11 @@ void release_pages(struct page **pages, int nr) /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the - * same pgdat. The lock is held only if pgdat != NULL. + * same lruvec. The lock is held only if lruvec != NULL. 
*/ - if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } page = compound_head(page); @@ -883,10 +886,9 @@ void release_pages(struct page **pages, int nr) continue; if (is_zone_device_page(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, - flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } /* * ZONE_DEVICE pages that return 'false' from @@ -907,27 +909,27 @@ void release_pages(struct page **pages, int nr) continue; if (PageCompound(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { - struct pglist_data *pgdat = page_pgdat(page); + struct lruvec *new_lruvec; - if (pgdat != locked_pgdat) { - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, + new_lruvec = mem_cgroup_page_lruvec(page, + page_pgdat(page)); + if (new_lruvec != lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); lock_batch = 0; - locked_pgdat = pgdat; - spin_lock_irqsave(&locked_pgdat->lru_lock, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); } - lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -937,8 +939,8 @@ void release_pages(struct page **pages, int nr) list_add(&page->lru, &pages_to_free); } - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); @@ -1026,26 +1028,24 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) void __pagevec_lru_add(struct pagevec *pvec) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); + struct lruvec *new_lruvec; - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); } - lruvec = mem_cgroup_page_lruvec(page, pgdat); __pagevec_lru_add_fn(page, lruvec); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } diff --git a/mm/vmscan.c b/mm/vmscan.c index cf99e66e7f33..b27b5dba3fdd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1764,14 +1764,12 @@ int isolate_lru_page(struct page *page) WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (TestClearPageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; get_page(page); - lruvec = mem_cgroup_page_lruvec(page, pgdat); - spin_lock_irq(&pgdat->lru_lock); + lruvec = lock_page_lruvec_irq(page); del_page_from_lru_list(page, lruvec, 
page_lru(page)); - spin_unlock_irq(&pgdat->lru_lock); + unlock_page_lruvec_irq(lruvec); ret = 0; } @@ -1838,7 +1836,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, struct list_head *list) { - struct pglist_data *pgdat = lruvec_pgdat(lruvec); int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); struct page *page; @@ -1849,9 +1846,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); putback_lru_page(page); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); continue; } @@ -1873,9 +1870,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, __ClearPageActive(page); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); destroy_compound_page(page); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); } else list_add(&page->lru, &pages_to_free); @@ -1952,7 +1949,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, lru_add_drain(); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, lru); @@ -1964,14 +1961,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); if (nr_taken == 0) return 0; nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); @@ -1980,7 +1977,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout); mem_cgroup_uncharge_list(&page_list); @@ -2033,7 +2030,7 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); @@ -2044,7 +2041,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); @@ -2090,7 +2087,7 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move pages back to the lru list. 
*/ - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2101,7 +2098,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); @@ -2689,10 +2686,10 @@ again: /* * Determine the scan balance between anon and file LRUs. */ - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&target_lruvec->lru_lock); sc->anon_cost = target_lruvec->anon_cost; sc->file_cost = target_lruvec->file_cost; - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&target_lruvec->lru_lock); /* * Target desirable inactive:active list ratios for the anon @@ -4268,16 +4265,15 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) */ void check_move_unevictable_pages(struct pagevec *pvec) { - struct lruvec *lruvec; - struct pglist_data *pgdat = NULL; + struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); int nr_pages; + struct lruvec *new_lruvec; if (PageTransTail(page)) continue; @@ -4289,13 +4285,12 @@ void check_move_unevictable_pages(struct pagevec *pvec) if (!TestClearPageLRU(page)) continue; - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irq(&pgdat->lru_lock); - pgdat = pagepgdat; - spin_lock_irq(&pgdat->lru_lock); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irq(lruvec); + lruvec = lock_page_lruvec_irq(page); } - lruvec = mem_cgroup_page_lruvec(page, pgdat); if (page_evictable(page) && PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); @@ -4309,10 +4304,10 @@ void check_move_unevictable_pages(struct pagevec *pvec) SetPageLRU(page); } - if (pgdat) { + if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - spin_unlock_irq(&pgdat->lru_lock); + unlock_page_lruvec_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } -- cgit From 2a5e4e340b0fe0f8d402196a466887db6a270b9b Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 15 Dec 2020 12:34:33 -0800 Subject: mm/lru: introduce relock_page_lruvec() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add relock_page_lruvec() to replace the repeated open-coded relock sequence; no functional change. When testing for relock we can avoid the need for RCU locking if we simply compare the page pgdat and memcg pointers versus those that the lruvec is holding. By doing this we can avoid the extra pointer walks and accesses of the memory cgroup. In addition we can avoid the checks entirely if lruvec is currently NULL.
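The resulting caller pattern is a simple loop. Roughly, for a hypothetical pagevec walker (simplified from the conversions below):

	struct lruvec *lruvec = NULL;
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		/* cycles the lock only when the page belongs to a different lruvec */
		lruvec = relock_page_lruvec_irq(page, lruvec);
		/* ... operate on page under lruvec->lru_lock ... */
	}
	if (lruvec)
		unlock_page_lruvec_irq(lruvec);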
[alex.shi@linux.alibaba.com: use page_memcg()] Link: https://lkml.kernel.org/r/66d8e79d-7ec6-bfbc-1c82-bf32db3ae5b7@linux.alibaba.com Link: https://lkml.kernel.org/r/1604566549-62481-19-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alexander Duyck Signed-off-by: Alex Shi Acked-by: Hugh Dickins Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Thomas Gleixner Cc: Andrey Ryabinin Cc: Matthew Wilcox Cc: Mel Gorman Cc: Konstantin Khlebnikov Cc: Tejun Heo Cc: Andrea Arcangeli Cc: "Chen, Rong A" Cc: Daniel Jordan Cc: "Huang, Ying" Cc: Jann Horn Cc: Joonsoo Kim Cc: Kirill A. Shutemov Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Michal Hocko Cc: Mika Penttilä Cc: Minchan Kim Cc: Shakeel Butt Cc: Vladimir Davydov Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 52 ++++++++++++++++++++++++++++++++++++++++++++++ mm/mlock.c | 11 +--------- mm/swap.c | 33 +++++++---------------------- mm/vmscan.c | 12 ++--------- 4 files changed, 62 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index aa5d559853c2..ff02f831e7e1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -485,6 +485,22 @@ out: struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); +static inline bool lruvec_holds_page_lru_lock(struct page *page, + struct lruvec *lruvec) +{ + pg_data_t *pgdat = page_pgdat(page); + const struct mem_cgroup *memcg; + struct mem_cgroup_per_node *mz; + + if (mem_cgroup_disabled()) + return lruvec == &pgdat->__lruvec; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = page_memcg(page) ? : root_mem_cgroup; + + return lruvec->pgdat == pgdat && mz->memcg == memcg; +} + struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); @@ -984,6 +1000,14 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, return &pgdat->__lruvec; } +static inline bool lruvec_holds_page_lru_lock(struct page *page, + struct lruvec *lruvec) +{ + pg_data_t *pgdat = page_pgdat(page); + + return lruvec == &pgdat->__lruvec; +} + static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; @@ -1354,6 +1378,34 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, spin_unlock_irqrestore(&lruvec->lru_lock, flags); } +/* Don't lock again iff page's lruvec locked */ +static inline struct lruvec *relock_page_lruvec_irq(struct page *page, + struct lruvec *locked_lruvec) +{ + if (locked_lruvec) { + if (lruvec_holds_page_lru_lock(page, locked_lruvec)) + return locked_lruvec; + + unlock_page_lruvec_irq(locked_lruvec); + } + + return lock_page_lruvec_irq(page); +} + +/* Don't lock again iff page's lruvec locked */ +static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page, + struct lruvec *locked_lruvec, unsigned long *flags) +{ + if (locked_lruvec) { + if (lruvec_holds_page_lru_lock(page, locked_lruvec)) + return locked_lruvec; + + unlock_page_lruvec_irqrestore(locked_lruvec, *flags); + } + + return lock_page_lruvec_irqsave(page, flags); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); diff --git a/mm/mlock.c b/mm/mlock.c index ab164a675c25..55b3b3672977 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -277,16 +277,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * so we can spare the get_page() 
here. */ if (TestClearPageLRU(page)) { - struct lruvec *new_lruvec; - - new_lruvec = mem_cgroup_page_lruvec(page, - page_pgdat(page)); - if (new_lruvec != lruvec) { - if (lruvec) - unlock_page_lruvec_irq(lruvec); - lruvec = lock_page_lruvec_irq(page); - } - + lruvec = relock_page_lruvec_irq(page, lruvec); del_page_from_lru_list(page, lruvec, page_lru(page)); continue; diff --git a/mm/swap.c b/mm/swap.c index ba9fc21b24ea..2cca7141470c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -210,19 +210,12 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct lruvec *new_lruvec; /* block memcg migration during page moving between lru */ if (!TestClearPageLRU(page)) continue; - new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); - if (lruvec != new_lruvec) { - if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); - lruvec = lock_page_lruvec_irqsave(page, &flags); - } - + lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); (*move_fn)(page, lruvec); SetPageLRU(page); @@ -918,17 +911,12 @@ void release_pages(struct page **pages, int nr) } if (PageLRU(page)) { - struct lruvec *new_lruvec; - - new_lruvec = mem_cgroup_page_lruvec(page, - page_pgdat(page)); - if (new_lruvec != lruvec) { - if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, - flags); + struct lruvec *prev_lruvec = lruvec; + + lruvec = relock_page_lruvec_irqsave(page, lruvec, + &flags); + if (prev_lruvec != lruvec) lock_batch = 0; - lruvec = lock_page_lruvec_irqsave(page, &flags); - } VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); @@ -1033,15 +1021,8 @@ void __pagevec_lru_add(struct pagevec *pvec) for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct lruvec *new_lruvec; - - new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); - if (lruvec != new_lruvec) { - if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); - lruvec = lock_page_lruvec_irqsave(page, &flags); - } + lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); __pagevec_lru_add_fn(page, lruvec); } if (lruvec) diff --git a/mm/vmscan.c b/mm/vmscan.c index b27b5dba3fdd..60705ea598ee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1883,8 +1883,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, * All pages were isolated from the same lruvec (and isolation * inhibits memcg migration). 
*/ - VM_BUG_ON_PAGE(mem_cgroup_page_lruvec(page, page_pgdat(page)) != lruvec, page); + VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page); lru = page_lru(page); nr_pages = thp_nr_pages(page); @@ -4273,7 +4272,6 @@ void check_move_unevictable_pages(struct pagevec *pvec) for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; int nr_pages; - struct lruvec *new_lruvec; if (PageTransTail(page)) continue; @@ -4285,13 +4283,7 @@ void check_move_unevictable_pages(struct pagevec *pvec) if (!TestClearPageLRU(page)) continue; - new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); - if (lruvec != new_lruvec) { - if (lruvec) - unlock_page_lruvec_irq(lruvec); - lruvec = lock_page_lruvec_irq(page); - } - + lruvec = relock_page_lruvec_irq(page, lruvec); if (page_evictable(page) && PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); -- cgit From 15b447361794271f4d03c04d82276a841fe06328 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 15 Dec 2020 14:21:31 -0800 Subject: mm/lru: revise the comments of lru_lock Since we changed pgdat->lru_lock to lruvec->lru_lock, it's time to fix the now-incorrect comments in the code. Also fix some ancient zone->lru_lock comment errors. I struggled to understand the comment above move_pages_to_lru() (surely it never calls page_referenced()), and eventually realized that most of it had got separated from shrink_active_list(): move that comment back. Link: https://lkml.kernel.org/r/1604566549-62481-20-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Hugh Dickins Signed-off-by: Alex Shi Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Tejun Heo Cc: Andrey Ryabinin Cc: Jann Horn Cc: Mel Gorman Cc: Matthew Wilcox Cc: Alexander Duyck Cc: Andrea Arcangeli Cc: "Chen, Rong A" Cc: Daniel Jordan Cc: "Huang, Ying" Cc: Joonsoo Kim Cc: Kirill A. Shutemov Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: Michal Hocko Cc: Mika Penttilä Cc: Minchan Kim Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v1/memcg_test.rst | 15 ++------ Documentation/admin-guide/cgroup-v1/memory.rst | 21 +++++------ Documentation/trace/events-kmem.rst | 2 +- Documentation/vm/unevictable-lru.rst | 22 +++++------- include/linux/mm_types.h | 2 +- include/linux/mmzone.h | 3 +- mm/filemap.c | 4 +-- mm/rmap.c | 4 +-- mm/vmscan.c | 41 ++++++++++++---------- 9 files changed, 50 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst index 4f83de2dab6e..45b94f7b3beb 100644 --- a/Documentation/admin-guide/cgroup-v1/memcg_test.rst +++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst @@ -133,18 +133,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. 8. LRU ====== - Each memcg has its own private LRU. Now, its handling is under global - VM's control (means that it's handled under global pgdat->lru_lock). - Almost all routines around memcg's LRU is called by global LRU's - list management functions under pgdat->lru_lock. - - A special function is mem_cgroup_isolate_pages(). This scans - memcg's private LRU and call __isolate_lru_page() to extract a page - from LRU. - - (By __isolate_lru_page(), the page is removed from both of global and - private LRU.)
- + Each memcg has its own vector of LRUs (inactive anon, active anon, + inactive file, active file, unevictable) of pages from each node, + each LRU handled under a single lru_lock for that memcg and node. 9. Typical Tests. ================= diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index a44cd467d218..52688ae34461 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -287,20 +287,17 @@ When oom event notifier is registered, event will be delivered. 2.6 Locking ----------- - lock_page_cgroup()/unlock_page_cgroup() should not be called under - the i_pages lock. +Lock order is as follows: - Other lock order is following: + Page lock (PG_locked bit of page->flags) + mm->page_table_lock or split pte_lock + lock_page_memcg (memcg->move_lock) + mapping->i_pages lock + lruvec->lru_lock. - PG_locked. - mm->page_table_lock - pgdat->lru_lock - lock_page_cgroup. - - In many cases, just lock_page_cgroup() is called. - - per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by - pgdat->lru_lock, it has no lock of its own. +Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by +lruvec->lru_lock; PG_lru bit of page->flags is cleared before +isolating a page from its LRU under lruvec->lru_lock. 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) ----------------------------------------------- diff --git a/Documentation/trace/events-kmem.rst b/Documentation/trace/events-kmem.rst index 555484110e36..68fa75247488 100644 --- a/Documentation/trace/events-kmem.rst +++ b/Documentation/trace/events-kmem.rst @@ -69,7 +69,7 @@ When pages are freed in batch, the also mm_page_free_batched is triggered. Broadly speaking, pages are taken off the LRU lock in bulk and freed in batch with a page list. Significant amounts of activity here could indicate that the system is under memory pressure and can also indicate -contention on the zone->lru_lock. +contention on the lruvec->lru_lock. 4. Per-CPU Allocator Activity ============================= diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst index 17d0861b0f1d..0e1490524f53 100644 --- a/Documentation/vm/unevictable-lru.rst +++ b/Documentation/vm/unevictable-lru.rst @@ -33,7 +33,7 @@ reclaim in Linux. The problems have been observed at customer sites on large memory x86_64 systems. To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of -main memory will have over 32 million 4k pages in a single zone. When a large +main memory will have over 32 million 4k pages in a single node. When a large fraction of these pages are not evictable for any reason [see below], vmscan will spend a lot of time scanning the LRU lists looking for the small fraction of pages that are evictable. This can result in a situation where all CPUs are @@ -55,7 +55,7 @@ unevictable, either by definition or by circumstance, in the future. The Unevictable Page List ------------------------- -The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list +The Unevictable LRU infrastructure consists of an additional, per-node, LRU list called the "unevictable" list and an associated page flag, PG_unevictable, to indicate that the page is being managed on the unevictable list. @@ -84,15 +84,9 @@ The unevictable list does not differentiate between file-backed and anonymous, swap-backed pages. This differentiation is only important while the pages are, in fact, evictable. 
-The unevictable list benefits from the "arrayification" of the per-zone LRU +The unevictable list benefits from the "arrayification" of the per-node LRU lists and statistics originally proposed and posted by Christoph Lameter. -The unevictable list does not use the LRU pagevec mechanism. Rather, -unevictable pages are placed directly on the page's zone's unevictable list -under the zone lru_lock. This allows us to prevent the stranding of pages on -the unevictable list when one task has the page isolated from the LRU and other -tasks are changing the "evictability" state of the page. - Memory Control Group Interaction -------------------------------- @@ -101,8 +95,8 @@ The unevictable LRU facility interacts with the memory control group [aka memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by extending the lru_list enum. -The memory controller data structure automatically gets a per-zone unevictable -list as a result of the "arrayification" of the per-zone LRU lists (one per +The memory controller data structure automatically gets a per-node unevictable +list as a result of the "arrayification" of the per-node LRU lists (one per lru_list enum element). The memory controller tracks the movement of pages to and from the unevictable list. @@ -196,7 +190,7 @@ for the sake of expediency, to leave a unevictable page on one of the regular active/inactive LRU lists for vmscan to deal with. vmscan checks for such pages in all of the shrink_{active|inactive|page}_list() functions and will "cull" such pages that it encounters: that is, it diverts those pages to the -unevictable list for the zone being scanned. +unevictable list for the node being scanned. There may be situations where a page is mapped into a VM_LOCKED VMA, but the page is not marked as PG_mlocked. Such pages will make it all the way to @@ -328,7 +322,7 @@ If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the page from the LRU, as it is likely on the appropriate active or inactive list at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put back the page - by calling putback_lru_page() - which will notice that the page -is now mlocked and divert the page to the zone's unevictable list. If +is now mlocked and divert the page to the node's unevictable list. If mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle it later if and when it attempts to reclaim the page. @@ -603,7 +597,7 @@ Some examples of these unevictable pages on the LRU lists are: unevictable list in mlock_vma_page(). shrink_inactive_list() also diverts any unevictable pages that it finds on the -inactive lists to the appropriate zone's unevictable list. +inactive lists to the appropriate node's unevictable list. shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd after shrink_active_list() had moved them to the inactive list, or pages mapped diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 915f4f100383..a9688cc55964 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -79,7 +79,7 @@ struct page { struct { /* Page cache and anonymous pages */ /** * @lru: Pageout list, eg. active_list protected by - * pgdat->lru_lock. Sometimes used as a generic list + * lruvec->lru_lock. Sometimes used as a generic list * by the page owner. 
*/ struct list_head lru; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9da23c019dc5..b593316bff3d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -113,8 +113,7 @@ static inline bool free_area_empty(struct free_area *area, int migratetype) struct pglist_data; /* - * zone->lock and the zone lru_lock are two of the hottest locks in the kernel. - * So add a wild amount of padding here to ensure that they fall into separate + * Add a wild amount of padding here to ensure datas fall into separate * cachelines. There are very few zone structures in the machine, so space * consumption is not a concern here. */ diff --git a/mm/filemap.c b/mm/filemap.c index 39bb88140680..c178022d7893 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -102,8 +102,8 @@ * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) - * ->pgdat->lru_lock (follow_page->mark_page_accessed) - * ->pgdat->lru_lock (check_pte_range->isolate_lru_page) + * ->lruvec->lru_lock (follow_page->mark_page_accessed) + * ->lruvec->lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) diff --git a/mm/rmap.c b/mm/rmap.c index ab16c96efdfb..08c56aaf72eb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -28,12 +28,12 @@ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * anon_vma->rwsem * mm->page_table_lock or pte_lock - * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) - * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) + * lock_page_memcg move_lock (in __set_page_dirty_buffers) * i_pages lock (widely used) + * lruvec->lru_lock (in lock_page_lruvec_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) diff --git a/mm/vmscan.c b/mm/vmscan.c index 60705ea598ee..257cba79a96d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1613,14 +1613,16 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } /** - * pgdat->lru_lock is heavily contended. Some of the functions that + * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. + * + * lruvec->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * * For pagecache intensive workloads, this function is the hottest * spot in the kernel (apart from copy_*_user functions). * - * Appropriate locks must be held before calling this function. + * Lru_lock must be held before calling this function. * * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. @@ -1814,25 +1816,11 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, } /* - * This moves pages from @list to corresponding LRU list. - * - * We move them the other way if the page is referenced by one or more - * processes, from rmap. - * - * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold zone_lru_lock across the whole operation. But if - * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop zone_lru_lock around each page. 
It's impossible to balance - * this, so instead we remove the pages from the LRU while processing them. - * It is safe to rely on PG_active against the non-LRU pages in here because - * nobody will play with that bit on a non-LRU page. - * - * The downside is that we have to touch page->_refcount against each page. - * But we had to alter page->flags anyway. + * move_pages_to_lru() moves pages from private @list to appropriate LRU list. + * On return, @list is reused as a list of pages to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ - static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, struct list_head *list) { @@ -2010,6 +1998,23 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return nr_reclaimed; } +/* + * shrink_active_list() moves pages from the active LRU to the inactive LRU. + * + * We move them the other way if the page is referenced by one or more + * processes. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()), so + * we should drop lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->_refcount against each page. + * But we had to alter page->flags anyway. + */ static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, -- cgit From c6c75deda81344c3a95d1d1f606d5cee109e5d54 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 15 Dec 2020 20:42:39 -0800 Subject: proc: fix lookup in /proc/net subdirectories after setns(2) Commit 1fde6f21d90f ("proc: fix /proc/net/* after setns(2)") only forced revalidation of regular files under /proc/net/. However, /proc/net/ is unusual in that the /proc/net/foo handlers take the netns pointer from the parent directory, which is the old netns. Steps to reproduce:

	(void)open("/proc/net/sctp/snmp", O_RDONLY);
	unshare(CLONE_NEWNET);
	int fd = open("/proc/net/sctp/snmp", O_RDONLY);
	read(fd, &c, 1);

The read will return wrong data, from the original netns. The patch forces a lookup on every directory under /proc/net. Link: https://lkml.kernel.org/r/20201205160916.GA109739@localhost.localdomain Fixes: 1da4d377f943 ("proc: revalidate misc dentries") Signed-off-by: Alexey Dobriyan Reported-by: "Rantala, Tommi T. (Nokia - FI/Espoo)" Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 24 ++++++++++++++++++++++-- fs/proc/internal.h | 7 +++++++ fs/proc/proc_net.c | 16 ---------------- include/linux/proc_fs.h | 8 +++++++- 4 files changed, 36 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index b84663252add..6c0a05f55d6b 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -349,6 +349,16 @@ static const struct file_operations proc_dir_operations = { .iterate_shared = proc_readdir, }; +static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 0; +} + +const struct dentry_operations proc_net_dentry_ops = { + .d_revalidate = proc_net_d_revalidate, + .d_delete = always_delete_dentry, +}; + /* * proc directories can do almost nothing..
*/ @@ -471,8 +481,8 @@ struct proc_dir_entry *proc_symlink(const char *name, } EXPORT_SYMBOL(proc_symlink); -struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, - struct proc_dir_entry *parent, void *data) +struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data, bool force_lookup) { struct proc_dir_entry *ent; @@ -484,10 +494,20 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->data = data; ent->proc_dir_ops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; + if (force_lookup) { + pde_force_lookup(ent); + } ent = proc_register(parent, ent); } return ent; } +EXPORT_SYMBOL_GPL(_proc_mkdir); + +struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data) +{ + return _proc_mkdir(name, mode, parent, data, false); +} EXPORT_SYMBOL_GPL(proc_mkdir_data); struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 917cc85e3466..afbe96b6bf77 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -310,3 +310,10 @@ extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); + +extern const struct dentry_operations proc_net_dentry_ops; +static inline void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 45c7318a2ce5..18601042af99 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -39,22 +39,6 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } -static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - return 0; -} - -static const struct dentry_operations proc_net_dentry_ops = { - .d_revalidate = proc_net_d_revalidate, - .d_delete = always_delete_dentry, -}; - -static void pde_force_lookup(struct proc_dir_entry *pde) -{ - /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ - pde->proc_dops = &proc_net_dentry_ops; -} - static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 270cab43ca3d..000cc0533c33 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -80,6 +80,7 @@ extern void proc_flush_pid(struct pid *); extern struct proc_dir_entry *proc_symlink(const char *, struct proc_dir_entry *, const char *); +struct proc_dir_entry *_proc_mkdir(const char *, umode_t, struct proc_dir_entry *, void *, bool); extern struct proc_dir_entry *proc_mkdir(const char *, struct proc_dir_entry *); extern struct proc_dir_entry *proc_mkdir_data(const char *, umode_t, struct proc_dir_entry *, void *); @@ -162,6 +163,11 @@ static inline struct proc_dir_entry *proc_symlink(const char *name, static inline struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) {return NULL;} static inline struct proc_dir_entry *proc_create_mount_point(const char *name) { return NULL; } +static inline struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data, bool force_lookup) +{ + return NULL; +} static inline struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t 
mode, struct proc_dir_entry *parent, void *data) { return NULL; } static inline struct proc_dir_entry *proc_mkdir_mode(const char *name, @@ -199,7 +205,7 @@ struct net; static inline struct proc_dir_entry *proc_net_mkdir( struct net *net, const char *name, struct proc_dir_entry *parent) { - return proc_mkdir_data(name, 0, parent, net); + return _proc_mkdir(name, 0, parent, net, true); } struct ns_common; -- cgit From aa6159ab99a9ab5df835b4750b66cf132a5aa292 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 15 Dec 2020 20:42:48 -0800 Subject: kernel.h: split out mathematical helpers kernel.h has been used as a dumping ground for all kinds of stuff for a long time. Here is an attempt to start cleaning it up by splitting out the mathematical helpers. At the same time, convert users in the header and lib folders to use the new header. Though for the time being, include the new header back into kernel.h to avoid twisted indirect includes for existing users. [sfr@canb.auug.org.au: fix powerpc build] Link: https://lkml.kernel.org/r/20201029150809.13059608@canb.auug.org.au Link: https://lkml.kernel.org/r/20201028173212.41768-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: "Paul E. McKenney" Cc: Trond Myklebust Cc: Jeff Layton Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/callback_proc.c | 5 ++ include/linux/bitops.h | 11 ++- include/linux/dcache.h | 1 + include/linux/iommu-helper.h | 4 +- include/linux/kernel.h | 173 +---------------------------------------- include/linux/math.h | 177 ++++++++++++++++++++++++++++++++++++++++++ include/linux/rcu_node_tree.h | 2 + include/linux/units.h | 2 +- lib/errname.c | 1 + lib/errseq.c | 1 + lib/find_bit.c | 3 +- lib/math/div64.c | 4 +- lib/math/int_pow.c | 2 +- lib/math/int_sqrt.c | 3 +- lib/math/reciprocal_div.c | 9 ++- 15 files changed, 215 insertions(+), 183 deletions(-) create mode 100644 include/linux/math.h (limited to 'include/linux') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index e61dbc9b86ae..f7786e00a6a7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -6,10 +6,15 @@ * * NFSv4 callback procedures */ + +#include +#include #include #include #include #include +#include + #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 5b74bdf159d6..a61f192c096b 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -1,9 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BITOPS_H #define _LINUX_BITOPS_H + #include #include +#include + /* Set bits in the first 'n' bytes when loaded from memory */ #ifdef __LITTLE_ENDIAN # define aligned_byte_mask(n) ((1UL << 8*(n))-1) @@ -12,10 +15,10 @@ #endif #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) -#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) -#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) -#define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) -#define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) +#define BITS_TO_LONGS(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) +#define BITS_TO_U64(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) +#define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) +#define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6f95c3300cbb..d7b369fc15d3
100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -4,6 +4,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/iommu-helper.h b/include/linux/iommu-helper.h index 70d01edcbf8b..74be34f3a20a 100644 --- a/include/linux/iommu-helper.h +++ b/include/linux/iommu-helper.h @@ -3,7 +3,9 @@ #define _LINUX_IOMMU_HELPER_H #include -#include +#include +#include +#include static inline unsigned long iommu_device_max_index(unsigned long size, unsigned long offset, diff --git a/include/linux/kernel.h b/include/linux/kernel.h index dbf6018fc312..f7902d8c1048 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -2,7 +2,6 @@ #ifndef _LINUX_KERNEL_H #define _LINUX_KERNEL_H - #include #include #include @@ -11,12 +10,14 @@ #include #include #include +#include #include #include #include #include + #include -#include + #include #define STACK_MAGIC 0xdeadbeef @@ -54,125 +55,11 @@ } \ ) -/* - * This looks more complex than it should be. But we need to - * get the type for the ~ right in round_down (it needs to be - * as wide as the result!), and we want to evaluate the macro - * arguments just once each. - */ -#define __round_mask(x, y) ((__typeof__(x))((y)-1)) -/** - * round_up - round up to next specified power of 2 - * @x: the value to round - * @y: multiple to round up to (must be a power of 2) - * - * Rounds @x up to next multiple of @y (which must be a power of 2). - * To perform arbitrary rounding up, use roundup() below. - */ -#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) -/** - * round_down - round down to next specified power of 2 - * @x: the value to round - * @y: multiple to round down to (must be a power of 2) - * - * Rounds @x down to next multiple of @y (which must be a power of 2). - * To perform arbitrary rounding down, use rounddown() below. - */ -#define round_down(x, y) ((x) & ~__round_mask(x, y)) - #define typeof_member(T, m) typeof(((T*)0)->m) -#define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP - -#define DIV_ROUND_DOWN_ULL(ll, d) \ - ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) - -#define DIV_ROUND_UP_ULL(ll, d) \ - DIV_ROUND_DOWN_ULL((unsigned long long)(ll) + (d) - 1, (d)) - -#if BITS_PER_LONG == 32 -# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP_ULL(ll, d) -#else -# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP(ll,d) -#endif - -/** - * roundup - round up to the next specified multiple - * @x: the value to up - * @y: multiple to round up to - * - * Rounds @x up to next multiple of @y. If @y will always be a power - * of 2, consider using the faster round_up(). - */ -#define roundup(x, y) ( \ -{ \ - typeof(y) __y = y; \ - (((x) + (__y - 1)) / __y) * __y; \ -} \ -) -/** - * rounddown - round down to next specified multiple - * @x: the value to round - * @y: multiple to round down to - * - * Rounds @x down to next multiple of @y. If @y will always be a power - * of 2, consider using the faster round_down(). - */ -#define rounddown(x, y) ( \ -{ \ - typeof(x) __x = (x); \ - __x - (__x % (y)); \ -} \ -) - -/* - * Divide positive or negative dividend by positive or negative divisor - * and round to closest integer. Result is undefined for negative - * divisors if the dividend variable type is unsigned and for negative - * dividends if the divisor variable type is unsigned. - */ -#define DIV_ROUND_CLOSEST(x, divisor)( \ -{ \ - typeof(x) __x = x; \ - typeof(divisor) __d = divisor; \ - (((typeof(x))-1) > 0 || \ - ((typeof(divisor))-1) > 0 || \ - (((__x) > 0) == ((__d) > 0))) ? 
\ - (((__x) + ((__d) / 2)) / (__d)) : \ - (((__x) - ((__d) / 2)) / (__d)); \ -} \ -) -/* - * Same as above but for u64 dividends. divisor must be a 32-bit - * number. - */ -#define DIV_ROUND_CLOSEST_ULL(x, divisor)( \ -{ \ - typeof(divisor) __d = divisor; \ - unsigned long long _tmp = (x) + (__d) / 2; \ - do_div(_tmp, __d); \ - _tmp; \ -} \ -) - -/* - * Multiplies an integer by a fraction, while avoiding unnecessary - * overflow or loss of precision. - */ -#define mult_frac(x, numer, denom)( \ -{ \ - typeof(x) quot = (x) / (denom); \ - typeof(x) rem = (x) % (denom); \ - (quot * (numer)) + ((rem * (numer)) / (denom)); \ -} \ -) - - #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) -#define sector_div(a, b) do_div(a, b) - /** * upper_32_bits - return bits 32-63 of a number * @n: the number we're accessing @@ -272,48 +159,6 @@ extern void __cant_migrate(const char *file, int line); #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) -/** - * abs - return absolute value of an argument - * @x: the value. If it is unsigned type, it is converted to signed type first. - * char is treated as if it was signed (regardless of whether it really is) - * but the macro's return type is preserved as char. - * - * Return: an absolute value of x. - */ -#define abs(x) __abs_choose_expr(x, long long, \ - __abs_choose_expr(x, long, \ - __abs_choose_expr(x, int, \ - __abs_choose_expr(x, short, \ - __abs_choose_expr(x, char, \ - __builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(x), char), \ - (char)({ signed char __x = (x); __x<0?-__x:__x; }), \ - ((void)0))))))) - -#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(x), signed type) || \ - __builtin_types_compatible_p(typeof(x), unsigned type), \ - ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other) - -/** - * reciprocal_scale - "scale" a value into range [0, ep_ro) - * @val: value - * @ep_ro: right open interval endpoint - * - * Perform a "reciprocal multiplication" in order to "scale" a value into - * range [0, @ep_ro), where the upper interval endpoint is right-open. - * This is useful, e.g. for accessing a index of an array containing - * @ep_ro elements, for example. Think of it as sort of modulus, only that - * the result isn't that of modulo. ;) Note that if initial input is a - * small value, then result will return 0. - * - * Return: a result based on @val in interval [0, @ep_ro). 
- */ -static inline u32 reciprocal_scale(u32 val, u32 ep_ro) -{ - return (u32)(((u64) val * ep_ro) >> 32); -} - #if defined(CONFIG_MMU) && \ (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)) #define might_fault() __might_fault(__FILE__, __LINE__) @@ -515,18 +360,6 @@ extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); -u64 int_pow(u64 base, unsigned int exp); -unsigned long int_sqrt(unsigned long); - -#if BITS_PER_LONG < 64 -u32 int_sqrt64(u64 x); -#else -static inline u32 int_sqrt64(u64 x) -{ - return (u32)int_sqrt(x); -} -#endif - #ifdef CONFIG_SMP extern unsigned int sysctl_oops_all_cpu_backtrace; #else diff --git a/include/linux/math.h b/include/linux/math.h new file mode 100644 index 000000000000..53674a327e39 --- /dev/null +++ b/include/linux/math.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MATH_H +#define _LINUX_MATH_H + +#include +#include + +/* + * This looks more complex than it should be. But we need to + * get the type for the ~ right in round_down (it needs to be + * as wide as the result!), and we want to evaluate the macro + * arguments just once each. + */ +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) + +/** + * round_up - round up to next specified power of 2 + * @x: the value to round + * @y: multiple to round up to (must be a power of 2) + * + * Rounds @x up to next multiple of @y (which must be a power of 2). + * To perform arbitrary rounding up, use roundup() below. + */ +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) + +/** + * round_down - round down to next specified power of 2 + * @x: the value to round + * @y: multiple to round down to (must be a power of 2) + * + * Rounds @x down to next multiple of @y (which must be a power of 2). + * To perform arbitrary rounding down, use rounddown() below. + */ +#define round_down(x, y) ((x) & ~__round_mask(x, y)) + +#define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP + +#define DIV_ROUND_DOWN_ULL(ll, d) \ + ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) + +#define DIV_ROUND_UP_ULL(ll, d) \ + DIV_ROUND_DOWN_ULL((unsigned long long)(ll) + (d) - 1, (d)) + +#if BITS_PER_LONG == 32 +# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP_ULL(ll, d) +#else +# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP(ll,d) +#endif + +/** + * roundup - round up to the next specified multiple + * @x: the value to up + * @y: multiple to round up to + * + * Rounds @x up to next multiple of @y. If @y will always be a power + * of 2, consider using the faster round_up(). + */ +#define roundup(x, y) ( \ +{ \ + typeof(y) __y = y; \ + (((x) + (__y - 1)) / __y) * __y; \ +} \ +) +/** + * rounddown - round down to next specified multiple + * @x: the value to round + * @y: multiple to round down to + * + * Rounds @x down to next multiple of @y. If @y will always be a power + * of 2, consider using the faster round_down(). + */ +#define rounddown(x, y) ( \ +{ \ + typeof(x) __x = (x); \ + __x - (__x % (y)); \ +} \ +) + +/* + * Divide positive or negative dividend by positive or negative divisor + * and round to closest integer. Result is undefined for negative + * divisors if the dividend variable type is unsigned and for negative + * dividends if the divisor variable type is unsigned. + */ +#define DIV_ROUND_CLOSEST(x, divisor)( \ +{ \ + typeof(x) __x = x; \ + typeof(divisor) __d = divisor; \ + (((typeof(x))-1) > 0 || \ + ((typeof(divisor))-1) > 0 || \ + (((__x) > 0) == ((__d) > 0))) ? 
\ + (((__x) + ((__d) / 2)) / (__d)) : \ + (((__x) - ((__d) / 2)) / (__d)); \ +} \ +) +/* + * Same as above but for u64 dividends. divisor must be a 32-bit + * number. + */ +#define DIV_ROUND_CLOSEST_ULL(x, divisor)( \ +{ \ + typeof(divisor) __d = divisor; \ + unsigned long long _tmp = (x) + (__d) / 2; \ + do_div(_tmp, __d); \ + _tmp; \ +} \ +) + +/* + * Multiplies an integer by a fraction, while avoiding unnecessary + * overflow or loss of precision. + */ +#define mult_frac(x, numer, denom)( \ +{ \ + typeof(x) quot = (x) / (denom); \ + typeof(x) rem = (x) % (denom); \ + (quot * (numer)) + ((rem * (numer)) / (denom)); \ +} \ +) + +#define sector_div(a, b) do_div(a, b) + +/** + * abs - return absolute value of an argument + * @x: the value. If it is unsigned type, it is converted to signed type first. + * char is treated as if it was signed (regardless of whether it really is) + * but the macro's return type is preserved as char. + * + * Return: an absolute value of x. + */ +#define abs(x) __abs_choose_expr(x, long long, \ + __abs_choose_expr(x, long, \ + __abs_choose_expr(x, int, \ + __abs_choose_expr(x, short, \ + __abs_choose_expr(x, char, \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), char), \ + (char)({ signed char __x = (x); __x<0?-__x:__x; }), \ + ((void)0))))))) + +#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), signed type) || \ + __builtin_types_compatible_p(typeof(x), unsigned type), \ + ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other) + +/** + * reciprocal_scale - "scale" a value into range [0, ep_ro) + * @val: value + * @ep_ro: right open interval endpoint + * + * Perform a "reciprocal multiplication" in order to "scale" a value into + * range [0, @ep_ro), where the upper interval endpoint is right-open. + * This is useful, e.g. for accessing a index of an array containing + * @ep_ro elements, for example. Think of it as sort of modulus, only that + * the result isn't that of modulo. ;) Note that if initial input is a + * small value, then result will return 0. + * + * Return: a result based on @val in interval [0, @ep_ro). + */ +static inline u32 reciprocal_scale(u32 val, u32 ep_ro) +{ + return (u32)(((u64) val * ep_ro) >> 32); +} + +u64 int_pow(u64 base, unsigned int exp); +unsigned long int_sqrt(unsigned long); + +#if BITS_PER_LONG < 64 +u32 int_sqrt64(u64 x); +#else +static inline u32 int_sqrt64(u64 x) +{ + return (u32)int_sqrt(x); +} +#endif + +#endif /* _LINUX_MATH_H */ diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h index b8e094b125ee..78feb8ba7358 100644 --- a/include/linux/rcu_node_tree.h +++ b/include/linux/rcu_node_tree.h @@ -20,6 +20,8 @@ #ifndef __LINUX_RCU_NODE_TREE_H #define __LINUX_RCU_NODE_TREE_H +#include + /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and * CONFIG_RCU_FANOUT_LEAF. 
diff --git a/include/linux/units.h b/include/linux/units.h index aaf716364ec3..5c115c809507 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -2,7 +2,7 @@ #ifndef _LINUX_UNITS_H #define _LINUX_UNITS_H -#include +#include #define ABSOLUTE_ZERO_MILLICELSIUS -273150 diff --git a/lib/errname.c b/lib/errname.c index 0c4d3e66170e..05cbf731545f 100644 --- a/lib/errname.c +++ b/lib/errname.c @@ -3,6 +3,7 @@ #include #include #include +#include /* * Ensure these tables do not accidentally become gigantic if some diff --git a/lib/errseq.c b/lib/errseq.c index 81f9e33aa7e7..93e9b94358dc 100644 --- a/lib/errseq.c +++ b/lib/errseq.c @@ -3,6 +3,7 @@ #include #include #include +#include /* * An errseq_t is a way of recording errors in one place, and allowing any diff --git a/lib/find_bit.c b/lib/find_bit.c index 4a8751010d59..f67f86fd2f62 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -15,8 +15,9 @@ #include #include #include -#include +#include #include +#include #if !defined(find_next_bit) || !defined(find_next_zero_bit) || \ !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \ diff --git a/lib/math/div64.c b/lib/math/div64.c index 3952a07130d8..064d68a5391a 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -18,9 +18,11 @@ * or by defining a preprocessor macro in arch/include/asm/div64.h. */ +#include #include -#include +#include #include +#include /* Not needed on 64bit architectures */ #if BITS_PER_LONG == 32 diff --git a/lib/math/int_pow.c b/lib/math/int_pow.c index 622fc1ab3c74..0cf426e69bda 100644 --- a/lib/math/int_pow.c +++ b/lib/math/int_pow.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include /** diff --git a/lib/math/int_sqrt.c b/lib/math/int_sqrt.c index 30e0f9770f88..a8170bb9142f 100644 --- a/lib/math/int_sqrt.c +++ b/lib/math/int_sqrt.c @@ -6,9 +6,10 @@ * square root from Guy L. Steele. */ -#include #include #include +#include +#include /** * int_sqrt - computes the integer square root diff --git a/lib/math/reciprocal_div.c b/lib/math/reciprocal_div.c index 32436dd4171e..6cb4adbb81d2 100644 --- a/lib/math/reciprocal_div.c +++ b/lib/math/reciprocal_div.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include -#include -#include -#include #include +#include +#include #include +#include + +#include /* * For a description of the algorithm please have a look at -- cgit From 0bb867795540a9223d44ddcdf478330cba5917f8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 15 Dec 2020 20:42:55 -0800 Subject: include/linux/bitmap.h: convert bitmap_empty() / bitmap_full() to return boolean There is no need to return int type out of boolean expression. Link: https://lkml.kernel.org/r/20201027180936.20806-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Yury Norov Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 99058eb81042..719fe036f567 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -379,7 +379,7 @@ static inline int bitmap_subset(const unsigned long *src1, return __bitmap_subset(src1, src2, nbits); } -static inline int bitmap_empty(const unsigned long *src, unsigned nbits) +static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! 
(*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -387,7 +387,7 @@ static inline int bitmap_empty(const unsigned long *src, unsigned nbits) return find_first_bit(src, nbits) == nbits; } -static inline int bitmap_full(const unsigned long *src, unsigned int nbits) +static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); -- cgit From ab7d7798dad5aae23bb502f1a6fc0d637b07dc47 Mon Sep 17 00:00:00 2001 From: "Ma, Jianpeng" Date: Tue, 15 Dec 2020 20:42:57 -0800 Subject: bitmap: remove unused function declaration Link: https://lkml.kernel.org/r/BN7PR11MB26097166B6B46387D8A1ABA4FDE30@BN7PR11MB2609.namprd11.prod.outlook.com Fixes: 2afe27c718b6 ("lib/bitmap.c: bitmap_[empty,full]: remove code duplication") Signed-off-by: Jianpeng Ma Acked-by: Yury Norov Reviewed-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 719fe036f567..70a932470b2d 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -126,8 +126,6 @@ extern void bitmap_free(const unsigned long *bitmap); * lib/bitmap.c provides these functions: */ -extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits); -extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern bool __pure __bitmap_or_equal(const unsigned long *src1, -- cgit From 2f78788b55baa3410b1ec91a576286abe1ad4d6a Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 15 Dec 2020 20:43:37 -0800 Subject: ilog2: improve ilog2 for constant arguments As discussed in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97445 the const_ilog2 macro generates a lot of code which interferes badly with GCC inlining heuristics, until it can be proven that the ilog2 argument can or can't be simplified into a constant. It can be expressed using __builtin_clzll builtin which is supported by GCC 3.4 and later and when used only in the __builtin_constant_p guarded code it ought to always fold back to a constant. Other compilers support the same builtin for many years too. Other option would be to change the const_ilog2 macro, though as the description says it is meant to be used also in C constant expressions, and while GCC will fold it to constant with constant argument even in those, perhaps it is better to avoid using extensions in that case. [akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/20201120125154.GB3040@hirez.programming.kicks-ass.net Link: https://lkml.kernel.org/r/20201021132718.GB2176@tucnak Signed-off-by: Jakub Jelinek Signed-off-by: Peter Zijlstra (Intel) Cc: Christophe Leroy Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/log2.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/log2.h b/include/linux/log2.h index c619ec6eff4a..df0b155c2141 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -156,7 +156,8 @@ unsigned long __rounddown_pow_of_two(unsigned long n) #define ilog2(n) \ ( \ __builtin_constant_p(n) ? \ - const_ilog2(n) : \ + ((n) < 2 ? 0 : \ + 63 - __builtin_clzll(n)) : \ (sizeof(n) <= 4) ? 
\ __ilog2_u32(n) : \ __ilog2_u64(n) \ -- cgit From 6a39e62abbafd1d58d1722f40c7d26ef379c6a2f Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 15 Dec 2020 20:43:44 -0800 Subject: lib: string.h: detect intra-object overflow in fortified string functions Patch series "Fortify strscpy()", v7. This patch implements a fortified version of strscpy() enabled by setting CONFIG_FORTIFY_SOURCE=y. The new version ensures the following before calling vanilla strscpy(): 1. There is no read overflow because either size is smaller than src length or we shrink size to src length by calling fortified strnlen(). 2. There is no write overflow because we either failed during compilation or at runtime by checking that size is smaller than dest size. Note that, if the src and dst sizes cannot be determined, the patch falls back to calling vanilla strscpy(). The patches add the following: 1. Implement the fortified version of strscpy(). 2. Add a new LKDTM test to ensure the fortified version still returns the same value as the vanilla one while panic'ing when there is a write overflow. 3. Correct some typos in LKDTM related file. I based my modifications on top of two patches from Daniel Axtens which modify calls to __builtin_object_size, in fortified string functions, to ensure the true size of char * is returned and not the surrounding structure size. About performance, I measured the slowdown of fortified strscpy(), using the vanilla one as baseline. The hardware I used is an Intel i3 2130 CPU clocked at 3.4 GHz. I ran "Linux 5.10.0-rc4+ SMP PREEMPT" inside qemu 3.10 with 4 CPU cores. The following code, called through LKDTM, was used as a benchmark:

	#define TIMES 10000
	char *src;
	char dst[7];
	int i;
	ktime_t begin;

	src = kstrdup("foobar", GFP_KERNEL);
	if (src == NULL)
		return;

	begin = ktime_get();
	for (i = 0; i < TIMES; i++)
		strscpy(dst, src, strlen(src));
	pr_info("%d fortified strscpy() took %lld", TIMES, ktime_get() - begin);

	begin = ktime_get();
	for (i = 0; i < TIMES; i++)
		__real_strscpy(dst, src, strlen(src));
	pr_info("%d vanilla strscpy() took %lld", TIMES, ktime_get() - begin);

	kfree(src);

I called the above code 30 times to compute stats for each version (in ns, rounded to int):

| version   | mean    | std    | median  | 95th    |
| --------- | ------- | ------ | ------- | ------- |
| fortified | 245_069 | 54_657 | 216_230 | 331_122 |
| vanilla   | 172_501 | 70_281 | 143_539 | 219_553 |

On average, fortified strscpy() is approximately 1.42 times slower than vanilla strscpy(). For the 95th percentile, the fortified version is about 1.50 times slower. So, clearly the stats are not in favor of fortified strscpy(). But the fortified version loops the string twice (once in strnlen() and again in vanilla strscpy()) while the vanilla one only loops once. This can explain why fortified strscpy() is slower than the vanilla one. This patch (of 5): When the fortify feature was first introduced in commit 6974f0c4555e ("include/linux/string.h: add the option of fortified string.h functions"), Daniel Micay observed: * It should be possible to optionally use __builtin_object_size(x, 1) for some functions (C strings) to detect intra-object overflows (like glibc's _FORTIFY_SOURCE=2), but for now this takes the conservative approach to avoid likely compatibility issues. This is a case that often cannot be caught by KASAN.
Consider: struct foo { char a[10]; char b[10]; } void test() { char *msg; struct foo foo; msg = kmalloc(16, GFP_KERNEL); strcpy(msg, "Hello world!!"); // this copy overwrites foo.b strcpy(foo.a, msg); } The questionable copy overflows foo.a and writes to foo.b as well. It cannot be detected by KASAN. Currently it is also not detected by fortify, because strcpy considers __builtin_object_size(x, 0), which considers the size of the surrounding object (here, struct foo). However, if we switch the string functions over to use __builtin_object_size(x, 1), the compiler will measure the size of the closest surrounding subobject (here, foo.a), rather than the size of the surrounding object as a whole. See https://gcc.gnu.org/onlinedocs/gcc/Object-Size-Checking.html for more info. Only do this for string functions: we cannot use it on things like memcpy, memmove, memcmp and memchr_inv due to code like this which purposefully operates on multiple structure members: (arch/x86/kernel/traps.c) /* * regs->sp points to the failing IRET frame on the * ESPFIX64 stack. Copy it to the entry stack. This fills * in gpregs->ss through gpregs->ip. * */ memmove(&gpregs->ip, (void *)regs->sp, 5*8); This change passes an allyesconfig on powerpc and x86, and an x86 kernel built with it survives running with syz-stress from syzkaller, so it seems safe so far. Link: https://lkml.kernel.org/r/20201122162451.27551-1-laniel_francis@privacyrequired.com Link: https://lkml.kernel.org/r/20201122162451.27551-2-laniel_francis@privacyrequired.com Signed-off-by: Daniel Axtens Signed-off-by: Francis Laniel Reviewed-by: Kees Cook Cc: Daniel Micay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index b1f3894a0a3e..46e91d684c47 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -292,7 +292,7 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __builtin_object_size(p, 1); if (__builtin_constant_p(size) && p_size < size) __write_overflow(); if (p_size < size) @@ -302,7 +302,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) __FORTIFY_INLINE char *strcat(char *p, const char *q) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __builtin_object_size(p, 1); if (p_size == (size_t)-1) return __underlying_strcat(p, q); if (strlcat(p, q, p_size) >= p_size) @@ -313,7 +313,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) __FORTIFY_INLINE __kernel_size_t strlen(const char *p) { __kernel_size_t ret; - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __builtin_object_size(p, 1); /* Work around gcc excess stack consumption issue */ if (p_size == (size_t)-1 || @@ -328,7 +328,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __builtin_object_size(p, 1); __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? 
maxlen : p_size); if (p_size <= ret && maxlen != ret) fortify_panic(__func__); @@ -340,8 +340,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) { size_t ret; - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); if (p_size == (size_t)-1 && q_size == (size_t)-1) return __real_strlcpy(p, q, size); ret = strlen(q); @@ -361,8 +361,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) { size_t p_len, copy_len; - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); if (p_size == (size_t)-1 && q_size == (size_t)-1) return __underlying_strncat(p, q, count); p_len = strlen(p); @@ -475,11 +475,16 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) /* defined after fortified strlen and memcpy to reuse them */ __FORTIFY_INLINE char *strcpy(char *p, const char *q) { - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); + size_t size; if (p_size == (size_t)-1 && q_size == (size_t)-1) return __underlying_strcpy(p, q); - memcpy(p, q, strlen(q) + 1); + size = strlen(q) + 1; + /* test here to use the more stringent object size */ + if (p_size < size) + fortify_panic(__func__); + memcpy(p, q, size); return p; } -- cgit From 33e56a59e64dfb68778e5da0be13f0c47dc5d445 Mon Sep 17 00:00:00 2001 From: Francis Laniel Date: Tue, 15 Dec 2020 20:43:50 -0800 Subject: string.h: add FORTIFY coverage for strscpy() The fortified version of strscpy ensures the following before vanilla strscpy is called: 1. There is no read overflow because either size is smaller than the src length or we shrink size to the src length by calling fortified strnlen. 2. There is no write overflow because we either fail at compile time or at runtime by checking that size is smaller than the dest size. Link: https://lkml.kernel.org/r/20201122162451.27551-4-laniel_francis@privacyrequired.com Signed-off-by: Francis Laniel Acked-by: Kees Cook Cc: Daniel Axtens Cc: Daniel Micay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 46e91d684c47..1cd63a8a23ab 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -6,6 +6,7 @@ #include /* for inline */ #include /* for size_t */ #include /* for NULL */ +#include /* for E2BIG */ #include #include @@ -357,6 +358,53 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) return ret; } +/* defined after fortified strnlen to reuse it */ +extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); +__FORTIFY_INLINE ssize_t strscpy(char *p, const char *q, size_t size) +{ + size_t len; + /* Use string size rather than possible enclosing struct size. */ + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); + + /* If we cannot get size of p and q default to call strscpy.
*/ + if (p_size == (size_t) -1 && q_size == (size_t) -1) + return __real_strscpy(p, q, size); + + /* + * If size can be known at compile time and is greater than + * p_size, generate a compile time write overflow error. + */ + if (__builtin_constant_p(size) && size > p_size) + __write_overflow(); + + /* + * This call protects from read overflow, because len will default to q + * length if it smaller than size. + */ + len = strnlen(q, size); + /* + * If len equals size, we will copy only size bytes which leads to + * -E2BIG being returned. + * Otherwise we will copy len + 1 because of the final '\0'. + */ + len = len == size ? size : len + 1; + + /* + * Generate a runtime write overflow error if len is greater than + * p_size. + */ + if (len > p_size) + fortify_panic(__func__); + + /* + * We can now safely call vanilla strscpy because we are protected from: + * 1. Read overflow thanks to call to strnlen(). + * 2. Write overflow thanks to above ifs. + */ + return __real_strscpy(p, q, len); +} + /* defined after fortified strlen and strnlen to reuse them */ __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) { -- cgit From 5c7b3280d221b84a675b85cb2727df7d82b65c3a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Dec 2020 20:45:34 -0800 Subject: rapidio: remove unused rio_get_asm() and rio_get_device() The functions rio_get_asm() and rio_get_device() are globally exported but have almost no users in tree. The only user is rio_init_mports() which invokes it via rio_init(). rio_init() iterates over every registered device and invokes rio_fixup_device(). It looks like a fixup function which should perform a "change" to the device but does nothing. It has been like this since its introduction in commit 394b701ce4fbf ("[PATCH] RapidIO support: core base") which was merged into v2.6.15-rc1. Remove rio_init() because the performed fixup function (rio_fixup_device()) does nothing. Remove rio_get_asm() and rio_get_device() which have no callers now. Link: https://lkml.kernel.org/r/20201116170004.420143-1-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Cc: Matt Porter Cc: Alexandre Bounine Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio.c | 81 ------------------------------------------------- include/linux/rio_drv.h | 3 -- 2 files changed, 84 deletions(-) (limited to 'include/linux') diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 606986c5ba2c..c2b79736a92b 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -1412,71 +1412,6 @@ rio_mport_get_feature(struct rio_mport * port, int local, u16 destid, } EXPORT_SYMBOL_GPL(rio_mport_get_feature); -/** - * rio_get_asm - Begin or continue searching for a RIO device by vid/did/asm_vid/asm_did - * @vid: RIO vid to match or %RIO_ANY_ID to match all vids - * @did: RIO did to match or %RIO_ANY_ID to match all dids - * @asm_vid: RIO asm_vid to match or %RIO_ANY_ID to match all asm_vids - * @asm_did: RIO asm_did to match or %RIO_ANY_ID to match all asm_dids - * @from: Previous RIO device found in search, or %NULL for new search - * - * Iterates through the list of known RIO devices. If a RIO device is - * found with a matching @vid, @did, @asm_vid, @asm_did, the reference - * count to the device is incrememted and a pointer to its device - * structure is returned. Otherwise, %NULL is returned. A new search - * is initiated by passing %NULL to the @from argument.
Otherwise, if - * @from is not %NULL, searches continue from next device on the global - * list. The reference count for @from is always decremented if it is - * not %NULL. - */ -struct rio_dev *rio_get_asm(u16 vid, u16 did, - u16 asm_vid, u16 asm_did, struct rio_dev *from) -{ - struct list_head *n; - struct rio_dev *rdev; - - WARN_ON(in_interrupt()); - spin_lock(&rio_global_list_lock); - n = from ? from->global_list.next : rio_devices.next; - - while (n && (n != &rio_devices)) { - rdev = rio_dev_g(n); - if ((vid == RIO_ANY_ID || rdev->vid == vid) && - (did == RIO_ANY_ID || rdev->did == did) && - (asm_vid == RIO_ANY_ID || rdev->asm_vid == asm_vid) && - (asm_did == RIO_ANY_ID || rdev->asm_did == asm_did)) - goto exit; - n = n->next; - } - rdev = NULL; - exit: - rio_dev_put(from); - rdev = rio_dev_get(rdev); - spin_unlock(&rio_global_list_lock); - return rdev; -} -EXPORT_SYMBOL_GPL(rio_get_asm); - -/** - * rio_get_device - Begin or continue searching for a RIO device by vid/did - * @vid: RIO vid to match or %RIO_ANY_ID to match all vids - * @did: RIO did to match or %RIO_ANY_ID to match all dids - * @from: Previous RIO device found in search, or %NULL for new search - * - * Iterates through the list of known RIO devices. If a RIO device is - * found with a matching @vid and @did, the reference count to the - * device is incremented and a pointer to its device structure is returned. - * Otherwise, %NULL is returned. A new search is initiated by passing %NULL - * to the @from argument. Otherwise, if @from is not %NULL, searches - * continue from next device on the global list. The reference count for - * @from is always decremented if it is not %NULL. - */ -struct rio_dev *rio_get_device(u16 vid, u16 did, struct rio_dev *from) -{ - return rio_get_asm(vid, did, RIO_ANY_ID, RIO_ANY_ID, from); -} -EXPORT_SYMBOL_GPL(rio_get_device); - /** * rio_std_route_add_entry - Add switch route table entry using standard * registers defined in RIO specification rev.1.3 @@ -2106,20 +2041,6 @@ found: return rc; } -static void rio_fixup_device(struct rio_dev *dev) -{ -} - -static int rio_init(void) -{ - struct rio_dev *dev = NULL; - - while ((dev = rio_get_device(RIO_ANY_ID, RIO_ANY_ID, dev)) != NULL) { - rio_fixup_device(dev); - } - return 0; -} - static struct workqueue_struct *rio_wq; struct rio_disc_work { @@ -2206,8 +2127,6 @@ int rio_init_mports(void) kfree(work); no_disc: - rio_init(); - return 0; } EXPORT_SYMBOL_GPL(rio_init_mports); diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index d637742e5039..e49c32b0f394 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -444,9 +444,6 @@ static inline void rio_set_drvdata(struct rio_dev *rdev, void *data) /* Misc driver helpers */ extern u16 rio_local_get_device_id(struct rio_mport *port); extern void rio_local_set_device_id(struct rio_mport *port, u16 did); -extern struct rio_dev *rio_get_device(u16 vid, u16 did, struct rio_dev *from); -extern struct rio_dev *rio_get_asm(u16 vid, u16 did, u16 asm_vid, u16 asm_did, - struct rio_dev *from); extern int rio_init_mports(void); #endif /* LINUX_RIO_DRV_H */ -- cgit From 3d03295a7e9194c2318977b44999972ce3609664 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 15 Dec 2020 20:45:47 -0800 Subject: relay: remove unused buf_mapped and buf_unmapped callbacks Patch series "relay: cleanup and const callbacks", v2. None of the relay users require the use of mutable structs for callbacks, however the relay code does. 
Instead of assigning default callbacks when there is none, add callback wrappers to conditionally call the client callbacks if available, and fall back to default behaviour (typically no-op) otherwise. This lets all relay users make their struct rchan_callbacks const data. This series starts with a number of cleanups first based on Christoph's feedback. This patch (of 9): No relay client uses the buf_mapped or buf_unmapped callbacks. Remove them. This makes relay's vm_operations_struct close callback a dummy, remove it as well. Link: https://lkml.kernel.org/r/cover.1606153547.git.jani.nikula@intel.com Link: https://lkml.kernel.org/r/c69fff6e0cd485563604240bbfcc028434983bec.1606153547.git.jani.nikula@intel.com Signed-off-by: Jani Nikula Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: Kalle Valo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/relay.h | 19 ------------------- kernel/relay.c | 34 ---------------------------------- 2 files changed, 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/relay.h b/include/linux/relay.h index e13a333e7c37..b3c4f49f6951 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -101,25 +101,6 @@ struct rchan_callbacks void *prev_subbuf, size_t prev_padding); - /* - * buf_mapped - relay buffer mmap notification - * @buf: the channel buffer - * @filp: relay file pointer - * - * Called when a relay file is successfully mmapped - */ - void (*buf_mapped)(struct rchan_buf *buf, - struct file *filp); - - /* - * buf_unmapped - relay buffer unmap notification - * @buf: the channel buffer - * @filp: relay file pointer - * - * Called when a relay file is successfully unmapped - */ - void (*buf_unmapped)(struct rchan_buf *buf, - struct file *filp); /* * create_buf_file - create file to represent a relay channel buffer * @filename: the name of the file to create diff --git a/kernel/relay.c b/kernel/relay.c index b08d936d5fa7..b51343642bf4 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -27,15 +27,6 @@ static DEFINE_MUTEX(relay_channels_mutex); static LIST_HEAD(relay_channels); -/* - * close() vm_op implementation for relay file mapping. - */ -static void relay_file_mmap_close(struct vm_area_struct *vma) -{ - struct rchan_buf *buf = vma->vm_private_data; - buf->chan->cb->buf_unmapped(buf, vma->vm_file); -} - /* * fault() vm_op implementation for relay file mapping. */ @@ -62,7 +53,6 @@ static vm_fault_t relay_buf_fault(struct vm_fault *vmf) */ static const struct vm_operations_struct relay_file_mmap_ops = { .fault = relay_buf_fault, - .close = relay_file_mmap_close, }; /* @@ -96,7 +86,6 @@ static void relay_free_page_array(struct page **array) static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) { unsigned long length = vma->vm_end - vma->vm_start; - struct file *filp = vma->vm_file; if (!buf) return -EBADF; @@ -107,7 +96,6 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) vma->vm_ops = &relay_file_mmap_ops; vma->vm_flags |= VM_DONTEXPAND; vma->vm_private_data = buf; - buf->chan->cb->buf_mapped(buf, filp); return 0; } @@ -283,22 +271,6 @@ static int subbuf_start_default_callback (struct rchan_buf *buf, return 1; } -/* - * buf_mapped() default callback. Does nothing. - */ -static void buf_mapped_default_callback(struct rchan_buf *buf, - struct file *filp) -{ -} - -/* - * buf_unmapped() default callback. Does nothing. 
- */ -static void buf_unmapped_default_callback(struct rchan_buf *buf, - struct file *filp) -{ -} - /* * create_buf_file_create() default callback. Does nothing. */ @@ -322,8 +294,6 @@ static int remove_buf_file_default_callback(struct dentry *dentry) /* relay channel default callbacks */ static struct rchan_callbacks default_channel_callbacks = { .subbuf_start = subbuf_start_default_callback, - .buf_mapped = buf_mapped_default_callback, - .buf_unmapped = buf_unmapped_default_callback, .create_buf_file = create_buf_file_default_callback, .remove_buf_file = remove_buf_file_default_callback, }; @@ -509,10 +479,6 @@ static void setup_callbacks(struct rchan *chan, if (!cb->subbuf_start) cb->subbuf_start = subbuf_start_default_callback; - if (!cb->buf_mapped) - cb->buf_mapped = buf_mapped_default_callback; - if (!cb->buf_unmapped) - cb->buf_unmapped = buf_unmapped_default_callback; if (!cb->create_buf_file) cb->create_buf_file = create_buf_file_default_callback; if (!cb->remove_buf_file) -- cgit From 371e03880d9d34534d3eafd2a7581042be598e39 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 15 Dec 2020 20:45:53 -0800 Subject: relay: make create_buf_file and remove_buf_file callbacks mandatory All clients provide create_buf_file and remove_buf_file callbacks, and they're required for relay to make sense. There is no point in them being optional. Also document whether each callback is mandatory/optional. Link: https://lkml.kernel.org/r/88003c1527386b93036e286e7917f1e33aec84ac.1606153547.git.jani.nikula@intel.com Signed-off-by: Jani Nikula Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: Kalle Valo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/relay.h | 6 ++++++ kernel/relay.c | 26 +------------------------- 2 files changed, 7 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/relay.h b/include/linux/relay.h index b3c4f49f6951..99d024475ba5 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -89,6 +89,8 @@ struct rchan_callbacks * The client should return 1 to continue logging, 0 to stop * logging. * + * This callback is optional. + * * NOTE: subbuf_start will also be invoked when the buffer is * created, so that the first sub-buffer can be initialized * if necessary. In this case, prev_subbuf will be NULL. @@ -122,6 +124,8 @@ struct rchan_callbacks * cause relay_open() to create a single global buffer rather * than the default set of per-cpu buffers. * + * This callback is mandatory. + * * See Documentation/filesystems/relay.rst for more info. */ struct dentry *(*create_buf_file)(const char *filename, @@ -139,6 +143,8 @@ struct rchan_callbacks * channel buffer. * * The callback should return 0 if successful, negative if not. + * + * This callback is mandatory. */ int (*remove_buf_file)(struct dentry *dentry); }; diff --git a/kernel/relay.c b/kernel/relay.c index d9b8185161a8..dd4ec4ec07f3 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -271,26 +271,6 @@ static int subbuf_start_default_callback (struct rchan_buf *buf, return 1; } -/* - * create_buf_file_create() default callback. Does nothing. - */ -static struct dentry *create_buf_file_default_callback(const char *filename, - struct dentry *parent, - umode_t mode, - struct rchan_buf *buf, - int *is_global) -{ - return NULL; -} - -/* - * remove_buf_file() default callback. Does nothing. 
- */ -static int remove_buf_file_default_callback(struct dentry *dentry) -{ - return -EINVAL; -} - /** * wakeup_readers - wake up readers waiting on a channel * @work: contains the channel buffer @@ -467,10 +447,6 @@ static void setup_callbacks(struct rchan *chan, { if (!cb->subbuf_start) cb->subbuf_start = subbuf_start_default_callback; - if (!cb->create_buf_file) - cb->create_buf_file = create_buf_file_default_callback; - if (!cb->remove_buf_file) - cb->remove_buf_file = remove_buf_file_default_callback; chan->cb = cb; } @@ -530,7 +506,7 @@ struct rchan *relay_open(const char *base_filename, return NULL; if (subbuf_size > UINT_MAX / n_subbufs) return NULL; - if (!cb) + if (!cb || !cb->create_buf_file || !cb->remove_buf_file) return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); -- cgit From 023542f48b57d6b785fcadb86ac336ae80653e58 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 15 Dec 2020 20:45:57 -0800 Subject: relay: allow the use of const callback structs None of the relay users require the use of mutable structs for callbacks, however the relay code does. Instead of assigning the default callback for subbuf_start, add a wrapper to conditionally call the client callback if available, and fall back to default behaviour otherwise. This lets all relay users make their struct rchan_callbacks const data. [jani.nikula@intel.com: cleanups, per Christoph] Link: https://lkml.kernel.org/r/20201124115412.32402-1-jani.nikula@intel.com Link: https://lkml.kernel.org/r/cc3ff292e4eb4fdc56bee3d690c7b8e39209cd37.1606153547.git.jani.nikula@intel.com Signed-off-by: Jani Nikula Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: Kalle Valo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/relay.h | 4 ++-- kernel/relay.c | 37 ++++++++++--------------------------- 2 files changed, 12 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/relay.h b/include/linux/relay.h index 99d024475ba5..72b876dd5cb8 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -62,7 +62,7 @@ struct rchan size_t subbuf_size; /* sub-buffer size */ size_t n_subbufs; /* number of sub-buffers per buffer */ size_t alloc_size; /* total buffer size allocated */ - struct rchan_callbacks *cb; /* client callbacks */ + const struct rchan_callbacks *cb; /* client callbacks */ struct kref kref; /* channel refcount */ void *private_data; /* for user-defined data */ size_t last_toobig; /* tried to log event > subbuf size */ @@ -157,7 +157,7 @@ struct rchan *relay_open(const char *base_filename, struct dentry *parent, size_t subbuf_size, size_t n_subbufs, - struct rchan_callbacks *cb, + const struct rchan_callbacks *cb, void *private_data); extern int relay_late_setup_files(struct rchan *chan, const char *base_filename, diff --git a/kernel/relay.c b/kernel/relay.c index dd4ec4ec07f3..d1a67fbb819d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -252,23 +252,14 @@ EXPORT_SYMBOL_GPL(relay_buf_full); * High-level relay kernel API and associated functions. */ -/* - * rchan_callback implementations defining default channel behavior. Used - * in place of corresponding NULL values in client callback struct. - */ - -/* - * subbuf_start() default callback. Does nothing. 
- */ -static int subbuf_start_default_callback (struct rchan_buf *buf, - void *subbuf, - void *prev_subbuf, - size_t prev_padding) +static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) { - if (relay_buf_full(buf)) - return 0; + if (!buf->chan->cb->subbuf_start) + return !relay_buf_full(buf); - return 1; + return buf->chan->cb->subbuf_start(buf, subbuf, + prev_subbuf, prev_padding); } /** @@ -314,7 +305,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; - buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); + relay_subbuf_start(buf, buf->data, NULL, 0); } /** @@ -442,14 +433,6 @@ static void relay_close_buf(struct rchan_buf *buf) kref_put(&buf->kref, relay_remove_buf); } -static void setup_callbacks(struct rchan *chan, - struct rchan_callbacks *cb) -{ - if (!cb->subbuf_start) - cb->subbuf_start = subbuf_start_default_callback; - chan->cb = cb; -} - int relay_prepare_cpu(unsigned int cpu) { struct rchan *chan; @@ -495,7 +478,7 @@ struct rchan *relay_open(const char *base_filename, struct dentry *parent, size_t subbuf_size, size_t n_subbufs, - struct rchan_callbacks *cb, + const struct rchan_callbacks *cb, void *private_data) { unsigned int i; @@ -529,7 +512,7 @@ struct rchan *relay_open(const char *base_filename, chan->has_base_filename = 1; strlcpy(chan->base_filename, base_filename, NAME_MAX); } - setup_callbacks(chan, cb); + chan->cb = cb; kref_init(&chan->kref); mutex_lock(&relay_channels_mutex); @@ -712,7 +695,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; new = buf->start + new_subbuf * buf->chan->subbuf_size; buf->offset = 0; - if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { + if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) { buf->offset = buf->chan->subbuf_size + 1; return 0; } -- cgit From ff5c19ed4b087073cea38ff0edc80c23d7256943 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Dec 2020 20:47:23 -0800 Subject: mm: simplify follow_pte{,pmd} Merge __follow_pte_pmd, follow_pte_pmd and follow_pte into a single follow_pte function and just pass two additional NULL arguments for the two previous follow_pte callers. 
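For reference, a minimal sketch (not part of the patch) of what a PTE-only caller looks like after this change; the mmu_notifier range and PMD pointer arguments are simply NULL, exactly as in the converted follow_pfn() and follow_phys() sites below:

	pte_t *ptep;
	spinlock_t *ptl;

	/* PTE-only lookup: no notifier range, no interest in a PMD mapping */
	if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl))
		return -EINVAL;
	/* ... examine *ptep while holding ptl ... */
	pte_unmap_unlock(ptep, ptl);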
[sfr@canb.auug.org.au: merge fix for "s390/pci: remove races against pte updates"] Link: https://lkml.kernel.org/r/20201111221254.7f6a3658@canb.auug.org.au Link: https://lkml.kernel.org/r/20201029101432.47011-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Cc: Daniel Vetter Cc: Dan Williams Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/pci/pci_mmio.c | 4 ++-- fs/dax.c | 9 ++++----- include/linux/mm.h | 6 +++--- mm/memory.c | 35 +++++------------------------------ 4 files changed, 14 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index de3bdbed8881..18f2d10c3176 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -170,7 +170,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte_pmd(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl); + ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl); if (ret) goto out_unlock_mmap; @@ -311,7 +311,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte_pmd(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl); + ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl); if (ret) goto out_unlock_mmap; diff --git a/fs/dax.c b/fs/dax.c index 5b47834f2e1b..26d5dcd2d69e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -810,12 +810,11 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, address = pgoff_address(index, vma); /* - * Note because we provide range to follow_pte_pmd it will - * call mmu_notifier_invalidate_range_start() on our behalf - * before taking any lock. + * Note because we provide range to follow_pte it will call + * mmu_notifier_invalidate_range_start() on our behalf before + * taking any lock. 
*/ - if (follow_pte_pmd(vma->vm_mm, address, &range, - &ptep, &pmdp, &ptl)) + if (follow_pte(vma->vm_mm, address, &range, &ptep, &pmdp, &ptl)) continue; /* diff --git a/include/linux/mm.h b/include/linux/mm.h index abc7b3154298..855161080f18 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1641,9 +1641,9 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_pte_pmd(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); +int follow_pte(struct mm_struct *mm, unsigned long address, + struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, + spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 126e140baf73..7d608765932b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4707,9 +4707,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) +int follow_pte(struct mm_struct *mm, unsigned long address, + struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, + spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4774,31 +4774,6 @@ out: return -EINVAL; } -static inline int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) -{ - int res; - - /* (void) is needed to make gcc happy */ - (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, NULL, - ptepp, NULL, ptlp))); - return res; -} - -int follow_pte_pmd(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) -{ - int res; - - /* (void) is needed to make gcc happy */ - (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, range, - ptepp, pmdpp, ptlp))); - return res; -} - /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping @@ -4819,7 +4794,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) return ret; - ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl); if (ret) return ret; *pfn = pte_pfn(*ptep); @@ -4840,7 +4815,7 @@ int follow_phys(struct vm_area_struct *vma, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl)) goto out; pte = *ptep; -- cgit From c18e68696fdd9fd293f051030bce5aaff3c9b185 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Mon, 14 Dec 2020 21:15:47 -0800 Subject: net/connector: Add const qualifier to cb_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The connector driver never modifies any cb_id passed to it, so add a const qualifier to those arguments so callers can declare their struct cb_id as a constant object. 
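As a purely illustrative sketch (the idx/val numbers and names below are made up, not taken from this patch), a connector client can now keep its cb_id in read-only data:

	static const struct cb_id example_cb_id = { .idx = 0x100, .val = 0x1 };

	static void example_callback(struct cn_msg *msg,
				     struct netlink_skb_parms *nsp)
	{
		/* process the received connector message */
	}

	/* registration no longer discards the const qualifier */
	err = cn_add_callback(&example_cb_id, "example", example_callback);
	...
	cn_del_callback(&example_cb_id);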
Fixes build warnings like these when passing a constant struct cb_id: warning: passing argument 1 of ‘cn_add_callback’ discards ‘const’ qualifier from pointer target Signed-off-by: Geoff Levand Link: https://lore.kernel.org/r/a9e49c9e-67fa-16e7-0a6b-72f6bd30c58a@infradead.org Signed-off-by: Jakub Kicinski --- Documentation/driver-api/connector.rst | 2 +- drivers/connector/cn_queue.c | 8 ++++---- drivers/connector/connector.c | 4 ++-- include/linux/connector.h | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/connector.rst b/Documentation/driver-api/connector.rst index 23d068191fb1..631b84a48aa5 100644 --- a/Documentation/driver-api/connector.rst +++ b/Documentation/driver-api/connector.rst @@ -25,7 +25,7 @@ handling, etc... The Connector driver allows any kernelspace agents to use netlink based networking for inter-process communication in a significantly easier way:: - int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *)); + int cn_add_callback(const struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *)); void cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask); void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask); diff --git a/drivers/connector/cn_queue.c b/drivers/connector/cn_queue.c index 49295052ba8b..996f025eb63c 100644 --- a/drivers/connector/cn_queue.c +++ b/drivers/connector/cn_queue.c @@ -19,7 +19,7 @@ static struct cn_callback_entry * cn_queue_alloc_callback_entry(struct cn_queue_dev *dev, const char *name, - struct cb_id *id, + const struct cb_id *id, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)) { @@ -51,13 +51,13 @@ void cn_queue_release_callback(struct cn_callback_entry *cbq) kfree(cbq); } -int cn_cb_equal(struct cb_id *i1, struct cb_id *i2) +int cn_cb_equal(const struct cb_id *i1, const struct cb_id *i2) { return ((i1->idx == i2->idx) && (i1->val == i2->val)); } int cn_queue_add_callback(struct cn_queue_dev *dev, const char *name, - struct cb_id *id, + const struct cb_id *id, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)) { @@ -90,7 +90,7 @@ int cn_queue_add_callback(struct cn_queue_dev *dev, const char *name, return 0; } -void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id) +void cn_queue_del_callback(struct cn_queue_dev *dev, const struct cb_id *id) { struct cn_callback_entry *cbq, *n; int found = 0; diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 7d59d18c6f26..48ec7ce6ecac 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -193,7 +193,7 @@ static void cn_rx_skb(struct sk_buff *skb) * * May sleep. */ -int cn_add_callback(struct cb_id *id, const char *name, +int cn_add_callback(const struct cb_id *id, const char *name, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)) { @@ -214,7 +214,7 @@ EXPORT_SYMBOL_GPL(cn_add_callback); * * May sleep while waiting for reference counter to become zero. */ -void cn_del_callback(struct cb_id *id) +void cn_del_callback(const struct cb_id *id) { struct cn_dev *dev = &cdev; diff --git a/include/linux/connector.h b/include/linux/connector.h index cb732643471b..8ea860efea37 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -64,14 +64,14 @@ struct cn_dev { * @callback: connector's callback. 
* parameters are %cn_msg and the sender's credentials */ -int cn_add_callback(struct cb_id *id, const char *name, +int cn_add_callback(const struct cb_id *id, const char *name, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)); /** * cn_del_callback() - Unregisters new callback with connector core. * * @id: unique connector's user identifier. */ -void cn_del_callback(struct cb_id *id); +void cn_del_callback(const struct cb_id *id); /** @@ -122,14 +122,14 @@ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 group, gfp int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 group, gfp_t gfp_mask); int cn_queue_add_callback(struct cn_queue_dev *dev, const char *name, - struct cb_id *id, + const struct cb_id *id, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)); -void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id); +void cn_queue_del_callback(struct cn_queue_dev *dev, const struct cb_id *id); void cn_queue_release_callback(struct cn_callback_entry *); struct cn_queue_dev *cn_queue_alloc_dev(const char *name, struct sock *); void cn_queue_free_dev(struct cn_queue_dev *dev); -int cn_cb_equal(struct cb_id *, struct cb_id *); +int cn_cb_equal(const struct cb_id *, const struct cb_id *); #endif /* __CONNECTOR_H */ -- cgit From 7061eb8cfa902daa1ec71d23b5cddb8b4391e72b Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Mon, 14 Dec 2020 15:19:28 -0600 Subject: net: core: introduce __netdev_notify_peers There are some use cases for netdev_notify_peers() in contexts where the rtnl lock is already held. Introduce a lockless version of the netdev_notify_peers() call so that callers do not have to open-code the pair: call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); After that, convert netdev_notify_peers() to call the new helper. Suggested-by: Nathan Lynch Signed-off-by: Lijun Pan Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + net/core/dev.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7bf167993c05..259be67644e3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4547,6 +4547,7 @@ void __dev_set_rx_mode(struct net_device *dev); int dev_set_promiscuity(struct net_device *dev, int inc); int dev_set_allmulti(struct net_device *dev, int inc); void netdev_state_change(struct net_device *dev); +void __netdev_notify_peers(struct net_device *dev); void netdev_notify_peers(struct net_device *dev); void netdev_features_change(struct net_device *dev); /* Load a device via the kmod */ diff --git a/net/core/dev.c b/net/core/dev.c index a46334906c94..8fa739259041 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1456,6 +1456,25 @@ void netdev_state_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_state_change); +/** + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called when rtnl lock is already held. + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. 
+ */ +void __netdev_notify_peers(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); +} +EXPORT_SYMBOL(__netdev_notify_peers); + /** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device @@ -1469,8 +1488,7 @@ EXPORT_SYMBOL(netdev_state_change); void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); + __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); -- cgit From 767143a18d6d743d4254de5cf55b1bd87bb2af18 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Dec 2020 22:37:50 -0800 Subject: phy: fix kdoc warning Kdoc does not like it when a multiline comment follows the networking style of starting right on the first line: include/linux/phy.h:869: warning: Function parameter or member 'config_intr' not described in 'phy_driver' Link: https://lore.kernel.org/r/20201215063750.3120976-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 381a95732b6a..9effb511acde 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -743,7 +743,8 @@ struct phy_driver { /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); - /** @config_intr: Enables or disables interrupts. + /** + * @config_intr: Enables or disables interrupts. * It should also clear any pending interrupts prior to enabling the * IRQs and after disabling them. */ -- cgit From 3df23a316c4a5d1764b034c71c29d67a17d5299f Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Sat, 5 Dec 2020 17:19:24 +0100 Subject: pwm: Remove unused function pwmchip_add_inversed() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is only defined with CONFIG_PWM unset and was introduced together with pwmchip_add_with_polarity() (which is only defined with CONFIG_PWM enabled). I guess the series that introduced pwmchip_add_with_polarity() had a different concept in earlier revisions and the !CONFIG_PWM part was just not updated accordingly. Given that there is no implementation for pwmchip_add_with_polarity() without CONFIG_PWM, just drop pwmchip_add_inversed() instead of renaming it to pwmchip_add_with_polarity(). Signed-off-by: Uwe Kleine-König Acked-by: Lee Jones Signed-off-by: Thierry Reding --- include/linux/pwm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index a13ff383fa1d..e4d84d4db293 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -473,11 +473,6 @@ static inline int pwmchip_add(struct pwm_chip *chip) return -EINVAL; } -static inline int pwmchip_add_inversed(struct pwm_chip *chip) -{ - return -EINVAL; -} - static inline int pwmchip_remove(struct pwm_chip *chip) { return -EINVAL; -- cgit From 49e27134f6e9ebcd08c04a98ab7f0574b5a81a35 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 13 Dec 2020 14:06:41 +0200 Subject: net/mlx5: Fix compilation warning for 32-bit platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MLX5_GENERAL_OBJECT_TYPES bitfield is a 64-bit field. Defining an enum for such bit fields on a 32-bit platform results in the warning below. 
./include/vdso/bits.h:7:26: warning: left shift count >= width of type [-Wshift-count-overflow] ^ ./include/linux/mlx5/mlx5_ifc.h:10716:46: note: in expansion of macro ‘BIT’ MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT(0x20), ^~~ Use the 32-bit friendly BIT_ULL macro. Fixes: 2a2970891647 ("net/mlx5: Add sample offload hardware bits and structures") Signed-off-by: Parav Pandit Reported-by: Stephen Rothwell Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20201213120641.216032-1-leon@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 0d6e287d614f..8fbddec26eb8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10711,9 +10711,9 @@ struct mlx5_ifc_affiliated_event_header_bits { }; enum { - MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT(0xc), - MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT(0x13), - MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT(0x20), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT_ULL(0xc), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT_ULL(0x13), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT_ULL(0x20), }; enum { -- cgit From 9bd23c31f392bda88618008f27fd52ee9e0fac38 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Fri, 20 Nov 2020 12:22:32 -0800 Subject: jbd2: add a helper to find out number of fast commit blocks Add a helper to read the number of fast commit blocks from the jbd2 superblock, and also rename JBD2_MIN_FC_BLOCKS to JBD2_DEFAULT_FAST_COMMIT_BLOCKS, since this constant is just the default number of fast commit blocks to use in case the number of fast commit blocks isn't set in the jbd2 superblock. 
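A sketch of how the helper slots into the journal geometry (this mirrors the load_superblock() hunk below; the JBD2_MIN_JOURNAL_BLOCKS size check is omitted for brevity):

	/*
	 * With fast commits enabled, the fast commit blocks are carved out
	 * of the tail of the on-disk journal area:
	 *
	 *   [ regular journal: first .. j_last ][ fast commit: j_fc_first .. j_fc_last ]
	 */
	journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
	num_fc_blocks = jbd2_journal_get_num_fc_blks(sb);
	journal->j_last = journal->j_fc_last - num_fc_blocks;
	journal->j_fc_first = journal->j_last + 1;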
Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201120202232.2240293-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 8 ++------ include/linux/jbd2.h | 9 ++++++++- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 188f79d76988..2dc944442802 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1869,9 +1869,7 @@ static int load_superblock(journal_t *journal) if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); - if (!num_fc_blocks) - num_fc_blocks = JBD2_MIN_FC_BLOCKS; + num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) journal->j_last = journal->j_fc_last - num_fc_blocks; journal->j_fc_first = journal->j_last + 1; @@ -2102,9 +2100,7 @@ jbd2_journal_initialize_fast_commit(journal_t *journal) journal_superblock_t *sb = journal->j_superblock; unsigned long long num_fc_blks; - num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); - if (num_fc_blks == 0) - num_fc_blks = JBD2_MIN_FC_BLOCKS; + num_fc_blks = jbd2_journal_get_num_fc_blks(sb); if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) return -ENOSPC; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d2a4860feb72..99d3cd051ac3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -68,7 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 -#define JBD2_MIN_FC_BLOCKS 256 +#define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 #ifdef __KERNEL__ @@ -1692,6 +1692,13 @@ static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) return journal->j_chksum_driver != NULL; } +static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) +{ + int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); + + return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; +} + /* * Return number of free blocks in the log. Must be called under j_state_lock. */ -- cgit From 476c135e321716ad7a8a5d4a19a636e2dcc50526 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 12 Nov 2020 08:39:59 +0200 Subject: vdpa: Add missing comment for virtqueue count Add the missing comment for the number of virtqueues. Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Acked-by: Jason Wang Link: https://lore.kernel.org/r/20201112064005.349268-2-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 30bc7a7223bb..0fefeb976877 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -42,6 +42,7 @@ struct vdpa_vq_state { * @config: the configuration ops for this device. * @index: device index * @features_valid: were features initialized? for legacy guests + * @nvqs: maximum number of supported virtqueues */ struct vdpa_device { struct device dev; -- cgit From a4055888629bc0467d12d912cd7c90acdf3d9b12 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 18 Dec 2020 14:01:31 -0800 Subject: mm/memcg: warning on !memcg after readahead page charged Add a VM_WARN_ON_ONCE_PAGE() macro. Since readahead pages are charged to the memcg too, in theory we no longer have to check for this exception. Before we can safely remove all of those checks, add a warning for the unexpected !memcg case. 
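The intended call-site pattern, as a short sketch (it mirrors the mem_cgroup hunks below):

	memcg = page_memcg(page);
	/* warn once and dump the page if a charged page has no memcg */
	VM_WARN_ON_ONCE_PAGE(!memcg, page);
	if (!memcg)
		memcg = root_mem_cgroup;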
Link: https://lkml.kernel.org/r/1604283436-18880-3-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Michal Hocko Acked-by: Hugh Dickins Acked-by: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 13 +++++++++++++ mm/memcontrol.c | 10 ++++------ 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 2ad72d2c8cc5..5d0767cb424a 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -37,6 +37,18 @@ void dump_mm(const struct mm_struct *mm); BUG(); \ } \ } while (0) +#define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + dump_page(page, "VM_WARN_ON_ONCE_PAGE(" __stringify(cond)")");\ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) + #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) #define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) @@ -48,6 +60,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c3b054066f5..7b9766789a27 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1362,10 +1362,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd } memcg = page_memcg(page); - /* - * Swapcache readahead pages are added to the LRU - and - * possibly migrated - before they are charged. - */ + VM_WARN_ON_ONCE_PAGE(!memcg, page); if (!memcg) memcg = root_mem_cgroup; @@ -6987,6 +6984,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) return; memcg = page_memcg(oldpage); + VM_WARN_ON_ONCE_PAGE(!memcg, oldpage); if (!memcg) return; @@ -7186,7 +7184,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) memcg = page_memcg(page); - /* Readahead page, never charged */ + VM_WARN_ON_ONCE_PAGE(!memcg, page); if (!memcg) return; @@ -7253,7 +7251,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = page_memcg(page); - /* Readahead page, never charged */ + VM_WARN_ON_ONCE_PAGE(!memcg, page); if (!memcg) return 0; -- cgit From bec78efd0061365a76f88e498affd7106b256823 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 18 Dec 2020 14:01:35 -0800 Subject: mm/memcg: remove unused definitions Some definitions are left unused, just clean them. 
Link: https://lkml.kernel.org/r/20201108003834.12669-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 118 --------------------------------------------- 1 file changed, 118 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 08ed57e02b73..196441f5dc99 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -913,41 +913,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } -/** - * mod_memcg_page_state - update page state statistics - * @page: the page - * @idx: page state item to account - * @val: number of pages (positive or negative) - * - * The @page must be locked or the caller must use lock_page_memcg() - * to prevent double accounting when the page is concurrently being - * moved to another memcg: - * - * lock_page(page) or lock_page_memcg(page) - * if (TestClearPageState(page)) - * mod_memcg_page_state(page, state, -1); - * unlock_page(page) or unlock_page_memcg(page) - * - * Kernel pages are an exception to this, since they'll never move. - */ -static inline void __mod_memcg_page_state(struct page *page, - int idx, int val) -{ - struct mem_cgroup *memcg = page_memcg(page); - - if (memcg) - __mod_memcg_state(memcg, idx, val); -} - -static inline void mod_memcg_page_state(struct page *page, - int idx, int val) -{ - struct mem_cgroup *memcg = page_memcg(page); - - if (memcg) - mod_memcg_state(memcg, idx, val); -} - static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { @@ -1395,18 +1360,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } -static inline void __mod_memcg_page_state(struct page *page, - int idx, - int nr) -{ -} - -static inline void mod_memcg_page_state(struct page *page, - int idx, - int nr) -{ -} - static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { @@ -1479,34 +1432,6 @@ static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) } #endif /* CONFIG_MEMCG */ -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __inc_memcg_state(struct mem_cgroup *memcg, - int idx) -{ - __mod_memcg_state(memcg, idx, 1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __dec_memcg_state(struct mem_cgroup *memcg, - int idx) -{ - __mod_memcg_state(memcg, idx, -1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __inc_memcg_page_state(struct page *page, - int idx) -{ - __mod_memcg_page_state(page, idx, 1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __dec_memcg_page_state(struct page *page, - int idx) -{ - __mod_memcg_page_state(page, idx, -1); -} - static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, 1); @@ -1517,34 +1442,6 @@ static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) __mod_lruvec_kmem_state(p, idx, -1); } -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void inc_memcg_state(struct mem_cgroup *memcg, - int idx) -{ - mod_memcg_state(memcg, idx, 1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void dec_memcg_state(struct mem_cgroup *memcg, - 
int idx) -{ - mod_memcg_state(memcg, idx, -1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void inc_memcg_page_state(struct page *page, - int idx) -{ - mod_memcg_page_state(page, idx, 1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void dec_memcg_page_state(struct page *page, - int idx) -{ - mod_memcg_page_state(page, idx, -1); -} - static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; @@ -1733,21 +1630,6 @@ static inline void memcg_kmem_uncharge_page(struct page *page, int order) __memcg_kmem_uncharge_page(page, order); } -static inline int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages) -{ - if (memcg_kmem_enabled()) - return __memcg_kmem_charge(memcg, gfp, nr_pages); - return 0; -} - -static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ - if (memcg_kmem_enabled()) - __memcg_kmem_uncharge(memcg, nr_pages); -} - /* * A helper for accessing memcg's kmem_id, used for getting * corresponding LRU lists. -- cgit From 9a1ac2288cf16f9406ca54ef221bfcf262393b15 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Fri, 18 Dec 2020 14:01:41 -0800 Subject: mm/memcontrol: rewrite mem_cgroup_page_lruvec() mem_cgroup_page_lruvec() in memcontrol.c and mem_cgroup_lruvec() in memcontrol.h are very similar, except for their parameters (page vs. memcg), which can also be converted to each other. So rewrite mem_cgroup_page_lruvec() in terms of mem_cgroup_lruvec(). [alex.shi@linux.alibaba.com: add missed warning in mem_cgroup_lruvec] Link: https://lkml.kernel.org/r/94f17bb7-ec61-5b72-3555-fabeb5a4d73b@linux.alibaba.com [lstoakes@gmail.com: warn on missing memcg on mem_cgroup_page_lruvec()] Link: https://lkml.kernel.org/r/20201125112202.387009-1-lstoakes@gmail.com Link: https://lkml.kernel.org/r/20201108143731.GA74138@rlk Signed-off-by: Hui Su Signed-off-by: Alex Shi Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Vladimir Davydov Cc: Yafang Shao Cc: Chris Down Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++++++++++-- mm/memcontrol.c | 37 ------------------------------------- 2 files changed, 17 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 196441f5dc99..d827bd7f3bfe 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -620,9 +620,10 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec + * @pgdat: pglist_data * * Returns the lru list vector holding pages for a given @memcg & - * @node combination. This can be the node lruvec, if the memory + * @pgdat combination. This can be the node lruvec, if the memory * controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, @@ -652,7 +653,21 @@ out: return lruvec; } -struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); +/** + * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page + * @page: the page + * @pgdat: pgdat of the page + * + * This function relies on page->mem_cgroup being stable. 
+ */ +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, + struct pglist_data *pgdat) +{ + struct mem_cgroup *memcg = page_memcg(page); + + VM_WARN_ON_ONCE_PAGE(!memcg, page); + return mem_cgroup_lruvec(memcg, pgdat); +} static inline bool lruvec_holds_page_lru_lock(struct page *page, struct lruvec *lruvec) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b9766789a27..605f671203ef 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1342,43 +1342,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) } #endif -/** - * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page - * @page: the page - * @pgdat: pgdat of the page - * - * This function relies on page's memcg being stable - see the - * access rules in commit_charge(). - */ -struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) -{ - struct mem_cgroup_per_node *mz; - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - if (mem_cgroup_disabled()) { - lruvec = &pgdat->__lruvec; - goto out; - } - - memcg = page_memcg(page); - VM_WARN_ON_ONCE_PAGE(!memcg, page); - if (!memcg) - memcg = root_mem_cgroup; - - mz = mem_cgroup_page_nodeinfo(memcg, page); - lruvec = &mz->lruvec; -out: - /* - * Since a node can be onlined after the mem_cgroup was created, - * we have to be prepared to initialize lruvec->zone here; - * and if offlined then reonlined, we need to reinitialize it. - */ - if (unlikely(lruvec->pgdat != pgdat)) - lruvec->pgdat = pgdat; - return lruvec; -} - /** * lock_page_lruvec - lock and return lruvec for a given page. * @page: the page -- cgit From b0a0c2615f6f199a656ed8549d7dce625d77aa77 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 18 Dec 2020 14:05:41 -0800 Subject: epoll: wire up syscall epoll_pwait2 Split off from the previous patch in the series, which implements the syscall. 
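For illustration only, a userspace sketch of invoking the new syscall directly (this assumes a kernel with this series applied, headers that define __NR_epoll_pwait2, and no libc wrapper yet; the epfd setup is elided):

	#include <stdio.h>
	#include <sys/epoll.h>
	#include <sys/syscall.h>
	#include <time.h>
	#include <unistd.h>

	struct epoll_event events[8];
	/* nanosecond-resolution timeout: 1.5 ms */
	struct timespec timeout = { .tv_sec = 0, .tv_nsec = 1500000 };

	/* args: epfd, events, maxevents, timeout, sigmask, sigsetsize */
	int n = syscall(__NR_epoll_pwait2, epfd, events, 8, &timeout,
			NULL, (size_t)0);
	if (n < 0)
		perror("epoll_pwait2");	/* e.g. ENOSYS on older kernels */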
Link: https://lkml.kernel.org/r/20201121144401.3727659-4-willemdebruijn.kernel@gmail.com Signed-off-by: Willem de Bruijn Cc: Al Viro Cc: Arnd Bergmann Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/compat.h | 6 ++++++ include/linux/syscalls.h | 5 +++++ include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 2 ++ 22 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ee7b01bb7346..a6617067dbe6 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -480,3 +480,4 @@ 548 common pidfd_getfd sys_pidfd_getfd 549 common faccessat2 sys_faccessat2 550 common process_madvise sys_process_madvise +551 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index d056a548358e..20e1170e2e0a 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -454,3 +454,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index b3b2019f8d16..86a9d7b3eabe 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 441 +#define __NR_compat_syscalls 442 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 107f08e03b9f..f4bca2b90218 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -889,6 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_epoll_pwait2 441 +__SYSCALL(__NR_epoll_pwait2, sys_epoll_pwait2) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index b96ed8b8a508..bfc00f2bd437 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -361,3 +361,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 625fb6d32842..7fe4e45c864c 100644 --- 
a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index aae729c95cf9..a522adf194ab 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -446,3 +446,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 32817c954435..ad9c3dd0ab1f 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -379,3 +379,4 @@ 438 n32 pidfd_getfd sys_pidfd_getfd 439 n32 faccessat2 sys_faccessat2 440 n32 process_madvise sys_process_madvise +441 n32 epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 9e4ea3c31b1c..91649690b52f 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -355,3 +355,4 @@ 438 n64 pidfd_getfd sys_pidfd_getfd 439 n64 faccessat2 sys_faccessat2 440 n64 process_madvise sys_process_madvise +441 n64 epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 29f5f28cf5ce..4bad0c40aed6 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -428,3 +428,4 @@ 438 o32 pidfd_getfd sys_pidfd_getfd 439 o32 faccessat2 sys_faccessat2 440 o32 process_madvise sys_process_madvise +441 o32 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index f375ea528e59..6bcc31966b44 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -438,3 +438,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 1275daec7fec..f744eb5cba88 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -530,3 +530,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 28c168000483..14f6525886a8 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 783738448ff5..9df40ac0ebc0 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 438 common pidfd_getfd 
sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 78160260991b..c7da4c3271e6 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -486,3 +486,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 0d0667a9fbd7..874aeacde2dd 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -445,3 +445,4 @@ 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 440 i386 process_madvise sys_process_madvise +441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 379819244b91..78672124d28b 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -362,6 +362,7 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index b070f272995d..46116a28eeed 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -411,3 +411,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/include/linux/compat.h b/include/linux/compat.h index 400c0941c8af..6e65be753603 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -537,6 +537,12 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, int maxevents, int timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); +asmlinkage long compat_sys_epoll_pwait2(int epfd, + struct epoll_event __user *events, + int maxevents, + const struct __kernel_timespec __user *timeout, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize); /* fs/fcntl.c */ asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index df0c3c74609e..f3929aff39cf 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -362,6 +362,11 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, int timeout, const sigset_t __user *sigmask, size_t sigsetsize); +asmlinkage long sys_epoll_pwait2(int epfd, struct epoll_event __user *events, + int maxevents, + const struct __kernel_timespec __user *timeout, + const sigset_t __user *sigmask, + size_t sigsetsize); /* fs/fcntl.c */ asmlinkage long sys_dup(unsigned int fildes); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index fc48c64700eb..728752917785 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_epoll_pwait2 441 
+__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #undef __NR_syscalls -#define __NR_syscalls 441 +#define __NR_syscalls 442 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f27ac94d5fa7..19aa806890d5 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -68,6 +68,8 @@ COND_SYSCALL(epoll_create1); COND_SYSCALL(epoll_ctl); COND_SYSCALL(epoll_pwait); COND_SYSCALL_COMPAT(epoll_pwait); +COND_SYSCALL(epoll_pwait2); +COND_SYSCALL_COMPAT(epoll_pwait2); /* fs/fcntl.c */ -- cgit From 3b1a4a8640876a966ab68ab4f561642e19674671 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:00:14 -0800 Subject: kasan: group vmalloc code This is a preparatory commit for the upcoming addition of a new hardware tag-based (MTE-based) KASAN mode. Group all vmalloc-related function declarations in include/linux/kasan.h, and their implementations in mm/kasan/common.c. No functional changes. Link: https://lkml.kernel.org/r/80a6fdd29b039962843bd6cf22ce2643a7c8904e.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 41 +++++++++++++++------------ mm/kasan/common.c | 78 ++++++++++++++++++++++++++------------------------- 2 files changed, 63 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 30d343b4a40a..59538e795df4 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -75,19 +75,6 @@ struct kasan_cache { int free_meta_offset; }; -/* - * These functions provide a special case to support backing module - * allocations with real shadow memory. With KASAN vmalloc, the special - * case is unnecessary, as the work is handled in the generic case. 
- */ -#ifndef CONFIG_KASAN_VMALLOC -int kasan_module_alloc(void *addr, size_t size); -void kasan_free_shadow(const struct vm_struct *vm); -#else -static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } -static inline void kasan_free_shadow(const struct vm_struct *vm) {} -#endif - int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); @@ -156,9 +143,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, return false; } -static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } -static inline void kasan_free_shadow(const struct vm_struct *vm) {} - static inline int kasan_add_zero_shadow(void *start, unsigned long size) { return 0; @@ -211,13 +195,16 @@ static inline void *kasan_reset_tag(const void *addr) #endif /* CONFIG_KASAN_SW_TAGS */ #ifdef CONFIG_KASAN_VMALLOC + int kasan_populate_vmalloc(unsigned long addr, unsigned long size); void kasan_poison_vmalloc(const void *start, unsigned long size); void kasan_unpoison_vmalloc(const void *start, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -#else + +#else /* CONFIG_KASAN_VMALLOC */ + static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { @@ -232,7 +219,25 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end) {} -#endif + +#endif /* CONFIG_KASAN_VMALLOC */ + +#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC) + +/* + * These functions provide a special case to support backing module + * allocations with real shadow memory. With KASAN vmalloc, the special + * case is unnecessary, as the work is handled in the generic case. + */ +int kasan_module_alloc(void *addr, size_t size); +void kasan_free_shadow(const struct vm_struct *vm); + +#else /* CONFIG_KASAN && !CONFIG_KASAN_VMALLOC */ + +static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline void kasan_free_shadow(const struct vm_struct *vm) {} + +#endif /* CONFIG_KASAN && !CONFIG_KASAN_VMALLOC */ #ifdef CONFIG_KASAN_INLINE void kasan_non_canonical_hook(unsigned long addr); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 33d863f55db1..89e5ef9417a7 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -536,44 +536,6 @@ void kasan_kfree_large(void *ptr, unsigned long ip) /* The object will be poisoned by page_alloc. 
*/ } -#ifndef CONFIG_KASAN_VMALLOC -int kasan_module_alloc(void *addr, size_t size) -{ - void *ret; - size_t scaled_size; - size_t shadow_size; - unsigned long shadow_start; - - shadow_start = (unsigned long)kasan_mem_to_shadow(addr); - scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT; - shadow_size = round_up(scaled_size, PAGE_SIZE); - - if (WARN_ON(!PAGE_ALIGNED(shadow_start))) - return -EINVAL; - - ret = __vmalloc_node_range(shadow_size, 1, shadow_start, - shadow_start + shadow_size, - GFP_KERNEL, - PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, - __builtin_return_address(0)); - - if (ret) { - __memset(ret, KASAN_SHADOW_INIT, shadow_size); - find_vm_area(addr)->flags |= VM_KASAN; - kmemleak_ignore(ret); - return 0; - } - - return -ENOMEM; -} - -void kasan_free_shadow(const struct vm_struct *vm) -{ - if (vm->flags & VM_KASAN) - vfree(kasan_mem_to_shadow(vm->addr)); -} -#endif - #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) { @@ -685,6 +647,7 @@ core_initcall(kasan_memhotplug_init); #endif #ifdef CONFIG_KASAN_VMALLOC + static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { @@ -923,4 +886,43 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, (unsigned long)shadow_end); } } + +#else /* CONFIG_KASAN_VMALLOC */ + +int kasan_module_alloc(void *addr, size_t size) +{ + void *ret; + size_t scaled_size; + size_t shadow_size; + unsigned long shadow_start; + + shadow_start = (unsigned long)kasan_mem_to_shadow(addr); + scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT; + shadow_size = round_up(scaled_size, PAGE_SIZE); + + if (WARN_ON(!PAGE_ALIGNED(shadow_start))) + return -EINVAL; + + ret = __vmalloc_node_range(shadow_size, 1, shadow_start, + shadow_start + shadow_size, + GFP_KERNEL, + PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, + __builtin_return_address(0)); + + if (ret) { + __memset(ret, KASAN_SHADOW_INIT, shadow_size); + find_vm_area(addr)->flags |= VM_KASAN; + kmemleak_ignore(ret); + return 0; + } + + return -ENOMEM; +} + +void kasan_free_shadow(const struct vm_struct *vm) +{ + if (vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); +} + #endif -- cgit From d5750edf6da759576f91ec2b57d5553985815b40 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:00:17 -0800 Subject: kasan: shadow declarations only for software modes This is a preparatory commit for the upcoming addition of a new hardware tag-based (MTE-based) KASAN mode. Group shadow-related KASAN function declarations and only define them for the two existing software modes. No functional changes for software modes. 
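For reference, the shadow mapping that these grouped declarations describe is a plain affine translation: one shadow byte tracks each 2^KASAN_SHADOW_SCALE_SHIFT-byte granule of memory, and KASAN_SHADOW_INIT is the value freshly created shadow is filled with (0, fully accessible, for generic KASAN; 0xFF, the match-all tag, for software tag-based KASAN). A minimal userspace sketch of the translation arithmetic follows; the shift of 3 matches generic KASAN, but the offset below is only an illustrative placeholder for the real arch- and config-specific constant:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins: generic KASAN maps 8 bytes of memory to one
 * shadow byte; the real offset comes from Kconfig per architecture. */
#define SHADOW_SCALE_SHIFT 3
#define SHADOW_OFFSET 0xdffffc0000000000UL

/* Same translation as kasan_mem_to_shadow() in the hunk above. */
static uintptr_t mem_to_shadow(uintptr_t addr)
{
	return (addr >> SHADOW_SCALE_SHIFT) + SHADOW_OFFSET;
}

int main(void)
{
	uintptr_t addr = 0xffff800010000000UL; /* arbitrary example address */

	printf("shadow(%#lx) = %#lx\n", (unsigned long)addr,
	       (unsigned long)mem_to_shadow(addr));
	return 0;
}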
Link: https://lkml.kernel.org/r/35126.1606402815@turing-police Link: https://lore.kernel.org/linux-arm-kernel/24105.1606397102@turing-police/ Link: https://lkml.kernel.org/r/e88d94eff94db883a65dca52e1736d80d28dd9bc.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Signed-off-by: Valdis Kletnieks Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon [valdis.kletnieks@vt.edu: fix build issue with asmlinkage] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 48 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 59538e795df4..7828436a3a99 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -11,7 +11,7 @@ struct task_struct; #ifdef CONFIG_KASAN -#include +#include #include /* kasan_data struct is used in KUnit tests for KASAN expected failures */ @@ -20,6 +20,20 @@ struct kunit_kasan_expectation { bool report_found; }; +#endif + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + +#include + +/* Software KASAN implementations use shadow memory. */ + +#ifdef CONFIG_KASAN_SW_TAGS +#define KASAN_SHADOW_INIT 0xFF +#else +#define KASAN_SHADOW_INIT 0 +#endif + extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE]; extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD]; @@ -35,6 +49,23 @@ static inline void *kasan_mem_to_shadow(const void *addr) + KASAN_SHADOW_OFFSET; } +int kasan_add_zero_shadow(void *start, unsigned long size); +void kasan_remove_zero_shadow(void *start, unsigned long size); + +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +static inline int kasan_add_zero_shadow(void *start, unsigned long size) +{ + return 0; +} +static inline void kasan_remove_zero_shadow(void *start, + unsigned long size) +{} + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +#ifdef CONFIG_KASAN + /* Enable reporting bugs after kasan_disable_current() */ extern void kasan_enable_current(void); @@ -75,9 +106,6 @@ struct kasan_cache { int free_meta_offset; }; -int kasan_add_zero_shadow(void *start, unsigned long size); -void kasan_remove_zero_shadow(void *start, unsigned long size); - size_t __ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { @@ -143,14 +171,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, return false; } -static inline int kasan_add_zero_shadow(void *start, unsigned long size) -{ - return 0; -} -static inline void kasan_remove_zero_shadow(void *start, - unsigned long size) -{} - static inline void kasan_unpoison_slab(const void *ptr) { } static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } @@ -158,8 +178,6 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } #ifdef CONFIG_KASAN_GENERIC -#define KASAN_SHADOW_INIT 0 - void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); @@ -174,8 +192,6 @@ static inline void kasan_record_aux_stack(void *ptr) {} #ifdef CONFIG_KASAN_SW_TAGS -#define KASAN_SHADOW_INIT 0xFF - void kasan_init_tags(void); void *kasan_reset_tag(const void *addr); -- cgit From 
cebd0eb29acdfc2f5e44e5f356ffcd0c44f16b4a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:00:21 -0800 Subject: kasan: rename (un)poison_shadow to (un)poison_range This is a preparatory commit for the upcoming addition of a new hardware tag-based (MTE-based) KASAN mode. The new mode won't be using shadow memory. Rename external annotation kasan_unpoison_shadow() to kasan_unpoison_range(), and introduce internal functions (un)poison_range() (without kasan_ prefix). Co-developed-by: Marco Elver Link: https://lkml.kernel.org/r/fccdcaa13dc6b2211bf363d6c6d499279a54fe3a.1606161801.git.andreyknvl@google.com Signed-off-by: Marco Elver Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 +++--- kernel/fork.c | 4 ++-- mm/kasan/common.c | 49 +++++++++++++++++++++++++++---------------------- mm/kasan/generic.c | 23 +++++++++++------------ mm/kasan/kasan.h | 3 ++- mm/kasan/tags.c | 2 +- mm/slab_common.c | 2 +- 7 files changed, 47 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 7828436a3a99..9740c06a04a1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -72,7 +72,7 @@ extern void kasan_enable_current(void); /* Disable reporting bugs for current task */ extern void kasan_disable_current(void); -void kasan_unpoison_shadow(const void *address, size_t size); +void kasan_unpoison_range(const void *address, size_t size); void kasan_unpoison_task_stack(struct task_struct *task); @@ -109,7 +109,7 @@ struct kasan_cache { size_t __ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { - kasan_unpoison_shadow(ptr, __ksize(ptr)); + kasan_unpoison_range(ptr, __ksize(ptr)); } size_t kasan_metadata_size(struct kmem_cache *cache); @@ -118,7 +118,7 @@ void kasan_restore_multi_shot(bool enabled); #else /* CONFIG_KASAN */ -static inline void kasan_unpoison_shadow(const void *address, size_t size) {} +static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_unpoison_task_stack(struct task_struct *task) {} diff --git a/kernel/fork.c b/kernel/fork.c index 41906a52a764..37720a6d04ea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -225,8 +225,8 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; - /* Clear the KASAN shadow of the stack. */ - kasan_unpoison_shadow(s->addr, THREAD_SIZE); + /* Mark stack accessible for KASAN. */ + kasan_unpoison_range(s->addr, THREAD_SIZE); /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 89e5ef9417a7..73e79a34671b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -108,7 +108,7 @@ void *memcpy(void *dest, const void *src, size_t len) * Poisons the shadow memory for 'size' bytes starting from 'addr'. * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. 
*/ -void kasan_poison_shadow(const void *address, size_t size, u8 value) +void poison_range(const void *address, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -125,7 +125,7 @@ void kasan_poison_shadow(const void *address, size_t size, u8 value) __memset(shadow_start, value, shadow_end - shadow_start); } -void kasan_unpoison_shadow(const void *address, size_t size) +void unpoison_range(const void *address, size_t size) { u8 tag = get_tag(address); @@ -136,7 +136,7 @@ void kasan_unpoison_shadow(const void *address, size_t size) */ address = reset_tag(address); - kasan_poison_shadow(address, size, tag); + poison_range(address, size, tag); if (size & KASAN_SHADOW_MASK) { u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); @@ -148,12 +148,17 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } +void kasan_unpoison_range(const void *address, size_t size) +{ + unpoison_range(address, size); +} + static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) { void *base = task_stack_page(task); size_t size = sp - base; - kasan_unpoison_shadow(base, size); + unpoison_range(base, size); } /* Unpoison the entire stack for a task. */ @@ -172,7 +177,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) */ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); - kasan_unpoison_shadow(base, watermark - base); + unpoison_range(base, watermark - base); } void kasan_alloc_pages(struct page *page, unsigned int order) @@ -186,13 +191,13 @@ void kasan_alloc_pages(struct page *page, unsigned int order) tag = random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); + unpoison_range(page_address(page), PAGE_SIZE << order); } void kasan_free_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) - kasan_poison_shadow(page_address(page), + poison_range(page_address(page), PAGE_SIZE << order, KASAN_FREE_PAGE); } @@ -284,18 +289,18 @@ void kasan_poison_slab(struct page *page) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); - kasan_poison_shadow(page_address(page), page_size(page), - KASAN_KMALLOC_REDZONE); + poison_range(page_address(page), page_size(page), + KASAN_KMALLOC_REDZONE); } void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { - kasan_unpoison_shadow(object, cache->object_size); + unpoison_range(object, cache->object_size); } void kasan_poison_object_data(struct kmem_cache *cache, void *object) { - kasan_poison_shadow(object, + poison_range(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); } @@ -408,7 +413,7 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, } rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); + poison_range(object, rounded_up_size, KASAN_KMALLOC_FREE); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) || unlikely(!(cache->flags & SLAB_KASAN))) @@ -448,9 +453,9 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, tag = assign_tag(cache, object, false, keep_tag); /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ - kasan_unpoison_shadow(set_tag(object, tag), size); - kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE); + unpoison_range(set_tag(object, tag), size); + poison_range((void *)redzone_start, redzone_end - 
redzone_start, + KASAN_KMALLOC_REDZONE); if (cache->flags & SLAB_KASAN) kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags); @@ -489,9 +494,9 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, KASAN_SHADOW_SCALE_SIZE); redzone_end = (unsigned long)ptr + page_size(page); - kasan_unpoison_shadow(ptr, size); - kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, - KASAN_PAGE_REDZONE); + unpoison_range(ptr, size); + poison_range((void *)redzone_start, redzone_end - redzone_start, + KASAN_PAGE_REDZONE); return (void *)ptr; } @@ -523,7 +528,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); return; } - kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE); + poison_range(ptr, page_size(page), KASAN_FREE_PAGE); } else { __kasan_slab_free(page->slab_cache, ptr, ip, false); } @@ -709,7 +714,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) * // vmalloc() allocates memory * // let a = area->addr * // we reach kasan_populate_vmalloc - * // and call kasan_unpoison_shadow: + * // and call unpoison_range: * STORE shadow(a), unpoison_val * ... * STORE shadow(a+99), unpoison_val x = LOAD p @@ -744,7 +749,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size) return; size = round_up(size, KASAN_SHADOW_SCALE_SIZE); - kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID); + poison_range(start, size, KASAN_VMALLOC_INVALID); } void kasan_unpoison_vmalloc(const void *start, unsigned long size) @@ -752,7 +757,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) if (!is_vmalloc_or_module_addr(start)) return; - kasan_unpoison_shadow(start, size); + unpoison_range(start, size); } static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index d341859a1b95..9fe44f9b3b30 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -202,11 +202,11 @@ static void register_global(struct kasan_global *global) { size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); - kasan_unpoison_shadow(global->beg, global->size); + unpoison_range(global->beg, global->size); - kasan_poison_shadow(global->beg + aligned_size, - global->size_with_redzone - aligned_size, - KASAN_GLOBAL_REDZONE); + poison_range(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); } void __asan_register_globals(struct kasan_global *globals, size_t size) @@ -285,13 +285,12 @@ void __asan_alloca_poison(unsigned long addr, size_t size) WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); - kasan_unpoison_shadow((const void *)(addr + rounded_down_size), - size - rounded_down_size); - kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_LEFT); - kasan_poison_shadow(right_redzone, - padding_size + KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_RIGHT); + unpoison_range((const void *)(addr + rounded_down_size), + size - rounded_down_size); + poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_LEFT); + poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_RIGHT); } EXPORT_SYMBOL(__asan_alloca_poison); @@ -301,7 +300,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) if (unlikely(!stack_top || stack_top > stack_bottom)) return; - kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); + unpoison_range(stack_top, stack_bottom - stack_top); } EXPORT_SYMBOL(__asan_allocas_unpoison); 
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index ac499456740f..42ab02c61331 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -150,7 +150,8 @@ static inline bool addr_has_shadow(const void *addr) return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } -void kasan_poison_shadow(const void *address, size_t size, u8 value); +void poison_range(const void *address, size_t size, u8 value); +void unpoison_range(const void *address, size_t size); /** * check_memory_region - Check memory region, and report if invalid access. diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 5c8b08a25715..c0b3f327812b 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) { - kasan_poison_shadow((void *)addr, size, tag); + poison_range((void *)addr, size, tag); } EXPORT_SYMBOL(__hwasan_tag_memory); diff --git a/mm/slab_common.c b/mm/slab_common.c index 2f2b55c2798e..573fbacd9ef5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1176,7 +1176,7 @@ size_t ksize(const void *objp) * We assume that ksize callers could use whole allocated area, * so we need to unpoison this area. */ - kasan_unpoison_shadow(objp, size); + kasan_unpoison_range(objp, size); return size; } EXPORT_SYMBOL(ksize); -- cgit From d73b49365ee65ac48074bdb5aa717bb4644dbbb7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:00:56 -0800 Subject: kasan, arm64: only use kasan_depth for software modes This is a preparatory commit for the upcoming addition of a new hardware tag-based (MTE-based) KASAN mode. Hardware tag-based KASAN won't use kasan_depth. Only define and use it when one of the software KASAN modes is enabled. No functional changes for software modes. Link: https://lkml.kernel.org/r/e16f15aeda90bc7fb4dfc2e243a14b74cc5c8219.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Reviewed-by: Catalin Marinas Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/kasan_init.c | 11 ++++++++--- include/linux/kasan.h | 18 +++++++++--------- include/linux/sched.h | 2 +- init/init_task.c | 2 +- mm/kasan/common.c | 2 ++ mm/kasan/report.c | 2 ++ 6 files changed, 23 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index ffeb80d5aa8d..5172799f831f 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -273,17 +273,22 @@ static void __init kasan_init_shadow(void) cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); } +static void __init kasan_init_depth(void) +{ + init_task.kasan_depth = 0; +} + #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) */ static inline void __init kasan_init_shadow(void) { } +static inline void __init kasan_init_depth(void) { } + #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void __init kasan_init(void) { kasan_init_shadow(); - - /* At this point kasan is fully initialized.
Enable error messages */ - init_task.kasan_depth = 0; + kasan_init_depth(); pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9740c06a04a1..b272960e8396 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -52,6 +52,12 @@ static inline void *kasan_mem_to_shadow(const void *addr) int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); +/* Enable reporting bugs after kasan_disable_current() */ +extern void kasan_enable_current(void); + +/* Disable reporting bugs for current task */ +extern void kasan_disable_current(void); + #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline int kasan_add_zero_shadow(void *start, unsigned long size) @@ -62,16 +68,13 @@ static inline void kasan_remove_zero_shadow(void *start, unsigned long size) {} +static inline void kasan_enable_current(void) {} +static inline void kasan_disable_current(void) {} + #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ #ifdef CONFIG_KASAN -/* Enable reporting bugs after kasan_disable_current() */ -extern void kasan_enable_current(void); - -/* Disable reporting bugs for current task */ -extern void kasan_disable_current(void); - void kasan_unpoison_range(const void *address, size_t size); void kasan_unpoison_task_stack(struct task_struct *task); @@ -122,9 +125,6 @@ static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_unpoison_task_stack(struct task_struct *task) {} -static inline void kasan_enable_current(void) {} -static inline void kasan_disable_current(void) {} - static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} diff --git a/include/linux/sched.h b/include/linux/sched.h index 51d535b69bd6..6e3a5eeec509 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1234,7 +1234,7 @@ struct task_struct { u64 timer_slack_ns; u64 default_timer_slack_ns; -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif diff --git a/init/init_task.c b/init/init_task.c index 15f6eb93a04f..8a992d73e6fb 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -176,7 +176,7 @@ struct task_struct init_task .numa_group = NULL, .numa_faults = NULL, #endif -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) .kasan_depth = 1, #endif #ifdef CONFIG_KCSAN diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 663ffa71cd20..d5f23b2f170a 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -46,6 +46,7 @@ void kasan_set_track(struct kasan_track *track, gfp_t flags) track->stack = kasan_save_stack(flags); } +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_enable_current(void) { current->kasan_depth++; @@ -55,6 +56,7 @@ void kasan_disable_current(void) { current->kasan_depth--; } +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void kasan_unpoison_range(const void *address, size_t size) { diff --git a/mm/kasan/report.c b/mm/kasan/report.c index d140d26cfb31..914ab5cfc3ea 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -292,8 +292,10 @@ static void print_shadow_for_address(const void *addr) static bool report_enabled(void) { +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) if (current->kasan_depth) return false; +#endif if (test_bit(KASAN_BIT_MULTI_SHOT, 
&kasan_flags)) return true; return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); -- cgit From 60a3a5fe950f4e6c02e9fc6676dc96de043ed743 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:01:03 -0800 Subject: kasan, arm64: rename kasan_init_tags and mark as __init Rename kasan_init_tags() to kasan_init_sw_tags() as the upcoming hardware tag-based KASAN mode will have its own initialization routine. Also similarly to kasan_init() mark kasan_init_tags() as __init. Link: https://lkml.kernel.org/r/71e52af72a09f4b50c8042f16101c60e50649fbb.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Catalin Marinas Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/setup.c | 2 +- arch/arm64/mm/kasan_init.c | 2 +- include/linux/kasan.h | 4 ++-- mm/kasan/sw_tags.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index c44eb4b80163..c18aacde8bb0 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -358,7 +358,7 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) smp_build_mpidr_hash(); /* Init percpu seeds for random tags after cpus are set up. */ - kasan_init_tags(); + kasan_init_sw_tags(); #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index e35ce04beed1..d8e66c78440e 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -283,7 +283,7 @@ void __init kasan_init(void) kasan_init_shadow(); kasan_init_depth(); #if defined(CONFIG_KASAN_GENERIC) - /* CONFIG_KASAN_SW_TAGS also requires kasan_init_tags(). */ + /* CONFIG_KASAN_SW_TAGS also requires kasan_init_sw_tags(). */ pr_info("KernelAddressSanitizer initialized\n"); #endif } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b272960e8396..d7042d129dc1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -192,7 +192,7 @@ static inline void kasan_record_aux_stack(void *ptr) {} #ifdef CONFIG_KASAN_SW_TAGS -void kasan_init_tags(void); +void __init kasan_init_sw_tags(void); void *kasan_reset_tag(const void *addr); @@ -201,7 +201,7 @@ bool kasan_report(unsigned long addr, size_t size, #else /* CONFIG_KASAN_SW_TAGS */ -static inline void kasan_init_tags(void) { } +static inline void kasan_init_sw_tags(void) { } static inline void *kasan_reset_tag(const void *addr) { diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 9445cf4ccdc8..7317d5229b2b 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -35,7 +35,7 @@ static DEFINE_PER_CPU(u32, prng_state); -void kasan_init_tags(void) +void __init kasan_init_sw_tags(void) { int cpu; -- cgit From 0fea6e9af889f1a4e072f5de999e07fe6859fc88 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:02:06 -0800 Subject: kasan, arm64: expand CONFIG_KASAN checks Some #ifdef CONFIG_KASAN checks are only relevant for software KASAN modes (either related to shadow memory or compiler instrumentation). Expand those into CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS. 
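The conversion this patch performs is mechanical and always has the same shape. A schematic before/after sketch; reserve_shadow_region() is a made-up placeholder rather than one of the real call sites:

/* Before: gated on any KASAN mode, although the code only makes
 * sense when shadow memory exists. */
#ifdef CONFIG_KASAN
void reserve_shadow_region(void);
#endif

/* After: gated on the two software (shadow-based) modes only, so the
 * upcoming CONFIG_KASAN_HW_TAGS mode compiles this out. */
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
void reserve_shadow_region(void);
#endif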
Link: https://lkml.kernel.org/r/e6971e432dbd72bb897ff14134ebb7e169bdcf0c.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Reviewed-by: Catalin Marinas Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/arm64/Makefile | 2 +- arch/arm64/include/asm/assembler.h | 2 +- arch/arm64/include/asm/memory.h | 2 +- arch/arm64/include/asm/string.h | 5 +++-- arch/arm64/kernel/head.S | 2 +- arch/arm64/kernel/image-vars.h | 2 +- arch/arm64/kernel/kaslr.c | 3 ++- arch/arm64/kernel/module.c | 6 ++++-- arch/arm64/mm/ptdump.c | 6 +++--- include/linux/kasan-checks.h | 2 +- include/linux/kasan.h | 7 ++++--- include/linux/moduleloader.h | 3 ++- include/linux/string.h | 2 +- mm/ptdump.c | 13 ++++++++----- scripts/Makefile.lib | 2 ++ 16 files changed, 36 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a33718f8b62f..9386b108b132 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -334,7 +334,7 @@ config BROKEN_GAS_INST config KASAN_SHADOW_OFFSET hex - depends on KASAN + depends on KASAN_GENERIC || KASAN_SW_TAGS default 0xdfff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS default 0xdfffc00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS default 0xdffffe0000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 05f46a60b245..6be9b3750250 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -137,7 +137,7 @@ head-y := arch/arm64/kernel/head.o ifeq ($(CONFIG_KASAN_SW_TAGS), y) KASAN_SHADOW_SCALE_SHIFT := 4 -else +else ifeq ($(CONFIG_KASAN_GENERIC), y) KASAN_SHADOW_SCALE_SHIFT := 3 endif diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ddbe6bf00e33..bf125c591116 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -473,7 +473,7 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU #define NOKPROBE(x) #endif -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define EXPORT_SYMBOL_NOKASAN(name) #else #define EXPORT_SYMBOL_NOKASAN(name) EXPORT_SYMBOL(name) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 3bc08e6cf82e..cd671fb6707c 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -72,7 +72,7 @@ * address space for the shadow region respectively. They can bloat the stack * significantly, so double the (minimum) stack size when they are in use. 
*/ -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) #define KASAN_SHADOW_END ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) \ + KASAN_SHADOW_OFFSET) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index b31e8e87a0db..3a3264ff47b9 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -5,7 +5,7 @@ #ifndef __ASM_STRING_H #define __ASM_STRING_H -#ifndef CONFIG_KASAN +#if !(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) #define __HAVE_ARCH_STRRCHR extern char *strrchr(const char *, int c); @@ -48,7 +48,8 @@ extern void *__memset(void *, int, __kernel_size_t); void memcpy_flushcache(void *dst, const void *src, size_t cnt); #endif -#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(__SANITIZE_ADDRESS__) /* * For files that are not instrumented (e.g. mm/slub.c) we diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 42b23ce679dc..a0dc987724ed 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -433,7 +433,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) bl __pi_memset dsb ishst // Make zero page visible to PTW -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif #ifdef CONFIG_RANDOMIZE_BASE diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 39289d75118d..f676243abac6 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -37,7 +37,7 @@ __efistub_strncmp = __pi_strncmp; __efistub_strrchr = __pi_strrchr; __efistub___clean_dcache_area_poc = __pi___clean_dcache_area_poc; -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) __efistub___memcpy = __pi_memcpy; __efistub___memmove = __pi_memmove; __efistub___memset = __pi_memset; diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 0921aa1520b0..1c74c45b9494 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -161,7 +161,8 @@ u64 __init kaslr_early_init(u64 dt_phys) /* use the top 16 bits to randomize the linear region */ memstart_offset_seed = seed >> 48; - if (IS_ENABLED(CONFIG_KASAN)) + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) /* * KASAN does not expect the module region to intersect the * vmalloc region, since shadow memory is allocated for each diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 2a1ad95d9b2c..fe21e0f06492 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -30,7 +30,8 @@ void *module_alloc(unsigned long size) if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) gfp_mask |= __GFP_NOWARN; - if (IS_ENABLED(CONFIG_KASAN)) + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) /* don't exceed the static module region - see below */ module_alloc_end = MODULES_END; @@ -39,7 +40,8 @@ void *module_alloc(unsigned long size) NUMA_NO_NODE, __builtin_return_address(0)); if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && - !IS_ENABLED(CONFIG_KASAN)) + !IS_ENABLED(CONFIG_KASAN_GENERIC) && + !IS_ENABLED(CONFIG_KASAN_SW_TAGS)) /* * KASAN can only deal with module allocations being served * from the reserved module region, since the remainder of diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 807dc634bbd2..04137a8f3d2d 100644 --- 
a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -29,7 +29,7 @@ enum address_markers_idx { PAGE_OFFSET_NR = 0, PAGE_END_NR, -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) KASAN_START_NR, #endif }; @@ -37,7 +37,7 @@ enum address_markers_idx { static struct addr_marker address_markers[] = { { PAGE_OFFSET, "Linear Mapping start" }, { 0 /* PAGE_END */, "Linear Mapping end" }, -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) { 0 /* KASAN_SHADOW_START */, "Kasan shadow start" }, { KASAN_SHADOW_END, "Kasan shadow end" }, #endif @@ -383,7 +383,7 @@ void ptdump_check_wx(void) static int ptdump_init(void) { address_markers[PAGE_END_NR].start_address = PAGE_END; -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) address_markers[KASAN_START_NR].start_address = KASAN_SHADOW_START; #endif ptdump_initialize(); diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index ac6aba632f2d..ca5e89fb10d3 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -9,7 +9,7 @@ * even in compilation units that selectively disable KASAN, but must use KASAN * to validate access to an address. Never use these in header files! */ -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bool __kasan_check_read(const volatile void *p, unsigned int size); bool __kasan_check_write(const volatile void *p, unsigned int size); #else diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d7042d129dc1..b1381ee6922a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -238,7 +238,8 @@ static inline void kasan_release_vmalloc(unsigned long start, #endif /* CONFIG_KASAN_VMALLOC */ -#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC) +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(CONFIG_KASAN_VMALLOC) /* * These functions provide a special case to support backing module @@ -248,12 +249,12 @@ static inline void kasan_release_vmalloc(unsigned long start, int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); -#else /* CONFIG_KASAN && !CONFIG_KASAN_VMALLOC */ +#else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} -#endif /* CONFIG_KASAN && !CONFIG_KASAN_VMALLOC */ +#endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ #ifdef CONFIG_KASAN_INLINE void kasan_non_canonical_hook(unsigned long addr); diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 4fa67a8b2265..9e09d11ffe5b 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -96,7 +96,8 @@ void module_arch_cleanup(struct module *mod); /* Any cleanup before freeing mod->module_init */ void module_arch_freeing_init(struct module *mod); -#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC) +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(CONFIG_KASAN_VMALLOC) #include #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) #else diff --git a/include/linux/string.h b/include/linux/string.h index 1cd63a8a23ab..4fcfb56abcf5 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -267,7 +267,7 @@ void __write_overflow(void) __compiletime_error("detected write beyond 
size of o #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); diff --git a/mm/ptdump.c b/mm/ptdump.c index ba88ec43ff21..4354c1422d57 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -4,7 +4,7 @@ #include #include -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) /* * This is an optimization for KASAN=y case. Since all kasan page tables * eventually point to the kasan_early_shadow_page we could call note_page() @@ -31,7 +31,8 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, struct ptdump_state *st = walk->private; pgd_t val = READ_ONCE(*pgd); -#if CONFIG_PGTABLE_LEVELS > 4 && defined(CONFIG_KASAN) +#if CONFIG_PGTABLE_LEVELS > 4 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d))) return note_kasan_page_table(walk, addr); #endif @@ -51,7 +52,8 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, struct ptdump_state *st = walk->private; p4d_t val = READ_ONCE(*p4d); -#if CONFIG_PGTABLE_LEVELS > 3 && defined(CONFIG_KASAN) +#if CONFIG_PGTABLE_LEVELS > 3 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud))) return note_kasan_page_table(walk, addr); #endif @@ -71,7 +73,8 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, struct ptdump_state *st = walk->private; pud_t val = READ_ONCE(*pud); -#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN) +#if CONFIG_PGTABLE_LEVELS > 2 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd))) return note_kasan_page_table(walk, addr); #endif @@ -91,7 +94,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, struct ptdump_state *st = walk->private; pmd_t val = READ_ONCE(*pmd); -#if defined(CONFIG_KASAN) +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte))) return note_kasan_page_table(walk, addr); #endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 94133708889d..213677a5ed33 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -148,10 +148,12 @@ endif # we don't want to check (depends on variables KASAN_SANITIZE_obj.o, KASAN_SANITIZE) # ifeq ($(CONFIG_KASAN),y) +ifneq ($(CONFIG_KASAN_HW_TAGS),y) _c_flags += $(if $(patsubst n%,, \ $(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)y), \ $(CFLAGS_KASAN), $(CFLAGS_KASAN_NOSANITIZE)) endif +endif ifeq ($(CONFIG_UBSAN),y) _c_flags += $(if $(patsubst n%,, \ -- cgit From 2e903b91479782b7dedd869603423d77e079d3de Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:02:10 -0800 Subject: kasan, arm64: implement HW_TAGS runtime Provide implementation of KASAN functions required for the hardware tag-based mode. Those include core functions for memory and pointer tagging (hw_tags.c) and bug reporting (report_hw_tags.c). Also adapt common KASAN code to support the new mode.
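In the hardware mode, validity checks reduce to tag comparisons instead of shadow lookups. A self-contained sketch of the matching rule that check_invalid_free() in the new hw_tags.c applies (see the hunk below); the constants here mirror KASAN_TAG_KERNEL and KASAN_TAG_INVALID, and the mem_tag argument stands in for the value hw_get_mem_tag() would read via MTE:

#include <stdbool.h>
#include <stdint.h>

#define TAG_KERNEL  0xffu /* native, untagged pointers always match */
#define TAG_INVALID 0xfeu /* granule marked as freed or a redzone */

/* A free is reported as invalid if the memory is already marked
 * invalid, or if a tagged pointer disagrees with the memory tag. */
bool invalid_free(uint8_t ptr_tag, uint8_t mem_tag)
{
	return mem_tag == TAG_INVALID ||
	       (ptr_tag != TAG_KERNEL && ptr_tag != mem_tag);
}

Poisoning and unpoisoning shrink in the same way: as the hw_tags.c hunk below shows, both become hw_set_mem_tag_range() calls over KASAN_GRANULE_SIZE-aligned ranges.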
Link: https://lkml.kernel.org/r/cfd0fbede579a6b66755c98c88c108e54f9c56bf.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Acked-by: Catalin Marinas Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/memory.h | 4 +- arch/arm64/kernel/cpufeature.c | 3 ++ arch/arm64/kernel/smp.c | 2 + include/linux/kasan.h | 24 ++++++++---- include/linux/mm.h | 2 +- include/linux/page-flags-layout.h | 2 +- mm/kasan/Makefile | 5 +++ mm/kasan/common.c | 15 ++++---- mm/kasan/hw_tags.c | 80 +++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 19 ++++++++-- mm/kasan/report_hw_tags.c | 42 ++++++++++++++++++++ mm/kasan/report_sw_tags.c | 2 +- mm/kasan/shadow.c | 2 +- mm/kasan/sw_tags.c | 2 +- 14 files changed, 178 insertions(+), 26 deletions(-) create mode 100644 mm/kasan/hw_tags.c create mode 100644 mm/kasan/report_hw_tags.c (limited to 'include/linux') diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index cd671fb6707c..18fce223b67b 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -214,7 +214,7 @@ static inline unsigned long kaslr_offset(void) (__force __typeof__(addr))__addr; \ }) -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) #define __tag_shifted(tag) ((u64)(tag) << 56) #define __tag_reset(addr) __untagged_addr(addr) #define __tag_get(addr) (__u8)((u64)(addr) >> 56) @@ -222,7 +222,7 @@ static inline unsigned long kaslr_offset(void) #define __tag_shifted(tag) 0UL #define __tag_reset(addr) (addr) #define __tag_get(addr) 0 -#endif /* CONFIG_KASAN_SW_TAGS */ +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline const void *__tag_set(const void *addr, u8 tag) { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d87cfc6246e0..7ffb5f1d8b68 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -1710,6 +1711,8 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) cleared_zero_page = true; mte_clear_page_tags(lm_alias(empty_zero_page)); } + + kasan_init_hw_tags_cpu(); } #endif /* CONFIG_ARM64_MTE */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 2499b895efea..19b1705ae5cb 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -462,6 +462,8 @@ void __init smp_prepare_boot_cpu(void) /* Conditionally switch to GIC PMR for interrupt masking */ if (system_uses_irq_prio_masking()) init_gic_priority_masking(); + + kasan_init_hw_tags(); } static u64 __init of_get_cpu_mpidr(struct device_node *dn) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b1381ee6922a..d22ec4c9c1bd 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -190,25 +190,35 @@ static inline void kasan_record_aux_stack(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ -#ifdef CONFIG_KASAN_SW_TAGS - -void __init kasan_init_sw_tags(void); +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) void *kasan_reset_tag(const void *addr); bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -#else /* CONFIG_KASAN_SW_TAGS */ - -static inline void kasan_init_sw_tags(void) { } 
+#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline void *kasan_reset_tag(const void *addr) { return (void *)addr; } -#endif /* CONFIG_KASAN_SW_TAGS */ +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS*/ + +#ifdef CONFIG_KASAN_SW_TAGS +void __init kasan_init_sw_tags(void); +#else +static inline void kasan_init_sw_tags(void) { } +#endif + +#ifdef CONFIG_KASAN_HW_TAGS +void kasan_init_hw_tags_cpu(void); +void __init kasan_init_hw_tags(void); +#else +static inline void kasan_init_hw_tags_cpu(void) { } +static inline void kasan_init_hw_tags(void) { } +#endif #ifdef CONFIG_KASAN_VMALLOC diff --git a/include/linux/mm.h b/include/linux/mm.h index 362579ad0758..024ec0a00c72 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1421,7 +1421,7 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) } #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) static inline u8 page_kasan_tag(const struct page *page) { return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index e200eef6a7fd..7d4ec26d8a3e 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -77,7 +77,7 @@ #define LAST_CPUPID_SHIFT 0 #endif -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) #define KASAN_TAG_WIDTH 8 #else #define KASAN_TAG_WIDTH 0 diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index f1d68a34f3c9..9fe39a66388a 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -10,8 +10,10 @@ CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report_generic.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report_hw_tags.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report_sw_tags.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_hw_tags.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sw_tags.o = $(CC_FLAGS_FTRACE) # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 @@ -27,10 +29,13 @@ CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_report_generic.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_report_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) obj-$(CONFIG_KASAN) := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o +obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d5f23b2f170a..02613883846e 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -118,7 +118,7 @@ void kasan_free_pages(struct page *page, unsigned int order) */ static inline unsigned int optimal_redzone(unsigned int object_size) { - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) return 0; return @@ -183,14 +183,14 @@ size_t kasan_metadata_size(struct kmem_cache *cache) struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, const void *object) { - return (void *)object + cache->kasan_info.alloc_meta_offset; + return (void *)reset_tag(object) + 
cache->kasan_info.alloc_meta_offset; } struct kasan_free_meta *get_free_info(struct kmem_cache *cache, const void *object) { BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - return (void *)object + cache->kasan_info.free_meta_offset; + return (void *)reset_tag(object) + cache->kasan_info.free_meta_offset; } void kasan_poison_slab(struct page *page) @@ -272,9 +272,8 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, alloc_info = get_alloc_info(cache, object); __memset(alloc_info, 0, sizeof(*alloc_info)); - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - object = set_tag(object, - assign_tag(cache, object, true, false)); + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) || IS_ENABLED(CONFIG_KASAN_HW_TAGS)) + object = set_tag(object, assign_tag(cache, object, true, false)); return (void *)object; } @@ -342,10 +341,10 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, redzone_end = round_up((unsigned long)object + cache->object_size, KASAN_GRANULE_SIZE); - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) || IS_ENABLED(CONFIG_KASAN_HW_TAGS)) tag = assign_tag(cache, object, false, keep_tag); - /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ + /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */ unpoison_range(set_tag(object, tag), size); poison_range((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c new file mode 100644 index 000000000000..66419e908e21 --- /dev/null +++ b/mm/kasan/hw_tags.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains core hardware tag-based KASAN code. + * + * Copyright (c) 2020 Google, Inc. + * Author: Andrey Konovalov + */ + +#define pr_fmt(fmt) "kasan: " fmt + +#include +#include +#include +#include +#include +#include + +#include "kasan.h" + +/* kasan_init_hw_tags_cpu() is called for each CPU. */ +void kasan_init_hw_tags_cpu(void) +{ + hw_init_tags(KASAN_TAG_MAX); + hw_enable_tagging(); +} + +/* kasan_init_hw_tags() is called once on boot CPU. 
*/ +void __init kasan_init_hw_tags(void) +{ + pr_info("KernelAddressSanitizer initialized\n"); +} + +void *kasan_reset_tag(const void *addr) +{ + return reset_tag(addr); +} + +void poison_range(const void *address, size_t size, u8 value) +{ + hw_set_mem_tag_range(reset_tag(address), + round_up(size, KASAN_GRANULE_SIZE), value); +} + +void unpoison_range(const void *address, size_t size) +{ + hw_set_mem_tag_range(reset_tag(address), + round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); +} + +u8 random_tag(void) +{ + return hw_get_random_tag(); +} + +bool check_invalid_free(void *addr) +{ + u8 ptr_tag = get_tag(addr); + u8 mem_tag = hw_get_mem_tag(addr); + + return (mem_tag == KASAN_TAG_INVALID) || + (ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag); +} + +void kasan_set_free_info(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = get_alloc_info(cache, object); + kasan_set_track(&alloc_meta->free_track[0], GFP_NOWAIT); +} + +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = get_alloc_info(cache, object); + return &alloc_meta->free_track[0]; +} diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 92cb2c16e314..64560cc71191 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -154,6 +154,11 @@ struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, struct kasan_free_meta *get_free_info(struct kmem_cache *cache, const void *object); +void poison_range(const void *address, size_t size, u8 value); +void unpoison_range(const void *address, size_t size); + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) @@ -165,9 +170,6 @@ static inline bool addr_has_metadata(const void *addr) return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } -void poison_range(const void *address, size_t size, u8 value); -void unpoison_range(const void *address, size_t size); - /** * check_memory_region - Check memory region, and report if invalid access. * @addr: the accessed address @@ -179,6 +181,15 @@ void unpoison_range(const void *address, size_t size); bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip); +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +static inline bool addr_has_metadata(const void *addr) +{ + return true; +} + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + bool check_invalid_free(void *addr); void *find_first_bad_addr(void *addr, size_t size); @@ -215,7 +226,7 @@ static inline void quarantine_reduce(void) { } static inline void quarantine_remove_cache(struct kmem_cache *cache) { } #endif -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) void print_tags(u8 addr_tag, const void *addr); diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c new file mode 100644 index 000000000000..da543eb832cd --- /dev/null +++ b/mm/kasan/report_hw_tags.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains hardware tag-based KASAN specific error reporting code. + * + * Copyright (c) 2020 Google, Inc. 
+ * Author: Andrey Konovalov + */ + +#include +#include +#include +#include +#include +#include + +#include "kasan.h" + +const char *get_bug_type(struct kasan_access_info *info) +{ + return "invalid-access"; +} + +void *find_first_bad_addr(void *addr, size_t size) +{ + return reset_tag(addr); +} + +void metadata_fetch_row(char *buffer, void *row) +{ + int i; + + for (i = 0; i < META_BYTES_PER_ROW; i++) + buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE); +} + +void print_tags(u8 addr_tag, const void *addr) +{ + u8 memory_tag = hw_get_mem_tag((void *)addr); + + pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", + addr_tag, memory_tag); +} diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index add2dfe6169c..aebc44a29e83 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This file contains tag-based KASAN specific error reporting code. + * This file contains software tag-based KASAN specific error reporting code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index ba84e5106585..ac6a5f57df33 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -107,7 +107,7 @@ void unpoison_range(const void *address, size_t size) if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) *shadow = tag; - else + else /* CONFIG_KASAN_GENERIC */ *shadow = size & KASAN_GRANULE_MASK; } } diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 7317d5229b2b..a518483f3965 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This file contains core tag-based KASAN code. + * This file contains core software tag-based KASAN code. * * Copyright (c) 2018 Google, Inc. * Author: Andrey Konovalov -- cgit From d56a9ef84bd0e1e8fba7a837ab12a4ec8476579f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:02:42 -0800 Subject: kasan, arm64: unpoison stack only with CONFIG_KASAN_STACK There's a config option CONFIG_KASAN_STACK that has to be enabled for KASAN to use stack instrumentation and perform validity checks for stack variables. There's no need to unpoison the stack when CONFIG_KASAN_STACK is not enabled. Only call kasan_unpoison_task_stack[_below]() when CONFIG_KASAN_STACK is enabled. Note that CONFIG_KASAN_STACK is an option that is currently always defined when CONFIG_KASAN is enabled, and therefore has to be tested with #if instead of #ifdef.
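As a minimal standalone illustration of the difference (the 0 value below is assumed for the sake of the example, not taken from a real config):

#define CONFIG_KASAN_STACK 0	/* always defined under CONFIG_KASAN: 1 or 0 */

#ifdef CONFIG_KASAN_STACK
	/* Wrong guard: taken even when the option's value is 0,
	 * because the macro itself is defined. */
#endif

#if CONFIG_KASAN_STACK
	/* Right guard: taken only when the option's value is non-zero. */
#endif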
Link: https://lkml.kernel.org/r/d09dd3f8abb388da397fd11598c5edeaa83fe559.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/If8a891e9fe01ea543e00b576852685afec0887e3 Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Acked-by: Catalin Marinas Reviewed-by: Dmitry Vyukov Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/sleep.S | 2 +- arch/x86/kernel/acpi/wakeup_64.S | 2 +- include/linux/kasan.h | 10 ++++++---- mm/kasan/common.c | 2 ++ 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 4be7f7eed875..6bdef7362c0e 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -133,7 +133,7 @@ SYM_FUNC_START(_cpu_resume) */ bl cpu_do_resume -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK mov x0, sp bl kasan_unpoison_task_stack_below #endif diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index c8daa92f38dc..5d3a0b8fd379 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -112,7 +112,7 @@ SYM_FUNC_START(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK /* * The suspend path may have poisoned some areas deeper in the stack, * which we now need to unpoison. diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d22ec4c9c1bd..e638255ce906 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -77,8 +77,6 @@ static inline void kasan_disable_current(void) {} void kasan_unpoison_range(const void *address, size_t size); -void kasan_unpoison_task_stack(struct task_struct *task); - void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); @@ -123,8 +121,6 @@ void kasan_restore_multi_shot(bool enabled); static inline void kasan_unpoison_range(const void *address, size_t size) {} -static inline void kasan_unpoison_task_stack(struct task_struct *task) {} - static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} @@ -176,6 +172,12 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } #endif /* CONFIG_KASAN */ +#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK +void kasan_unpoison_task_stack(struct task_struct *task); +#else +static inline void kasan_unpoison_task_stack(struct task_struct *task) {} +#endif + #ifdef CONFIG_KASAN_GENERIC void kasan_cache_shrink(struct kmem_cache *cache); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6c38fd0a9e5c..2754ce0c8334 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -63,6 +63,7 @@ void kasan_unpoison_range(const void *address, size_t size) unpoison_range(address, size); } +#if CONFIG_KASAN_STACK static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) { void *base = task_stack_page(task); @@ -89,6 +90,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) unpoison_range(base, watermark - base); } +#endif /* CONFIG_KASAN_STACK */ void kasan_alloc_pages(struct page *page, unsigned int order) { -- cgit From c0054c565ae598073d6c27762c7d4f7de49a45d9 Mon Sep 17 00:00:00 2001 
From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:02:52 -0800 Subject: kasan: inline kasan_reset_tag for tag-based modes Using kasan_reset_tag() currently results in a function call. As it's called quite often from the allocator code, this leads to a noticeable slowdown. Move it to include/linux/kasan.h and turn it into a static inline function. Also remove the now unneeded reset_tag() internal KASAN macro and use kasan_reset_tag() instead. Link: https://lkml.kernel.org/r/6940383a3a9dfb416134d338d8fac97a9ebb8686.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/I4d2061acfe91d480a75df00b07c22d8494ef14b5 Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Dmitry Vyukov Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 5 ++++- mm/kasan/common.c | 6 +++--- mm/kasan/hw_tags.c | 9 ++------- mm/kasan/kasan.h | 4 ---- mm/kasan/report.c | 4 ++-- mm/kasan/report_hw_tags.c | 2 +- mm/kasan/report_sw_tags.c | 4 ++-- mm/kasan/shadow.c | 4 ++-- mm/kasan/sw_tags.c | 9 ++------- 9 files changed, 18 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index e638255ce906..3bb72de94f90 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -194,7 +194,10 @@ static inline void kasan_record_aux_stack(void *ptr) {} #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) -void *kasan_reset_tag(const void *addr); +static inline void *kasan_reset_tag(const void *addr) +{ + return (void *)arch_kasan_reset_tag(addr); +} bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b71dfe7c5059..780ec27459ab 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -179,14 +179,14 @@ size_t kasan_metadata_size(struct kmem_cache *cache) struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object) { - return (void *)reset_tag(object) + cache->kasan_info.alloc_meta_offset; + return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset; } struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object) { BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - return (void *)reset_tag(object) + cache->kasan_info.free_meta_offset; + return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset; } void kasan_poison_slab(struct page *page) @@ -283,7 +283,7 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, tag = get_tag(object); tagged_object = object; - object = reset_tag(object); + object = kasan_reset_tag(object); if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != object)) { diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index cb849c8da978..227599e54e8e 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -30,20 +30,15 @@ void __init kasan_init_hw_tags(void) pr_info("KernelAddressSanitizer initialized\n"); } -void *kasan_reset_tag(const void *addr) -{ - return reset_tag(addr); -} - void poison_range(const void *address, size_t size, u8 value) { - hw_set_mem_tag_range(reset_tag(address), + hw_set_mem_tag_range(kasan_reset_tag(address), round_up(size, KASAN_GRANULE_SIZE), value); } void unpoison_range(const void *address, size_t size) { - 
hw_set_mem_tag_range(reset_tag(address), + hw_set_mem_tag_range(kasan_reset_tag(address), round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 0eab7e4cecb8..5e8cd2080369 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -248,15 +248,11 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) return addr; } #endif -#ifndef arch_kasan_reset_tag -#define arch_kasan_reset_tag(addr) ((void *)(addr)) -#endif #ifndef arch_kasan_get_tag #define arch_kasan_get_tag(addr) 0 #endif #define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag))) -#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr)) #define get_tag(addr) arch_kasan_get_tag(addr) #ifdef CONFIG_KASAN_HW_TAGS diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 85ce2cb2cd2b..00c590efdaea 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -328,7 +328,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) unsigned long flags; u8 tag = get_tag(object); - object = reset_tag(object); + object = kasan_reset_tag(object); #if IS_ENABLED(CONFIG_KUNIT) if (current->kunit_test) @@ -361,7 +361,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, disable_trace_on_warning(); tagged_addr = (void *)addr; - untagged_addr = reset_tag(tagged_addr); + untagged_addr = kasan_reset_tag(tagged_addr); info.access_addr = tagged_addr; if (addr_has_metadata(untagged_addr)) diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index da543eb832cd..57114f0e14d1 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -22,7 +22,7 @@ const char *get_bug_type(struct kasan_access_info *info) void *find_first_bad_addr(void *addr, size_t size) { - return reset_tag(addr); + return kasan_reset_tag(addr); } void metadata_fetch_row(char *buffer, void *row) diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 317100fd95b9..7604b46239d4 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -41,7 +41,7 @@ const char *get_bug_type(struct kasan_access_info *info) int i; tag = get_tag(info->access_addr); - addr = reset_tag(info->access_addr); + addr = kasan_reset_tag(info->access_addr); page = kasan_addr_to_page(addr); if (page && PageSlab(page)) { cache = page->slab_cache; @@ -72,7 +72,7 @@ const char *get_bug_type(struct kasan_access_info *info) void *find_first_bad_addr(void *addr, size_t size) { u8 tag = get_tag(addr); - void *p = reset_tag(addr); + void *p = kasan_reset_tag(addr); void *end = p + size; while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p)) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index ac6a5f57df33..44a2b748f9d3 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -81,7 +81,7 @@ void poison_range(const void *address, size_t size, u8 value) * some of the callers (e.g. kasan_poison_object_data) pass tagged * addresses to this function. */ - address = reset_tag(address); + address = kasan_reset_tag(address); shadow_start = kasan_mem_to_shadow(address); shadow_end = kasan_mem_to_shadow(address + size); @@ -98,7 +98,7 @@ void unpoison_range(const void *address, size_t size) * some of the callers (e.g. kasan_unpoison_object_data) pass tagged * addresses to this function. 
*/ - address = reset_tag(address); + address = kasan_reset_tag(address); poison_range(address, size, tag); diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 6d7648cc3b98..e17de2619bbf 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -67,11 +67,6 @@ u8 random_tag(void) return (u8)(state % (KASAN_TAG_MAX + 1)); } -void *kasan_reset_tag(const void *addr) -{ - return reset_tag(addr); -} - bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { @@ -107,7 +102,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, if (tag == KASAN_TAG_KERNEL) return true; - untagged_addr = reset_tag((const void *)addr); + untagged_addr = kasan_reset_tag((const void *)addr); if (unlikely(untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { return !kasan_report(addr, size, write, ret_ip); @@ -126,7 +121,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, bool check_invalid_free(void *addr) { u8 tag = get_tag(addr); - u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(reset_tag(addr))); + u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr))); return (shadow_byte == KASAN_TAG_INVALID) || (tag != KASAN_TAG_KERNEL && tag != shadow_byte); -- cgit From bffe690708c8b4fdb8f0bff8ff22b347fc6c709a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:02:59 -0800 Subject: kasan: open-code kasan_unpoison_slab There's the external annotation kasan_unpoison_slab() that is currently defined as static inline and uses kasan_unpoison_range(). Open-code this function in mempool.c. Otherwise with an upcoming change this function will result in an unnecessary function call. Link: https://lkml.kernel.org/r/131a6694a978a9a8b150187e539eecc8bcbf759b.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/Ia7c8b659f79209935cbaab3913bf7f082cc43a0e Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 ------ mm/mempool.c | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 3bb72de94f90..7350de3e9fe4 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -107,11 +107,6 @@ struct kasan_cache { int free_meta_offset; }; -size_t __ksize(const void *); -static inline void kasan_unpoison_slab(const void *ptr) -{ - kasan_unpoison_range(ptr, __ksize(ptr)); -} size_t kasan_metadata_size(struct kmem_cache *cache); bool kasan_save_enable_multi_shot(void); @@ -167,7 +162,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, return false; } -static inline void kasan_unpoison_slab(const void *ptr) { } static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } #endif /* CONFIG_KASAN */ diff --git a/mm/mempool.c b/mm/mempool.c index f473cdddaff0..583a9865b181 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -112,7 +112,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_unpoison_slab(element); + kasan_unpoison_range(element, __ksize(element)); else if 
(pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } -- cgit From 34303244f2615add92076a4bf2d4f39323bde4f2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:03:10 -0800 Subject: kasan, mm: check kasan_enabled in annotations Declare the kasan_enabled static key in include/linux/kasan.h and in include/linux/mm.h and check it in all kasan annotations. This avoids any slowdown caused by function calls when KASAN is disabled at runtime. Link: https://lkml.kernel.org/r/9f90e3c0aa840dbb4833367c2335193299f69023.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/I2589451d3c96c97abbcbf714baabe6161c6f153e Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Dmitry Vyukov Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 213 +++++++++++++++++++++++++++++++++++++++----------- include/linux/mm.h | 22 ++++-- mm/kasan/common.c | 56 ++++++------- 3 files changed, 210 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 7350de3e9fe4..9176849c4934 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -2,6 +2,7 @@ #ifndef _LINUX_KASAN_H #define _LINUX_KASAN_H +#include #include struct kmem_cache; @@ -75,54 +76,176 @@ static inline void kasan_disable_current(void) {} #ifdef CONFIG_KASAN -void kasan_unpoison_range(const void *address, size_t size); +struct kasan_cache { + int alloc_meta_offset; + int free_meta_offset; +}; -void kasan_alloc_pages(struct page *page, unsigned int order); -void kasan_free_pages(struct page *page, unsigned int order); +#ifdef CONFIG_KASAN_HW_TAGS +DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); +static __always_inline bool kasan_enabled(void) +{ + return static_branch_likely(&kasan_flag_enabled); +} +#else +static inline bool kasan_enabled(void) +{ + return true; +} +#endif -void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags); +void __kasan_unpoison_range(const void *addr, size_t size); +static __always_inline void kasan_unpoison_range(const void *addr, size_t size) +{ + if (kasan_enabled()) + __kasan_unpoison_range(addr, size); +} -void kasan_poison_slab(struct page *page); -void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); -void kasan_poison_object_data(struct kmem_cache *cache, void *object); -void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, - const void *object); +void __kasan_alloc_pages(struct page *page, unsigned int order); +static __always_inline void kasan_alloc_pages(struct page *page, + unsigned int order) +{ + if (kasan_enabled()) + __kasan_alloc_pages(page, order); +} -void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, - gfp_t flags); -void kasan_kfree_large(void *ptr, unsigned long ip); -void kasan_poison_kfree(void *ptr, unsigned long ip); -void * __must_check kasan_kmalloc(struct kmem_cache *s, const void *object, - size_t size, gfp_t flags); -void * __must_check kasan_krealloc(const void *object, size_t new_size, - gfp_t flags); +void __kasan_free_pages(struct page *page, unsigned int order); +static __always_inline void kasan_free_pages(struct page *page, + unsigned int 
order) +{ + if (kasan_enabled()) + __kasan_free_pages(page, order); +} -void * __must_check kasan_slab_alloc(struct kmem_cache *s, void *object, - gfp_t flags); -bool kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip); +void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags); +static __always_inline void kasan_cache_create(struct kmem_cache *cache, + unsigned int *size, slab_flags_t *flags) +{ + if (kasan_enabled()) + __kasan_cache_create(cache, size, flags); +} -struct kasan_cache { - int alloc_meta_offset; - int free_meta_offset; -}; +size_t __kasan_metadata_size(struct kmem_cache *cache); +static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) +{ + if (kasan_enabled()) + return __kasan_metadata_size(cache); + return 0; +} + +void __kasan_poison_slab(struct page *page); +static __always_inline void kasan_poison_slab(struct page *page) +{ + if (kasan_enabled()) + __kasan_poison_slab(page); +} + +void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object); +static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache, + void *object) +{ + if (kasan_enabled()) + __kasan_unpoison_object_data(cache, object); +} + +void __kasan_poison_object_data(struct kmem_cache *cache, void *object); +static __always_inline void kasan_poison_object_data(struct kmem_cache *cache, + void *object) +{ + if (kasan_enabled()) + __kasan_poison_object_data(cache, object); +} + +void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, + const void *object); +static __always_inline void * __must_check kasan_init_slab_obj( + struct kmem_cache *cache, const void *object) +{ + if (kasan_enabled()) + return __kasan_init_slab_obj(cache, object); + return (void *)object; +} + +bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip); +static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, + unsigned long ip) +{ + if (kasan_enabled()) + return __kasan_slab_free(s, object, ip); + return false; +} + +void * __must_check __kasan_slab_alloc(struct kmem_cache *s, + void *object, gfp_t flags); +static __always_inline void * __must_check kasan_slab_alloc( + struct kmem_cache *s, void *object, gfp_t flags) +{ + if (kasan_enabled()) + return __kasan_slab_alloc(s, object, flags); + return object; +} + +void * __must_check __kasan_kmalloc(struct kmem_cache *s, const void *object, + size_t size, gfp_t flags); +static __always_inline void * __must_check kasan_kmalloc(struct kmem_cache *s, + const void *object, size_t size, gfp_t flags) +{ + if (kasan_enabled()) + return __kasan_kmalloc(s, object, size, flags); + return (void *)object; +} -size_t kasan_metadata_size(struct kmem_cache *cache); +void * __must_check __kasan_kmalloc_large(const void *ptr, + size_t size, gfp_t flags); +static __always_inline void * __must_check kasan_kmalloc_large(const void *ptr, + size_t size, gfp_t flags) +{ + if (kasan_enabled()) + return __kasan_kmalloc_large(ptr, size, flags); + return (void *)ptr; +} + +void * __must_check __kasan_krealloc(const void *object, + size_t new_size, gfp_t flags); +static __always_inline void * __must_check kasan_krealloc(const void *object, + size_t new_size, gfp_t flags) +{ + if (kasan_enabled()) + return __kasan_krealloc(object, new_size, flags); + return (void *)object; +} + +void __kasan_poison_kfree(void *ptr, unsigned long ip); +static __always_inline void kasan_poison_kfree(void *ptr, unsigned long ip) +{ + if (kasan_enabled()) + 
__kasan_poison_kfree(ptr, ip); +} + +void __kasan_kfree_large(void *ptr, unsigned long ip); +static __always_inline void kasan_kfree_large(void *ptr, unsigned long ip) +{ + if (kasan_enabled()) + __kasan_kfree_large(ptr, ip); +} bool kasan_save_enable_multi_shot(void); void kasan_restore_multi_shot(bool enabled); #else /* CONFIG_KASAN */ +static inline bool kasan_enabled(void) +{ + return false; +} static inline void kasan_unpoison_range(const void *address, size_t size) {} - static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} - static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} - +static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -133,36 +256,32 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache, { return (void *)object; } - -static inline void *kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) +static inline bool kasan_slab_free(struct kmem_cache *s, void *object, + unsigned long ip) { - return ptr; + return false; +} +static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, + gfp_t flags) +{ + return object; } -static inline void kasan_kfree_large(void *ptr, unsigned long ip) {} -static inline void kasan_poison_kfree(void *ptr, unsigned long ip) {} static inline void *kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags) { return (void *)object; } +static inline void *kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) +{ + return (void *)ptr; +} static inline void *kasan_krealloc(const void *object, size_t new_size, gfp_t flags) { return (void *)object; } - -static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, - gfp_t flags) -{ - return object; -} -static inline bool kasan_slab_free(struct kmem_cache *s, void *object, - unsigned long ip) -{ - return false; -} - -static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } +static inline void kasan_poison_kfree(void *ptr, unsigned long ip) {} +static inline void kasan_kfree_large(void *ptr, unsigned long ip) {} #endif /* CONFIG_KASAN */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 024ec0a00c72..5299b90a6c40 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -31,6 +31,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1422,22 +1423,30 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + static inline u8 page_kasan_tag(const struct page *page) { - return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + if (kasan_enabled()) + return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + return 0xff; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { - page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); - page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + if (kasan_enabled()) { + page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); + page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + } } static inline void page_kasan_tag_reset(struct page *page) { - page_kasan_tag_set(page, 0xff); + if (kasan_enabled()) + page_kasan_tag_set(page, 0xff); } -#else + +#else /* 
CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + static inline u8 page_kasan_tag(const struct page *page) { return 0xff; @@ -1445,7 +1454,8 @@ static inline u8 page_kasan_tag(const struct page *page) static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } -#endif + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 219c2979bd3e..ae0130cf9de3 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -58,7 +58,7 @@ void kasan_disable_current(void) } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -void kasan_unpoison_range(const void *address, size_t size) +void __kasan_unpoison_range(const void *address, size_t size) { unpoison_range(address, size); } @@ -86,7 +86,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -void kasan_alloc_pages(struct page *page, unsigned int order) +void __kasan_alloc_pages(struct page *page, unsigned int order) { u8 tag; unsigned long i; @@ -100,7 +100,7 @@ void kasan_alloc_pages(struct page *page, unsigned int order) unpoison_range(page_address(page), PAGE_SIZE << order); } -void kasan_free_pages(struct page *page, unsigned int order) +void __kasan_free_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) poison_range(page_address(page), @@ -127,8 +127,8 @@ static inline unsigned int optimal_redzone(unsigned int object_size) object_size <= (1 << 16) - 1024 ? 1024 : 2048; } -void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags) +void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags) { unsigned int orig_size = *size; unsigned int redzone_size; @@ -173,7 +173,7 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, *flags |= SLAB_KASAN; } -size_t kasan_metadata_size(struct kmem_cache *cache) +size_t __kasan_metadata_size(struct kmem_cache *cache) { if (!kasan_stack_collection_enabled()) return 0; @@ -196,7 +196,7 @@ struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset; } -void kasan_poison_slab(struct page *page) +void __kasan_poison_slab(struct page *page) { unsigned long i; @@ -206,12 +206,12 @@ void kasan_poison_slab(struct page *page) KASAN_KMALLOC_REDZONE); } -void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) +void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { unpoison_range(object, cache->object_size); } -void kasan_poison_object_data(struct kmem_cache *cache, void *object) +void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { poison_range(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), @@ -264,7 +264,7 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object, #endif } -void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, +void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { struct kasan_alloc_meta *alloc_meta; @@ -283,7 +283,7 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -static bool __kasan_slab_free(struct kmem_cache *cache, void *object, +static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip, bool quarantine) { u8 tag; @@ -326,9 +326,9 @@ static bool 
__kasan_slab_free(struct kmem_cache *cache, void *object, return IS_ENABLED(CONFIG_KASAN_GENERIC); } -bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) +bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) { - return __kasan_slab_free(cache, object, ip, true); + return ____kasan_slab_free(cache, object, ip, true); } static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) @@ -336,7 +336,7 @@ static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_set_track(&kasan_get_alloc_meta(cache, object)->alloc_track, flags); } -static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, +static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, gfp_t flags, bool keep_tag) { unsigned long redzone_start; @@ -368,20 +368,20 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, return set_tag(object, tag); } -void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object, - gfp_t flags) +void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, + void *object, gfp_t flags) { - return __kasan_kmalloc(cache, object, cache->object_size, flags, false); + return ____kasan_kmalloc(cache, object, cache->object_size, flags, false); } -void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags) +void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags) { - return __kasan_kmalloc(cache, object, size, flags, true); + return ____kasan_kmalloc(cache, object, size, flags, true); } -EXPORT_SYMBOL(kasan_kmalloc); +EXPORT_SYMBOL(__kasan_kmalloc); -void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, +void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { struct page *page; @@ -406,7 +406,7 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, return (void *)ptr; } -void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) +void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags) { struct page *page; @@ -416,13 +416,13 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) page = virt_to_head_page(object); if (unlikely(!PageSlab(page))) - return kasan_kmalloc_large(object, size, flags); + return __kasan_kmalloc_large(object, size, flags); else - return __kasan_kmalloc(page->slab_cache, object, size, + return ____kasan_kmalloc(page->slab_cache, object, size, flags, true); } -void kasan_poison_kfree(void *ptr, unsigned long ip) +void __kasan_poison_kfree(void *ptr, unsigned long ip) { struct page *page; @@ -435,11 +435,11 @@ void kasan_poison_kfree(void *ptr, unsigned long ip) } poison_range(ptr, page_size(page), KASAN_FREE_PAGE); } else { - __kasan_slab_free(page->slab_cache, ptr, ip, false); + ____kasan_slab_free(page->slab_cache, ptr, ip, false); } } -void kasan_kfree_large(void *ptr, unsigned long ip) +void __kasan_kfree_large(void *ptr, unsigned long ip) { if (ptr != page_address(virt_to_head_page(ptr))) kasan_report_invalid_free(ptr, ip); -- cgit From eeb3160c2419e0f1045537acac7b19cba64112f4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:03:13 -0800 Subject: kasan, mm: rename kasan_poison_kfree Rename kasan_poison_kfree() to kasan_slab_free_mempool() as it better reflects what this annotation does. Also add a comment that explains the PageSlab() check. 
No functional changes. Link: https://lkml.kernel.org/r/141675fb493555e984c5dca555e9d9f768c7bbaa.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/I5026f87364e556b506ef1baee725144bb04b8810 Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 16 ++++++++-------- mm/kasan/common.c | 40 +++++++++++++++++++++++----------------- mm/mempool.c | 2 +- 3 files changed, 32 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9176849c4934..6f0c5d9aa43f 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -176,6 +176,13 @@ static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, return false; } +void __kasan_slab_free_mempool(void *ptr, unsigned long ip); +static __always_inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) +{ + if (kasan_enabled()) + __kasan_slab_free_mempool(ptr, ip); +} + void * __must_check __kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); static __always_inline void * __must_check kasan_slab_alloc( @@ -216,13 +223,6 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } -void __kasan_poison_kfree(void *ptr, unsigned long ip); -static __always_inline void kasan_poison_kfree(void *ptr, unsigned long ip) -{ - if (kasan_enabled()) - __kasan_poison_kfree(ptr, ip); -} - void __kasan_kfree_large(void *ptr, unsigned long ip); static __always_inline void kasan_kfree_large(void *ptr, unsigned long ip) { @@ -261,6 +261,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, { return false; } +static inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) { @@ -280,7 +281,6 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } -static inline void kasan_poison_kfree(void *ptr, unsigned long ip) {} static inline void kasan_kfree_large(void *ptr, unsigned long ip) {} #endif /* CONFIG_KASAN */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index ae0130cf9de3..d0f8d7a955cd 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -331,6 +331,29 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) return ____kasan_slab_free(cache, object, ip, true); } +void __kasan_slab_free_mempool(void *ptr, unsigned long ip) +{ + struct page *page; + + page = virt_to_head_page(ptr); + + /* + * Even though this function is only called for kmem_cache_alloc and + * kmalloc backed mempool allocations, those allocations can still be + * !PageSlab() when the size provided to kmalloc is larger than + * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. 
+ */ + if (unlikely(!PageSlab(page))) { + if (ptr != page_address(page)) { + kasan_report_invalid_free(ptr, ip); + return; + } + poison_range(ptr, page_size(page), KASAN_FREE_PAGE); + } else { + ____kasan_slab_free(page->slab_cache, ptr, ip, false); + } +} + static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { kasan_set_track(&kasan_get_alloc_meta(cache, object)->alloc_track, flags); @@ -422,23 +445,6 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag flags, true); } -void __kasan_poison_kfree(void *ptr, unsigned long ip) -{ - struct page *page; - - page = virt_to_head_page(ptr); - - if (unlikely(!PageSlab(page))) { - if (ptr != page_address(page)) { - kasan_report_invalid_free(ptr, ip); - return; - } - poison_range(ptr, page_size(page), KASAN_FREE_PAGE); - } else { - ____kasan_slab_free(page->slab_cache, ptr, ip, false); - } -} - void __kasan_kfree_large(void *ptr, unsigned long ip) { if (ptr != page_address(virt_to_head_page(ptr))) kasan_report_invalid_free(ptr, ip); diff --git a/mm/mempool.c b/mm/mempool.c index 583a9865b181..624ed51b060f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element) static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_poison_kfree(element, _RET_IP_); + kasan_slab_free_mempool(element, _RET_IP_); else if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } -- cgit From e86f8b09f215e3755cd2d56930487dec2de02433 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:03:31 -0800 Subject: kasan, mm: allow cache merging with no metadata Cache merging is disabled with KASAN because KASAN puts its metadata right after the allocated object. When the merged caches have slightly different sizes, the metadata ends up in different places, which KASAN doesn't support. It might be possible to adjust the metadata allocation algorithm and make it friendly to the cache merging code. Instead this change takes a simpler approach and allows merging caches when no metadata is present, which is the case for hardware tag-based KASAN with kasan.mode=prod.
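As a rough sketch of the effect (the helper below is a hypothetical condensation, not the actual merge logic, which lives in find_mergeable() in mm/slab_common.c): the never-merge mask is now partly computed at runtime, so SLAB_KASAN only blocks merging when per-object metadata actually exists.

/* Hypothetical condensation of the merge test: with hardware tag-based
 * KASAN in kasan.mode=prod, stack collection is off, kasan_never_merge()
 * returns 0, and SLAB_KASAN no longer keeps otherwise-identical caches
 * apart. */
static bool caches_may_merge(slab_flags_t a, slab_flags_t b)
{
	slab_flags_t never = SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER |
			     SLAB_TRACE | SLAB_TYPESAFE_BY_RCU |
			     SLAB_NOLEAKTRACE | SLAB_FAILSLAB |
			     kasan_never_merge();

	return !((a | b) & never);
}

Because kasan_never_merge() sits behind the same kasan_enabled() static key as the other annotations, computing the mask costs only a patched-out branch when KASAN is runtime-disabled.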
Link: https://lkml.kernel.org/r/37497e940bfd4b32c0a93a702a9ae4cf061d5392.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/Ia114847dfb2244f297d2cb82d592bf6a07455dba Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Signed-off-by: Andrey Konovalov Reviewed-by: Dmitry Vyukov Reviewed-by: Marco Elver Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 21 +++++++++++++++++++-- mm/kasan/common.c | 11 +++++++++++ mm/slab_common.c | 3 ++- 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6f0c5d9aa43f..5e0655fb2a6f 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -82,17 +82,30 @@ struct kasan_cache { }; #ifdef CONFIG_KASAN_HW_TAGS + DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); + static __always_inline bool kasan_enabled(void) { return static_branch_likely(&kasan_flag_enabled); } -#else + +#else /* CONFIG_KASAN_HW_TAGS */ + static inline bool kasan_enabled(void) { return true; } -#endif + +#endif /* CONFIG_KASAN_HW_TAGS */ + +slab_flags_t __kasan_never_merge(void); +static __always_inline slab_flags_t kasan_never_merge(void) +{ + if (kasan_enabled()) + return __kasan_never_merge(); + return 0; +} void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) @@ -239,6 +252,10 @@ static inline bool kasan_enabled(void) { return false; } +static inline slab_flags_t kasan_never_merge(void) +{ + return 0; +} static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 0cd583d2fe1c..b25167664ead 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -86,6 +86,17 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ +/* + * Only allow cache merging when stack collection is disabled and no metadata + * is present. + */ +slab_flags_t __kasan_never_merge(void) +{ + if (kasan_stack_collection_enabled()) + return SLAB_KASAN; + return 0; +} + void __kasan_alloc_pages(struct page *page, unsigned int order) { u8 tag; diff --git a/mm/slab_common.c b/mm/slab_common.c index 573fbacd9ef5..e981c80d216c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -53,7 +54,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB | SLAB_KASAN) + SLAB_FAILSLAB | kasan_never_merge()) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_ACCOUNT) -- cgit
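Taken together, the series converges on a single wrapper pattern for every KASAN annotation. A condensed sketch, abridged from the include/linux/kasan.h hunks above with kasan_poison_slab() as the representative case:

/* The fast path is an inline test of a static key: when KASAN is
 * disabled at runtime (hardware tag-based mode with kasan.mode=off),
 * each annotation costs a patched-out branch instead of a call into
 * mm/kasan/. */
DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled);

static __always_inline bool kasan_enabled(void)
{
	return static_branch_likely(&kasan_flag_enabled);
}

void __kasan_poison_slab(struct page *page);	/* out-of-line slow path */

static __always_inline void kasan_poison_slab(struct page *page)
{
	if (kasan_enabled())
		__kasan_poison_slab(page);
}

The __always_inline wrappers keep the static-key test in the caller, so the out-of-line __kasan_*() bodies are reached only when the key is enabled.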